From 28a48de72b876af794853593cc1412119ada9efc Mon Sep 17 00:00:00 2001
From: "David A. Marlin" <dmarlin@redhat.com>
Date: Mon, 17 Jan 2005 18:29:21 +0000
Subject: [MTD] NAND extended commands, bad block table autorefresh

Added extended commands for AG-AND device and added
option for BBT_AUTO_REFRESH.

Signed-off-by: David A. Marlin <dmarlin@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/mtd/nand.h | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 9a19c65abd74..0118128ae384 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -5,7 +5,7 @@
  *                     Steven J. Hill <sjhill@realitydiluted.com>
  *		       Thomas Gleixner <tglx@linutronix.de>
  *
- * $Id: nand.h,v 1.68 2004/11/12 10:40:37 gleixner Exp $
+ * $Id: nand.h,v 1.69 2005/01/17 18:29:18 dmarlin Exp $
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -48,6 +48,8 @@
  *  02-08-2004 tglx 	added option field to nand structure for chip anomalities
  *  05-25-2004 tglx 	added bad block table support, ST-MICRO manufacturer id
  *			update of nand_chip structure description
+ *  01-17-2005 dmarlin	added extended commands for AG-AND device and added option 
+ * 			for BBT_AUTO_REFRESH.
  */
 #ifndef __LINUX_MTD_NAND_H
 #define __LINUX_MTD_NAND_H
@@ -115,6 +117,25 @@ extern int nand_read_raw (struct mtd_info *mtd, uint8_t *buf, loff_t from, size_
 #define NAND_CMD_READSTART	0x30
 #define NAND_CMD_CACHEDPROG	0x15
 
+/* Extended commands for AG-AND device */
+/* 
+ * Note: the command for NAND_CMD_DEPLETE1 is really 0x00 but 
+ *       there is no way to distinguish that from NAND_CMD_READ0
+ *       until the remaining sequence of commands has been completed
+ *       so add a high order bit and mask it off in the command.
+ */
+#define NAND_CMD_DEPLETE1	0x100
+#define NAND_CMD_DEPLETE2	0x38
+#define NAND_CMD_STATUS_MULTI	0x71
+#define NAND_CMD_STATUS_ERROR	0x72
+/* multi-bank error status (banks 0-3) */
+#define NAND_CMD_STATUS_ERROR0	0x73
+#define NAND_CMD_STATUS_ERROR1	0x74
+#define NAND_CMD_STATUS_ERROR2	0x75
+#define NAND_CMD_STATUS_ERROR3	0x76
+#define NAND_CMD_STATUS_RESET	0x7f
+#define NAND_CMD_STATUS_CLEAR	0xff
+
 /* Status bits */
 #define NAND_STATUS_FAIL	0x01
 #define NAND_STATUS_FAIL_N1	0x02
@@ -170,6 +191,10 @@ extern int nand_read_raw (struct mtd_info *mtd, uint8_t *buf, loff_t from, size_
 /* Chip has a array of 4 pages which can be read without
  * additional ready /busy waits */
 #define NAND_4PAGE_ARRAY	0x00000040 
+/* Chip requires that BBT is periodically rewritten to prevent
+ * bits from adjacent blocks from 'leaking' in and altering data.
+ * This happens with the Renesas AG-AND chips, possibly others.  */
+#define BBT_AUTO_REFRESH	0x00000080
 
 /* Options valid for Samsung large page devices */
 #define NAND_SAMSUNG_LP_OPTIONS \
-- 
cgit v1.2.3-59-g8ed1b


From 99f2a8aea18c9779c141050c6f95a8f1da63bbe4 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben@simtec.co.uk>
Date: Mon, 24 Jan 2005 00:37:04 +0000
Subject: [MTD] Platform RAM Driver

Driver for generic RAM blocks which are exported by a platform_device
from the device driver system.

Signed-off-by: Ben Dooks <ben@simtec.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/maps/Kconfig     |  12 +-
 drivers/mtd/maps/Makefile    |   3 +-
 drivers/mtd/maps/plat-ram.c  | 286 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/plat-ram.h |  35 ++++++
 4 files changed, 334 insertions(+), 2 deletions(-)
 create mode 100644 drivers/mtd/maps/plat-ram.c
 create mode 100644 include/linux/mtd/plat-ram.h

(limited to 'include/linux')

diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig
index 8480057eadb4..7d21d432f380 100644
--- a/drivers/mtd/maps/Kconfig
+++ b/drivers/mtd/maps/Kconfig
@@ -1,5 +1,5 @@
 # drivers/mtd/maps/Kconfig
-# $Id: Kconfig,v 1.42 2005/01/05 16:59:50 dwmw2 Exp $
+# $Id: Kconfig,v 1.43 2005/01/24 00:35:21 bjd Exp $
 
 menu "Mapping drivers for chip access"
 	depends on MTD!=n
@@ -659,5 +659,15 @@ config MTD_SHARP_SL
 	help
 	  This enables access to the flash chip on the Sharp SL Series of PDAs.
 
+config MTD_PLATRAM
+	tristate "Map driver for platform device RAM (mtd-ram)"
+	depends on MTD
+	select MTD_RAM
+	help
+	  Map driver for RAM areas described via the platform device
+	  system.
+
+	  This selection automatically selects the map_ram driver.
+
 endmenu
 
diff --git a/drivers/mtd/maps/Makefile b/drivers/mtd/maps/Makefile
index 7ffe02b85301..d2e6dcc87059 100644
--- a/drivers/mtd/maps/Makefile
+++ b/drivers/mtd/maps/Makefile
@@ -1,7 +1,7 @@
 #
 # linux/drivers/maps/Makefile
 #
-# $Id: Makefile.common,v 1.23 2005/01/05 17:06:36 dwmw2 Exp $
+# $Id: Makefile.common,v 1.24 2005/01/24 00:35:21 bjd Exp $
 
 ifeq ($(CONFIG_MTD_COMPLEX_MAPPINGS),y)
 obj-$(CONFIG_MTD)		+= map_funcs.o
@@ -71,3 +71,4 @@ obj-$(CONFIG_MTD_IXP2000)	+= ixp2000.o
 obj-$(CONFIG_MTD_WRSBC8260)	+= wr_sbc82xx_flash.o
 obj-$(CONFIG_MTD_DMV182)	+= dmv182.o
 obj-$(CONFIG_MTD_SHARP_SL)	+= sharpsl-flash.o
+obj-$(CONFIG_MTD_PLATRAM)	+= plat-ram.o
diff --git a/drivers/mtd/maps/plat-ram.c b/drivers/mtd/maps/plat-ram.c
new file mode 100644
index 000000000000..808f94346add
--- /dev/null
+++ b/drivers/mtd/maps/plat-ram.c
@@ -0,0 +1,286 @@
+/* drivers/mtd/maps/plat-ram.c
+ *
+ * (c) 2004-2005 Simtec Electronics
+ *	http://www.simtec.co.uk/products/SWLINUX/
+ *	Ben Dooks <ben@simtec.co.uk>
+ *
+ * Generic platform device based RAM map
+ *
+ * $Id: plat-ram.c,v 1.1 2005/01/24 00:37:02 bjd Exp $
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+*/
+
+#define DEBUG
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/ioport.h>
+#include <linux/device.h>
+
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/map.h>
+#include <linux/mtd/partitions.h>
+#include <linux/mtd/plat-ram.h>
+
+#include <asm/io.h>
+
+/* private structure for each mtd platform ram device created */
+
+struct platram_info {
+	struct device		*dev;	/* device handed to platram_probe() */
+	struct mtd_info		*mtd;	/* result of do_map_probe("map_ram") */
+	struct map_info		 map;	/* phys/virt/size/bankwidth of the RAM window */
+	struct mtd_partition	*partitions;	/* filled in by parse_mtd_partitions() */
+	struct resource		*area;	/* region claimed via request_mem_region() */
+	struct platdata_mtd_ram	*pdata;	/* copy of dev->platform_data */
+};
+
+/* to_platram_info()
+ *
+ * fetch the struct platram_info stored as the device's driver data
+*/
+
+static inline struct platram_info *to_platram_info(struct device *dev)
+{
+	return dev_get_drvdata(dev);
+}
+
+/* platram_setrw
+ *
+ * invoke the platform-supplied set_rw() callback, if there is one
+ *
+ * to = PLATRAM_RO (0) => read-only
+ *    = PLATRAM_RW (1) => read-write
+*/
+
+static inline void platram_setrw(struct platram_info *info, int to)
+{
+	struct platdata_mtd_ram *pdata = info->pdata;
+
+	/* both the platform data and the callback may be absent */
+	if (pdata != NULL && pdata->set_rw != NULL)
+		(pdata->set_rw)(info->dev, to);
+}
+
+/* platram_remove
+ *
+ * driver remove callback; platram_probe() also calls it to unwind a
+ * partially completed probe, so every teardown step is conditional
+*/
+
+static int platram_remove(struct device *dev)
+{
+	struct platram_info *priv = to_platram_info(dev);
+
+	dev_set_drvdata(dev, NULL);
+	dev_dbg(dev, "removing device\n");
+
+	if (!priv)
+		return 0;
+
+	/* unregister the mtd, and its partitions if any were parsed */
+	if (priv->mtd) {
+#ifdef CONFIG_MTD_PARTITIONS
+		if (priv->partitions) {
+			del_mtd_partitions(priv->mtd);
+			kfree(priv->partitions);
+		}
+#endif
+		del_mtd_device(priv->mtd);
+		map_destroy(priv->mtd);
+	}
+
+	/* ensure ram is left read-only */
+	platram_setrw(priv, PLATRAM_RO);
+
+	/* give back the memory region claimed by probe */
+	if (priv->area) {
+		release_resource(priv->area);
+		kfree(priv->area);
+	}
+
+	/* undo the ioremap(), if it got that far */
+	if (priv->map.virt)
+		iounmap(priv->map.virt);
+
+	kfree(priv);
+
+	return 0;
+}
+
+/* platram_probe
+ *
+ * called from the device driver system when a device matching our
+ * driver is found; maps the device's first memory resource and
+ * registers it as a map_ram mtd. Returns 0 on success or a negative
+ * errno, unwinding through platram_remove() once info is allocated.
+*/
+
+static int platram_probe(struct device *dev)
+{
+	struct platform_device *pd = to_platform_device(dev);
+	struct platdata_mtd_ram	*pdata;
+	struct platram_info *info;
+	struct resource *res;
+	int err = 0;
+
+	dev_dbg(dev, "probe entered\n");
+
+	if (dev->platform_data == NULL) {
+		dev_err(dev, "no platform data supplied\n");
+		err = -ENOENT;
+		goto exit_error;
+	}
+
+	pdata = dev->platform_data;
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (info == NULL) {
+		dev_err(dev, "no memory for flash info\n");
+		err = -ENOMEM;
+		goto exit_error;
+	}
+
+	memset(info, 0, sizeof(*info));
+	dev_set_drvdata(dev, info);
+
+	info->dev = dev;
+	info->pdata = pdata;
+
+	/* get the resource for the memory mapping */
+
+	res = platform_get_resource(pd, IORESOURCE_MEM, 0);
+
+	if (res == NULL) {
+		dev_err(dev, "no memory resource specified\n");
+		err = -ENOENT;
+		goto exit_free;
+	}
+
+	dev_dbg(dev, "got platform resource %p (0x%lx)\n", res, res->start);
+
+	/* setup map parameters */
+
+	info->map.phys = res->start;
+	info->map.size = (res->end - res->start) + 1;
+	info->map.name = pdata->mapname != NULL ? pdata->mapname : pd->name;
+	info->map.bankwidth = pdata->bankwidth;
+
+	/* register our usage of the memory area */
+
+	info->area = request_mem_region(res->start, info->map.size, pd->name);
+	if (info->area == NULL) {
+		dev_err(dev, "failed to request memory region\n");
+		err = -EIO;
+		goto exit_free;
+	}
+
+	/* remap the memory area */
+
+	info->map.virt = ioremap(res->start, info->map.size);
+	dev_dbg(dev, "virt %p, %lu bytes\n", info->map.virt,
+		(unsigned long)info->map.size);
+
+	if (info->map.virt == NULL) {
+		dev_err(dev, "failed to ioremap() region\n");
+		err = -EIO;
+		goto exit_free;
+	}
+
+	simple_map_init(&info->map);
+
+	dev_dbg(dev, "initialised map, probing for mtd\n");
+
+	/* probe for the right mtd map driver */
+
+	info->mtd = do_map_probe("map_ram", &info->map);
+	if (info->mtd == NULL) {
+		dev_err(dev, "failed to probe for map_ram\n");
+		err = -ENOMEM;
+		goto exit_free;
+	}
+
+	info->mtd->owner = THIS_MODULE;
+
+	platram_setrw(info, PLATRAM_RW);
+
+	/* check to see if there are any available partitions, or whether
+	 * to add this device whole; any failure from here on is unwound */
+
+#ifdef CONFIG_MTD_PARTITIONS
+	if (pdata->nr_partitions > 0) {
+		const char **probes = { NULL };
+
+		if (pdata->probes)
+			probes = (const char **)pdata->probes;
+
+		err = parse_mtd_partitions(info->mtd, probes,
+					   &info->partitions, 0);
+		if (err > 0) {
+			err = add_mtd_partitions(info->mtd, info->partitions,
+						 err);
+		}
+		if (err < 0)
+			goto exit_free;
+	}
+#endif /* CONFIG_MTD_PARTITIONS */
+
+	if (add_mtd_device(info->mtd)) {
+		dev_err(dev, "add_mtd_device() failed\n");
+		err = -ENOMEM;
+		goto exit_free;
+	}
+
+	dev_info(dev, "registered mtd device\n");
+	return err;
+
+ exit_free:
+	platram_remove(dev);
+ exit_error:
+	return err;
+}
+
+/* device driver info: platform-bus driver named "mtd-ram" */
+
+static struct device_driver platram_driver = {
+	.name		= "mtd-ram",
+	.bus		= &platform_bus_type,
+	.probe		= platram_probe,
+	.remove		= platram_remove,
+};
+
+/* module init/exit: register and unregister platram_driver */
+
+static int __init platram_init(void)
+{
+	printk(KERN_INFO "Generic platform RAM MTD, (c) 2004 Simtec Electronics\n");
+	return driver_register(&platram_driver);
+}
+
+static void __exit platram_exit(void)
+{
+	driver_unregister(&platram_driver);
+}
+
+module_init(platram_init);
+module_exit(platram_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Ben Dooks <ben@simtec.co.uk>");
+MODULE_DESCRIPTION("MTD platform RAM map driver");
diff --git a/include/linux/mtd/plat-ram.h b/include/linux/mtd/plat-ram.h
new file mode 100644
index 000000000000..2332eda07e0e
--- /dev/null
+++ b/include/linux/mtd/plat-ram.h
@@ -0,0 +1,35 @@
+/* linux/include/mtd/plat-ram.h
+ *
+ * (c) 2004 Simtec Electronics
+ *	http://www.simtec.co.uk/products/SWLINUX/
+ *	Ben Dooks <ben@simtec.co.uk>
+ *
+ * Generic platform device based RAM map
+ *
+ * $Id: plat-ram.h,v 1.2 2005/01/24 00:37:40 bjd Exp $
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#ifndef __LINUX_MTD_PLATRAM_H
+#define __LINUX_MTD_PLATRAM_H __FILE__
+
+#define PLATRAM_RO (0)
+#define PLATRAM_RW (1)
+
+struct platdata_mtd_ram {
+	char			*mapname;	/* map name; NULL => platform device name is used */
+	char		       **probes;	/* passed to parse_mtd_partitions(); may be NULL */
+	struct mtd_partition	*partitions;	/* NOTE(review): not read by plat-ram.c in this patch */
+	int			 nr_partitions;	/* > 0 makes plat-ram.c run partition parsing */
+	int			 bankwidth;	/* copied into map_info.bankwidth */
+
+	/* control callbacks */
+
+	void	(*set_rw)(struct device *dev, int to);	/* optional; to = PLATRAM_RO or PLATRAM_RW */
+};
+
+#endif /* __LINUX_MTD_PLATRAM_H */
-- 
cgit v1.2.3-59-g8ed1b


From 068e3c0a002c79a5e3cc7c42cb749c4bb126288c Mon Sep 17 00:00:00 2001
From: "David A. Marlin" <dmarlin@redhat.com>
Date: Mon, 24 Jan 2005 03:07:46 +0000
Subject: [MTD] NAND Add optional ECC status check callback

Add optional hardware specific callback routine to perform extra error
status checks on erase and write failures for devices with hardware ECC.

Signed-off-by: David A. Marlin <dmarlin@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/nand/nand_base.c | 65 ++++++++++++++++++++++++++++++++++++--------
 include/linux/mtd/nand.h     | 16 +++++++++--
 2 files changed, 68 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 9f7c42ceecfa..7094dd5716dc 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -42,6 +42,10 @@
  * 		a "device recovery" operation must be performed when power is restored
  * 		to ensure correct operation.
  *
+ *  01-20-2005	dmarlin: added support for optional hardware specific callback routine to 
+ *		perform extra error status checks on erase and write failures.  This required
+ *		adding a wrapper function for nand_read_ecc.
+ *
  * Credits:
  *	David Woodhouse for adding multichip support  
  *	
@@ -55,7 +59,7 @@
  *	The AG-AND chips have nice features for speed improvement,
  *	which are not supported yet. Read / program 4 pages in one go.
  *
- * $Id: nand_base.c,v 1.129 2005/01/23 18:30:50 dmarlin Exp $
+ * $Id: nand_base.c,v 1.130 2005/01/24 03:07:43 dmarlin Exp $
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -896,6 +900,12 @@ static int nand_write_page (struct mtd_info *mtd, struct nand_chip *this, int pa
 	if (!cached) {
 		/* call wait ready function */
 		status = this->waitfunc (mtd, this, FL_WRITING);
+
+		/* See if operation failed and additional status checks are available */
+		if ((status & NAND_STATUS_FAIL) && (this->errstat)) {
+			status = this->errstat(mtd, this, FL_WRITING, status, page);
+		}
+
 		/* See if device thinks it succeeded */
 		if (status & NAND_STATUS_FAIL) {
 			DEBUG (MTD_DEBUG_LEVEL0, "%s: " "Failed write, page 0x%08x, ", __FUNCTION__, page);
@@ -1022,23 +1032,24 @@ out:
 #endif
 
 /**
- * nand_read - [MTD Interface] MTD compability function for nand_read_ecc
+ * nand_read - [MTD Interface] MTD compatibility function for nand_do_read_ecc
  * @mtd:	MTD device structure
  * @from:	offset to read from
  * @len:	number of bytes to read
  * @retlen:	pointer to variable to store the number of read bytes
  * @buf:	the databuffer to put data
  *
- * This function simply calls nand_read_ecc with oob buffer and oobsel = NULL
-*/
+ * This function simply calls nand_do_read_ecc with oob buffer and oobsel = NULL
+ * and flags = 0xff
+ */
 static int nand_read (struct mtd_info *mtd, loff_t from, size_t len, size_t * retlen, u_char * buf)
 {
-	return nand_read_ecc (mtd, from, len, retlen, buf, NULL, NULL);
+	return nand_do_read_ecc (mtd, from, len, retlen, buf, NULL, NULL, 0xff);
 }			   
 
 
 /**
- * nand_read_ecc - [MTD Interface] Read data with ECC
+ * nand_read_ecc - [MTD Interface] MTD compatibility function for nand_do_read_ecc
  * @mtd:	MTD device structure
  * @from:	offset to read from
  * @len:	number of bytes to read
@@ -1047,10 +1058,34 @@ static int nand_read (struct mtd_info *mtd, loff_t from, size_t len, size_t * re
  * @oob_buf:	filesystem supplied oob data buffer
  * @oobsel:	oob selection structure
  *
- * NAND read with ECC
+ * This function simply calls nand_do_read_ecc with flags = 0xff
  */
 static int nand_read_ecc (struct mtd_info *mtd, loff_t from, size_t len,
 			  size_t * retlen, u_char * buf, u_char * oob_buf, struct nand_oobinfo *oobsel)
+{
+	return nand_do_read_ecc(mtd, from, len, retlen, buf, oob_buf, oobsel, 0xff);
+}
+
+
+/**
+ * nand_do_read_ecc - [MTD Interface] Read data with ECC
+ * @mtd:	MTD device structure
+ * @from:	offset to read from
+ * @len:	number of bytes to read
+ * @retlen:	pointer to variable to store the number of read bytes
+ * @buf:	the databuffer to put data
+ * @oob_buf:	filesystem supplied oob data buffer
+ * @oobsel:	oob selection structure
+ * @flags:	flag to indicate if nand_get_device/nand_release_device should be performed
+ *		and how many corrected error bits are acceptable:
+ *		  bits 0..7 - number of tolerable errors
+ *		  bit  7    - NAND_GET_DEVICE (0x80): 0 == do not get/release chip, 1 == get/release chip
+ *
+ * NAND read with ECC
+ */
+int nand_do_read_ecc (struct mtd_info *mtd, loff_t from, size_t len,
+			     size_t * retlen, u_char * buf, u_char * oob_buf, 
+			     struct nand_oobinfo *oobsel, int flags)
 {
 	int i, j, col, realpage, page, end, ecc, chipnr, sndcmd = 1;
 	int read = 0, oob = 0, ecc_status = 0, ecc_failed = 0;
@@ -1076,7 +1111,8 @@ static int nand_read_ecc (struct mtd_info *mtd, loff_t from, size_t len,
 	}
 
 	/* Grab the lock and see if the device is available */
-	nand_get_device (this, mtd, FL_READING);
+	if (flags & NAND_GET_DEVICE)
+		nand_get_device (this, mtd, FL_READING);
 
 	/* use userspace supplied oobinfo, if zero */
 	if (oobsel == NULL)
@@ -1180,7 +1216,8 @@ static int nand_read_ecc (struct mtd_info *mtd, loff_t from, size_t len,
 					/* We calc error correction directly, it checks the hw
 					 * generator for an error, reads back the syndrome and
 					 * does the error correction on the fly */
-					if (this->correct_data(mtd, &data_poi[datidx], &oob_data[i], &ecc_code[i]) == -1) {
+					ecc_status = this->correct_data(mtd, &data_poi[datidx], &oob_data[i], &ecc_code[i]);
+					if ((ecc_status == -1) || (ecc_status > (flags & 0xff))) {
 						DEBUG (MTD_DEBUG_LEVEL0, "nand_read_ecc: " 
 							"Failed ECC read, page 0x%08x on chip %d\n", page, chipnr);
 						ecc_failed++;
@@ -1219,7 +1256,7 @@ static int nand_read_ecc (struct mtd_info *mtd, loff_t from, size_t len,
 				p[i] = ecc_status;
 			}
 			
-			if (ecc_status == -1) {	
+			if ((ecc_status == -1) || (ecc_status > (flags & 0xff))) {	
 				DEBUG (MTD_DEBUG_LEVEL0, "nand_read_ecc: " "Failed ECC read, page 0x%08x\n", page);
 				ecc_failed++;
 			}
@@ -1289,7 +1326,8 @@ static int nand_read_ecc (struct mtd_info *mtd, loff_t from, size_t len,
 	}
 
 	/* Deselect and wake up anyone waiting on the device */
-	nand_release_device(mtd);
+	if (flags & NAND_GET_DEVICE)
+		nand_release_device(mtd);
 
 	/*
 	 * Return success, if no ECC failures, else -EBADMSG
@@ -2103,6 +2141,11 @@ int nand_erase_nand (struct mtd_info *mtd, struct erase_info *instr, int allowbb
 		
 		status = this->waitfunc (mtd, this, FL_ERASING);
 
+		/* See if operation failed and additional status checks are available */
+		if ((status & NAND_STATUS_FAIL) && (this->errstat)) {
+			status = this->errstat(mtd, this, FL_ERASING, status, page);
+		}
+
 		/* See if block erase succeeded */
 		if (status & NAND_STATUS_FAIL) {
 			DEBUG (MTD_DEBUG_LEVEL0, "nand_erase: " "Failed erase, page 0x%08x\n", page);
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index 0118128ae384..cf52f20c6de2 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -5,7 +5,7 @@
  *                     Steven J. Hill <sjhill@realitydiluted.com>
  *		       Thomas Gleixner <tglx@linutronix.de>
  *
- * $Id: nand.h,v 1.69 2005/01/17 18:29:18 dmarlin Exp $
+ * $Id: nand.h,v 1.70 2005/01/24 03:07:42 dmarlin Exp $
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -50,6 +50,8 @@
  *			update of nand_chip structure description
  *  01-17-2005 dmarlin	added extended commands for AG-AND device and added option 
  * 			for BBT_AUTO_REFRESH.
+ *  01-20-2005 dmarlin	added optional pointer to hardware specific callback for 
+ *			extra error status checks.
  */
 #ifndef __LINUX_MTD_NAND_H
 #define __LINUX_MTD_NAND_H
@@ -164,7 +166,7 @@ extern int nand_read_raw (struct mtd_info *mtd, uint8_t *buf, loff_t from, size_
 
 /*
  * Constants for Hardware ECC
-*/
+ */
 /* Reset Hardware ECC for read */
 #define NAND_ECC_READ		0
 /* Reset Hardware ECC for write */
@@ -172,6 +174,10 @@ extern int nand_read_raw (struct mtd_info *mtd, uint8_t *buf, loff_t from, size_
 /* Enable Hardware ECC before syndrom is read back from flash */
 #define NAND_ECC_READSYN	2
 
+/* Bit mask for flags passed to nand_do_read_ecc */
+#define NAND_GET_DEVICE		0x80
+
+
 /* Option constants for bizarre disfunctionality and real
 *  features
 */
@@ -308,6 +314,8 @@ struct nand_hw_control {
  * @badblock_pattern:	[REPLACEABLE] bad block scan pattern used for initial bad block scan 
  * @controller:		[OPTIONAL] a pointer to a hardware controller structure which is shared among multiple independend devices
  * @priv:		[OPTIONAL] pointer to private chip date
+ * @errstat:		[OPTIONAL] hardware specific function to perform additional error status checks 
+ *			(determine if errors are correctable)
  */
  
 struct nand_chip {
@@ -363,6 +371,7 @@ struct nand_chip {
 	struct nand_bbt_descr	*badblock_pattern;
 	struct nand_hw_control  *controller;
 	void		*priv;
+	int		(*errstat)(struct mtd_info *mtd, struct nand_chip *this, int state, int status, int page);
 };
 
 /*
@@ -484,6 +493,9 @@ extern int nand_update_bbt (struct mtd_info *mtd, loff_t offs);
 extern int nand_default_bbt (struct mtd_info *mtd);
 extern int nand_isbad_bbt (struct mtd_info *mtd, loff_t offs, int allowbbt);
 extern int nand_erase_nand (struct mtd_info *mtd, struct erase_info *instr, int allowbbt);
+extern int nand_do_read_ecc (struct mtd_info *mtd, loff_t from, size_t len,
+                             size_t * retlen, u_char * buf, u_char * oob_buf,
+                             struct nand_oobinfo *oobsel, int flags);
 
 /*
 * Constants for oob configuration
-- 
cgit v1.2.3-59-g8ed1b


From 72b56a2d7dccd9ea90f34f6ddb653086a3f3bd2e Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@cam.org>
Date: Sat, 5 Feb 2005 02:06:19 +0000
Subject: [MTD] Add OTP basics

add structure definition for OTP region info

Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/chips/cfi_cmdset_0001.c |  8 +++++---
 include/linux/mtd/cfi.h             | 10 +++++++++-
 2 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c
index c268bcd71720..c630d7532f7a 100644
--- a/drivers/mtd/chips/cfi_cmdset_0001.c
+++ b/drivers/mtd/chips/cfi_cmdset_0001.c
@@ -4,7 +4,7 @@
  *
  * (C) 2000 Red Hat. GPL'd
  *
- * $Id: cfi_cmdset_0001.c,v 1.164 2004/11/16 18:29:00 dwmw2 Exp $
+ * $Id: cfi_cmdset_0001.c,v 1.165 2005/02/05 02:06:15 nico Exp $
  *
  * 
  * 10/10/2000	Nicolas Pitre <nico@cam.org>
@@ -252,7 +252,8 @@ read_pri_intelext(struct map_info *map, __u16 adr)
 		int nb_parts, i;
 
 		/* Protection Register info */
-		extra_size += (extp->NumProtectionFields - 1) * (4 + 6);
+		extra_size += (extp->NumProtectionFields - 1) *
+			      sizeof(struct cfi_intelext_otpinfo);
 
 		/* Burst Read info */
 		extra_size += 6;
@@ -471,7 +472,8 @@ static int cfi_intelext_partition_fixup(struct mtd_info *mtd,
 		int offs, numregions, numparts, partshift, numvirtchips, i, j;
 
 		/* Protection Register info */
-		offs = (extp->NumProtectionFields - 1) * (4 + 6);
+		offs = (extp->NumProtectionFields - 1) *
+		       sizeof(struct cfi_intelext_otpinfo);
 
 		/* Burst Read info */
 		offs += 6;
diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index 2ed8c585021e..d87dc3fbd4ba 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -1,7 +1,7 @@
 
 /* Common Flash Interface structures 
  * See http://support.intel.com/design/flash/technote/index.htm
- * $Id: cfi.h,v 1.50 2004/11/20 12:46:51 dwmw2 Exp $
+ * $Id: cfi.h,v 1.51 2005/02/05 02:06:16 nico Exp $
  */
 
 #ifndef __MTD_CFI_H__
@@ -148,6 +148,14 @@ struct cfi_pri_intelext {
 	uint8_t  extra[0];
 } __attribute__((packed));
 
+struct cfi_intelext_otpinfo {
+	uint32_t ProtRegAddr;
+	uint16_t FactGroups;
+	uint8_t  FactProtRegSize;
+	uint16_t UserGroups;
+	uint8_t  UserProtRegSize;
+} __attribute__((packed));	/* packed: 4+2+1+2+1 = 10 bytes, replacing the literal (4 + 6) */
+
 struct cfi_intelext_blockinfo {
 	uint16_t NumIdentBlocks;
 	uint16_t BlockSize;
-- 
cgit v1.2.3-59-g8ed1b


From f77814dd5728edaf1239d19755d2aa0d8c33d861 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@cam.org>
Date: Tue, 8 Feb 2005 17:11:19 +0000
Subject: [MTD] Support for protection registers on Intel FLASH chips

This enables support for reading, writing and locking so called
"Protection Registers" present on some flash chips.
A subset of them are pre-programmed at the factory with a
unique set of values. The rest is user-programmable.

Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/chips/Kconfig           |  27 ++-
 drivers/mtd/chips/cfi_cmdset_0001.c | 401 +++++++++++++++++++++++++-----------
 drivers/mtd/mtdpart.c               |  28 ++-
 include/linux/mtd/cfi.h             |   4 +-
 include/linux/mtd/flashchip.h       |   3 +-
 include/linux/mtd/map.h             |  15 +-
 include/linux/mtd/mtd.h             |  10 +-
 include/mtd/mtd-abi.h               |   8 +-
 8 files changed, 369 insertions(+), 127 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/Kconfig b/drivers/mtd/chips/Kconfig
index d682dbc8157e..f4eda1e40d51 100644
--- a/drivers/mtd/chips/Kconfig
+++ b/drivers/mtd/chips/Kconfig
@@ -1,5 +1,5 @@
 # drivers/mtd/chips/Kconfig
-# $Id: Kconfig,v 1.13 2004/12/01 15:49:10 nico Exp $
+# $Id: Kconfig,v 1.14 2005/02/08 17:11:15 nico Exp $
 
 menu "RAM/ROM/Flash chip drivers"
 	depends on MTD!=n
@@ -155,6 +155,31 @@ config MTD_CFI_I8
 	  If your flash chips are interleaved in eights - i.e. you have eight
 	  flash chips addressed by each bus cycle, then say 'Y'.
 
+config MTD_OTP
+	bool "Protection Registers aka one-time programmable (OTP) bits"
+	depends on MTD_CFI_ADV_OPTIONS
+	default n
+	help
+	  This enables support for reading, writing and locking so called
+	  "Protection Registers" present on some flash chips.
+	  A subset of them are pre-programmed at the factory with a
+	  unique set of values. The rest is user-programmable.
+
+	  The user-programmable Protection Registers contain one-time
+	  programmable (OTP) bits; when programmed, register bits cannot be
+	  erased. Each Protection Register can be accessed multiple times to
+	  program individual bits, as long as the register remains unlocked.
+
+	  Each Protection Register has an associated Lock Register bit. When a
+	  Lock Register bit is programmed, the associated Protection Register
+	  can only be read; it can no longer be programmed. Additionally,
+	  because the Lock Register bits themselves are OTP, when programmed,
+	  Lock Register bits cannot be erased. Therefore, when a Protection
+	  Register is locked, it cannot be unlocked.
+
+	  This feature should therefore be used with extreme care. Any mistake
+	  in the programming of OTP bits will waste them.
+
 config MTD_CFI_INTELEXT
 	tristate "Support for Intel/Sharp flash chips"
 	depends on MTD_GEN_PROBE
diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c
index c630d7532f7a..b3f5acf0760c 100644
--- a/drivers/mtd/chips/cfi_cmdset_0001.c
+++ b/drivers/mtd/chips/cfi_cmdset_0001.c
@@ -4,7 +4,7 @@
  *
  * (C) 2000 Red Hat. GPL'd
  *
- * $Id: cfi_cmdset_0001.c,v 1.165 2005/02/05 02:06:15 nico Exp $
+ * $Id: cfi_cmdset_0001.c,v 1.167 2005/02/08 17:11:15 nico Exp $
  *
  * 
  * 10/10/2000	Nicolas Pitre <nico@cam.org>
@@ -48,14 +48,20 @@
 #define M50LPW080       0x002F
 
 static int cfi_intelext_read (struct mtd_info *, loff_t, size_t, size_t *, u_char *);
-//static int cfi_intelext_read_user_prot_reg (struct mtd_info *, loff_t, size_t, size_t *, u_char *);
-//static int cfi_intelext_read_fact_prot_reg (struct mtd_info *, loff_t, size_t, size_t *, u_char *);
 static int cfi_intelext_write_words(struct mtd_info *, loff_t, size_t, size_t *, const u_char *);
 static int cfi_intelext_write_buffers(struct mtd_info *, loff_t, size_t, size_t *, const u_char *);
 static int cfi_intelext_erase_varsize(struct mtd_info *, struct erase_info *);
 static void cfi_intelext_sync (struct mtd_info *);
 static int cfi_intelext_lock(struct mtd_info *mtd, loff_t ofs, size_t len);
 static int cfi_intelext_unlock(struct mtd_info *mtd, loff_t ofs, size_t len);
+static int cfi_intelext_read_fact_prot_reg (struct mtd_info *, loff_t, size_t, size_t *, u_char *);
+static int cfi_intelext_read_user_prot_reg (struct mtd_info *, loff_t, size_t, size_t *, u_char *);
+static int cfi_intelext_write_user_prot_reg (struct mtd_info *, loff_t, size_t, size_t *, u_char *);
+static int cfi_intelext_lock_user_prot_reg (struct mtd_info *, loff_t, size_t);
+static int cfi_intelext_get_fact_prot_info (struct mtd_info *,
+					    struct otp_info *, size_t);
+static int cfi_intelext_get_user_prot_info (struct mtd_info *,
+					    struct otp_info *, size_t);
 static int cfi_intelext_suspend (struct mtd_info *);
 static void cfi_intelext_resume (struct mtd_info *);
 
@@ -423,9 +429,13 @@ static struct mtd_info *cfi_intelext_setup(struct mtd_info *mtd)
 		       mtd->eraseregions[i].numblocks);
 	}
 
-#if 0
-	mtd->read_user_prot_reg = cfi_intelext_read_user_prot_reg;
+#ifdef CONFIG_MTD_OTP
 	mtd->read_fact_prot_reg = cfi_intelext_read_fact_prot_reg;
+	mtd->read_user_prot_reg = cfi_intelext_read_user_prot_reg;
+	mtd->write_user_prot_reg = cfi_intelext_write_user_prot_reg;
+	mtd->lock_user_prot_reg = cfi_intelext_lock_user_prot_reg;
+	mtd->get_fact_prot_info = cfi_intelext_get_fact_prot_info;
+	mtd->get_user_prot_info = cfi_intelext_get_user_prot_info;
 #endif
 
 	/* This function has the potential to distort the reality
@@ -565,7 +575,7 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr
  resettime:
 	timeo = jiffies + HZ;
  retry:
-	if (chip->priv && (mode == FL_WRITING || mode == FL_ERASING)) {
+	if (chip->priv && (mode == FL_WRITING || mode == FL_ERASING || mode == FL_OTP_WRITE)) {
 		/*
 		 * OK. We have possibility for contension on the write/erase
 		 * operations which are global to the real chip and not per
@@ -1178,111 +1188,11 @@ static int cfi_intelext_read (struct mtd_info *mtd, loff_t from, size_t len, siz
 	return ret;
 }
 
-#if 0
-static int __xipram cfi_intelext_read_prot_reg (struct mtd_info *mtd,
-						loff_t from, size_t len,
-						size_t *retlen,
-						u_char *buf,
-						int base_offst, int reg_sz)
-{
-	struct map_info *map = mtd->priv;
-	struct cfi_private *cfi = map->fldrv_priv;
-	struct cfi_pri_intelext *extp = cfi->cmdset_priv;
-	struct flchip *chip;
-	int ofs_factor = cfi->interleave * cfi->device_type;
-	int count = len;
-	int chip_num, offst;
-	int ret;
-
-	chip_num = ((unsigned int)from/reg_sz);
-	offst = from - (reg_sz*chip_num)+base_offst;
-
-	while (count) {
-	/* Calculate which chip & protection register offset we need */
-
-		if (chip_num >= cfi->numchips)
-			goto out;
-
-		chip = &cfi->chips[chip_num];
-		
-		spin_lock(chip->mutex);
-		ret = get_chip(map, chip, chip->start, FL_JEDEC_QUERY);
-		if (ret) {
-			spin_unlock(chip->mutex);
-			return (len-count)?:ret;
-		}
-
-		xip_disable(map, chip, chip->start);
-
-		if (chip->state != FL_JEDEC_QUERY) {
-			map_write(map, CMD(0x90), chip->start);
-			chip->state = FL_JEDEC_QUERY;
-		}
-
-		while (count && ((offst-base_offst) < reg_sz)) {
-			*buf = map_read8(map,(chip->start+((extp->ProtRegAddr+1)*ofs_factor)+offst));
-			buf++;
-			offst++;
-			count--;
-		}
-
-		xip_enable(map, chip, chip->start);
-		put_chip(map, chip, chip->start);
-		spin_unlock(chip->mutex);
-
-		/* Move on to the next chip */
-		chip_num++;
-		offst = base_offst;
-	}
-	
- out:	
-	return len-count;
-}
-	
-static int cfi_intelext_read_user_prot_reg (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf)
-{
-	struct map_info *map = mtd->priv;
-	struct cfi_private *cfi = map->fldrv_priv;
-	struct cfi_pri_intelext *extp=cfi->cmdset_priv;
-	int base_offst,reg_sz;
-	
-	/* Check that we actually have some protection registers */
-	if(!extp || !(extp->FeatureSupport&64)){
-		printk(KERN_WARNING "%s: This flash device has no protection data to read!\n",map->name);
-		return 0;
-	}
-
-	base_offst=(1<<extp->FactProtRegSize);
-	reg_sz=(1<<extp->UserProtRegSize);
-
-	return cfi_intelext_read_prot_reg(mtd, from, len, retlen, buf, base_offst, reg_sz);
-}
-
-static int cfi_intelext_read_fact_prot_reg (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf)
-{
-	struct map_info *map = mtd->priv;
-	struct cfi_private *cfi = map->fldrv_priv;
-	struct cfi_pri_intelext *extp=cfi->cmdset_priv;
-	int base_offst,reg_sz;
-	
-	/* Check that we actually have some protection registers */
-	if(!extp || !(extp->FeatureSupport&64)){
-		printk(KERN_WARNING "%s: This flash device has no protection data to read!\n",map->name);
-		return 0;
-	}
-
-	base_offst=0;
-	reg_sz=(1<<extp->FactProtRegSize);
-
-	return cfi_intelext_read_prot_reg(mtd, from, len, retlen, buf, base_offst, reg_sz);
-}
-#endif
-
 static int __xipram do_write_oneword(struct map_info *map, struct flchip *chip,
-				     unsigned long adr, map_word datum)
+				     unsigned long adr, map_word datum, int mode)
 {
 	struct cfi_private *cfi = map->fldrv_priv;
-	map_word status, status_OK;
+	map_word status, status_OK, write_cmd;
 	unsigned long timeo;
 	int z, ret=0;
 
@@ -1290,9 +1200,14 @@ static int __xipram do_write_oneword(struct map_info *map, struct flchip *chip,
 
 	/* Let's determine this according to the interleave only once */
 	status_OK = CMD(0x80);
+	switch (mode) {
+	case FL_WRITING:   write_cmd = CMD(0x40); break;
+	case FL_OTP_WRITE: write_cmd = CMD(0xc0); break;
+	default: return -EINVAL;
+	}
 
 	spin_lock(chip->mutex);
-	ret = get_chip(map, chip, adr, FL_WRITING);
+	ret = get_chip(map, chip, adr, mode);
 	if (ret) {
 		spin_unlock(chip->mutex);
 		return ret;
@@ -1301,9 +1216,9 @@ static int __xipram do_write_oneword(struct map_info *map, struct flchip *chip,
 	XIP_INVAL_CACHED_RANGE(map, adr, map_bankwidth(map));
 	ENABLE_VPP(map);
 	xip_disable(map, chip, adr);
-	map_write(map, CMD(0x40), adr);
+	map_write(map, write_cmd, adr);
 	map_write(map, datum, adr);
-	chip->state = FL_WRITING;
+	chip->state = mode;
 
 	spin_unlock(chip->mutex);
 	INVALIDATE_CACHED_RANGE(map, adr, map_bankwidth(map));
@@ -1313,7 +1228,7 @@ static int __xipram do_write_oneword(struct map_info *map, struct flchip *chip,
 	timeo = jiffies + (HZ/2);
 	z = 0;
 	for (;;) {
-		if (chip->state != FL_WRITING) {
+		if (chip->state != mode) {
 			/* Someone's suspended the write. Sleep */
 			DECLARE_WAITQUEUE(wait, current);
 
@@ -1401,7 +1316,7 @@ static int cfi_intelext_write_words (struct mtd_info *mtd, loff_t to , size_t le
 		datum = map_word_load_partial(map, datum, buf, gap, n);
 
 		ret = do_write_oneword(map, &cfi->chips[chipnum],
-					       bus_ofs, datum);
+					       bus_ofs, datum, FL_WRITING);
 		if (ret) 
 			return ret;
 
@@ -1422,7 +1337,7 @@ static int cfi_intelext_write_words (struct mtd_info *mtd, loff_t to , size_t le
 		map_word datum = map_word_load(map, buf);
 
 		ret = do_write_oneword(map, &cfi->chips[chipnum],
-				ofs, datum);
+				       ofs, datum, FL_WRITING);
 		if (ret)
 			return ret;
 
@@ -1446,7 +1361,7 @@ static int cfi_intelext_write_words (struct mtd_info *mtd, loff_t to , size_t le
 		datum = map_word_load_partial(map, datum, buf, 0, len);
 
 		ret = do_write_oneword(map, &cfi->chips[chipnum],
-					       ofs, datum);
+				       ofs, datum, FL_WRITING);
 		if (ret) 
 			return ret;
 		
@@ -2036,6 +1951,262 @@ static int cfi_intelext_unlock(struct mtd_info *mtd, loff_t ofs, size_t len)
 	return ret;
 }
 
+#ifdef CONFIG_MTD_OTP
+
+typedef int (*otp_op_t)(struct map_info *map, struct flchip *chip, 
+			u_long data_offset, u_char *buf, u_int size,
+			u_long prot_offset, u_int groupno, u_int groupsize);
+
+static int __xipram
+do_otp_read(struct map_info *map, struct flchip *chip, u_long offset,
+	    u_char *buf, u_int size, u_long prot, u_int grpno, u_int grpsz)
+{
+	struct cfi_private *cfi = map->fldrv_priv;
+	int ret;
+
+	spin_lock(chip->mutex);
+	ret = get_chip(map, chip, chip->start, FL_JEDEC_QUERY);
+	if (ret) {
+		spin_unlock(chip->mutex);
+		return ret;
+	}
+
+	/* let's ensure we're not reading back cached data from array mode */
+	if (map->inval_cache)
+		map->inval_cache(map, chip->start + offset, size);
+
+	xip_disable(map, chip, chip->start);
+	if (chip->state != FL_JEDEC_QUERY) {
+		map_write(map, CMD(0x90), chip->start);
+		chip->state = FL_JEDEC_QUERY;
+	}
+	map_copy_from(map, buf, chip->start + offset, size);
+	xip_enable(map, chip, chip->start);
+
+	/* then ensure we don't keep OTP data in the cache */
+	if (map->inval_cache)
+		map->inval_cache(map, chip->start + offset, size);
+
+	put_chip(map, chip, chip->start);
+	spin_unlock(chip->mutex);
+	return 0;
+}
+
+static int
+do_otp_write(struct map_info *map, struct flchip *chip, u_long offset,
+	     u_char *buf, u_int size, u_long prot, u_int grpno, u_int grpsz)
+{
+	int ret;
+
+	while (size) {
+		unsigned long bus_ofs = offset & ~(map_bankwidth(map)-1);
+		int gap = offset - bus_ofs;
+		int n = min_t(int, size, map_bankwidth(map)-gap);
+		map_word datum = map_word_ff(map);
+
+		datum = map_word_load_partial(map, datum, buf, gap, n);
+		ret = do_write_oneword(map, chip, bus_ofs, datum, FL_OTP_WRITE);
+		if (ret) 
+			return ret;
+
+		offset += n;
+		buf += n;
+		size -= n;
+	}
+
+	return 0;
+}
+
+/* Lock one whole OTP group: clear its bit in the lock word at 'prot'. */
+static int
+do_otp_lock(struct map_info *map, struct flchip *chip, u_long offset,
+	    u_char *buf, u_int size, u_long prot, u_int grpno, u_int grpsz)
+{
+	struct cfi_private *cfi = map->fldrv_priv;
+	map_word datum = map_word_ff(map);
+
+	/* offset is the absolute data address (never 0); a partial or
+	 * unaligned request always arrives with size < grpsz, so test size */
+	if (size != grpsz)
+		return -EXDEV;
+	datum = map_word_clr(map, datum, CMD(1 << grpno));
+	return do_write_oneword(map, chip, prot, datum, FL_OTP_WRITE);
+}
+
+/* Walk the OTP groups of each real chip: apply 'action' to the bytes in
+ * [from, from + len), or fill buf with otp_info records if action is NULL. */
+static int cfi_intelext_otp_walk(struct mtd_info *mtd, loff_t from, size_t len,
+				 size_t *retlen, u_char *buf,
+				 otp_op_t action, int user_regs)
+{
+	struct map_info *map = mtd->priv;
+	struct cfi_private *cfi = map->fldrv_priv;
+	struct cfi_pri_intelext *extp = cfi->cmdset_priv;
+	struct flchip *chip;
+	struct cfi_intelext_otpinfo *otp;
+	u_long devsize, reg_prot_offset, data_offset;
+	u_int chip_num, chip_step, field, reg_fact_size, reg_user_size;
+	u_int groups, groupno, groupsize, reg_fact_groups, reg_user_groups;
+	int ret;
+
+	*retlen = 0;
+
+	/* Check that we actually have some OTP registers */
+	if (!extp || !(extp->FeatureSupport & 64) || !extp->NumProtectionFields)
+		return -ENODATA;
+
+	/* we need real chips here not virtual ones */
+	devsize = (1 << cfi->cfiq->DevSize) * cfi->interleave;
+	chip_step = devsize >> cfi->chipshift;
+
+	for (chip_num = 0; chip_num < cfi->numchips; chip_num += chip_step) {
+		chip = &cfi->chips[chip_num];
+		otp = (struct cfi_intelext_otpinfo *)&extp->extra[0];
+
+		/* first OTP region */
+		field = 0;
+		reg_prot_offset = extp->ProtRegAddr;
+		reg_fact_groups = 1;
+		reg_fact_size = 1 << extp->FactProtRegSize;
+		reg_user_groups = 1;
+		reg_user_size = 1 << extp->UserProtRegSize;
+
+		while (len > 0) {
+			/* flash geometry fixup */
+			data_offset = reg_prot_offset + 1;
+			data_offset *= cfi->interleave * cfi->device_type;
+			reg_prot_offset *= cfi->interleave * cfi->device_type;
+			reg_fact_size *= cfi->interleave;
+			reg_user_size *= cfi->interleave;
+
+			if (user_regs) {
+				groups = reg_user_groups;
+				groupsize = reg_user_size;
+				/* skip over factory reg area */
+				groupno = reg_fact_groups;
+				data_offset += reg_fact_groups * reg_fact_size;
+			} else {
+				groups = reg_fact_groups;
+				groupsize = reg_fact_size;
+				groupno = 0;
+			}
+
+			while (len > 0 && groups > 0) {
+				if (!action) {
+					/* NULL action: fill buf with otp_info
+					 * records.  len is unsigned, so test
+					 * before subtracting to avoid a wrap. */
+					struct otp_info *otpinfo;
+					map_word lockword;
+					if (len <= sizeof(struct otp_info))
+						return -ENOSPC;
+					len -= sizeof(struct otp_info);
+					ret = do_otp_read(map, chip,
+						reg_prot_offset,
+						(u_char *)&lockword,
+						map_bankwidth(map), 0, 0, 0);
+					if (ret)
+						return ret;
+					otpinfo = (struct otp_info *)buf;
+					otpinfo->start = from;
+					otpinfo->length = groupsize;
+					otpinfo->locked = !map_word_bitsset(map,
+						lockword, CMD(1 << groupno));
+					from += groupsize;
+					buf += sizeof(*otpinfo);
+					*retlen += sizeof(*otpinfo);
+				} else if (from >= groupsize) {
+					from -= groupsize;
+					data_offset += groupsize;
+				} else {
+					int size = groupsize - from;
+					data_offset += from;
+					from = 0;
+					if (size > len)
+						size = len;
+					ret = action(map, chip, data_offset,
+						     buf, size, reg_prot_offset,
+						     groupno, groupsize);
+					if (ret < 0)
+						return ret;
+					buf += size;
+					len -= size;
+					*retlen += size;
+					data_offset += size;
+				}
+				groupno++;
+				groups--;
+			}
+
+			/* next OTP region */
+			if (++field == extp->NumProtectionFields)
+				break;
+			reg_prot_offset = otp->ProtRegAddr;
+			reg_fact_groups = otp->FactGroups;
+			reg_fact_size = 1 << otp->FactProtRegSize;
+			reg_user_groups = otp->UserGroups;
+			reg_user_size = 1 << otp->UserProtRegSize;
+			otp++;
+		}
+	}
+
+	return 0;
+}
+
+static int cfi_intelext_read_fact_prot_reg(struct mtd_info *mtd, loff_t from,
+					   size_t len, size_t *retlen,
+					    u_char *buf)
+{
+	return cfi_intelext_otp_walk(mtd, from, len, retlen,
+				     buf, do_otp_read, 0);
+}
+
+static int cfi_intelext_read_user_prot_reg(struct mtd_info *mtd, loff_t from,
+					   size_t len, size_t *retlen,
+					    u_char *buf)
+{
+	return cfi_intelext_otp_walk(mtd, from, len, retlen,
+				     buf, do_otp_read, 1);
+}
+
+static int cfi_intelext_write_user_prot_reg(struct mtd_info *mtd, loff_t from,
+					    size_t len, size_t *retlen,
+					     u_char *buf)
+{
+	return cfi_intelext_otp_walk(mtd, from, len, retlen,
+				     buf, do_otp_write, 1);
+}
+
+static int cfi_intelext_lock_user_prot_reg(struct mtd_info *mtd,
+					   loff_t from, size_t len)
+{
+	size_t retlen;
+	return cfi_intelext_otp_walk(mtd, from, len, &retlen,
+				     NULL, do_otp_lock, 1);
+}
+
+static int cfi_intelext_get_fact_prot_info(struct mtd_info *mtd, 
+					   struct otp_info *buf, size_t len)
+{
+	size_t retlen;
+	int ret;
+
+	ret = cfi_intelext_otp_walk(mtd, 0, len, &retlen, (u_char *)buf, NULL, 0);
+	return ret ? : retlen;
+}
+
+static int cfi_intelext_get_user_prot_info(struct mtd_info *mtd,
+					   struct otp_info *buf, size_t len)
+{
+	size_t retlen;
+	int ret;
+
+	ret = cfi_intelext_otp_walk(mtd, 0, len, &retlen, (u_char *)buf, NULL, 1);
+	return ret ? : retlen;
+}
+
+#endif
+
 static int cfi_intelext_suspend(struct mtd_info *mtd)
 {
 	struct map_info *map = mtd->priv;
diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
index 96ebb52f24b1..b92e6bfffaf2 100644
--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -5,7 +5,7 @@
  *
  * This code is GPL
  *
- * $Id: mtdpart.c,v 1.51 2004/11/16 18:28:59 dwmw2 Exp $
+ * $Id: mtdpart.c,v 1.53 2005/02/08 17:11:13 nico Exp $
  *
  * 	02-21-2002	Thomas Gleixner <gleixner@autronix.de>
  *			added support for read_oob, write_oob
@@ -116,6 +116,13 @@ static int part_read_user_prot_reg (struct mtd_info *mtd, loff_t from, size_t le
 					len, retlen, buf);
 }
 
+static int part_get_user_prot_info (struct mtd_info *mtd,
+				    struct otp_info *buf, size_t len)
+{
+	struct mtd_part *part = PART(mtd);
+	return part->master->get_user_prot_info (part->master, buf, len);
+}
+
 static int part_read_fact_prot_reg (struct mtd_info *mtd, loff_t from, size_t len, 
 			size_t *retlen, u_char *buf)
 {
@@ -124,6 +131,13 @@ static int part_read_fact_prot_reg (struct mtd_info *mtd, loff_t from, size_t le
 					len, retlen, buf);
 }
 
+static int part_get_fact_prot_info (struct mtd_info *mtd,
+				    struct otp_info *buf, size_t len)
+{
+	struct mtd_part *part = PART(mtd);
+	return part->master->get_fact_prot_info (part->master, buf, len);
+}
+
 static int part_write (struct mtd_info *mtd, loff_t to, size_t len,
 			size_t *retlen, const u_char *buf)
 {
@@ -182,6 +196,12 @@ static int part_write_user_prot_reg (struct mtd_info *mtd, loff_t from, size_t l
 					len, retlen, buf);
 }
 
+static int part_lock_user_prot_reg (struct mtd_info *mtd, loff_t from, size_t len) 
+{
+	struct mtd_part *part = PART(mtd);
+	return part->master->lock_user_prot_reg (part->master, from, len);
+}
+
 static int part_writev (struct mtd_info *mtd,  const struct kvec *vecs,
 			 unsigned long count, loff_t to, size_t *retlen)
 {
@@ -409,6 +429,12 @@ int add_mtd_partitions(struct mtd_info *master,
 			slave->mtd.read_fact_prot_reg = part_read_fact_prot_reg;
 		if(master->write_user_prot_reg)
 			slave->mtd.write_user_prot_reg = part_write_user_prot_reg;
+		if(master->lock_user_prot_reg)
+			slave->mtd.lock_user_prot_reg = part_lock_user_prot_reg;
+		if(master->get_user_prot_info)
+			slave->mtd.get_user_prot_info = part_get_user_prot_info;
+		if(master->get_fact_prot_info)
+			slave->mtd.get_fact_prot_info = part_get_fact_prot_info;
 		if (master->sync)
 			slave->mtd.sync = part_sync;
 		if (!i && master->suspend && master->resume) {
diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index d87dc3fbd4ba..76255474a27c 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -1,7 +1,7 @@
 
 /* Common Flash Interface structures 
  * See http://support.intel.com/design/flash/technote/index.htm
- * $Id: cfi.h,v 1.51 2005/02/05 02:06:16 nico Exp $
+ * $Id: cfi.h,v 1.52 2005/02/08 17:11:15 nico Exp $
  */
 
 #ifndef __MTD_CFI_H__
@@ -252,7 +252,7 @@ static inline uint32_t cfi_build_cmd_addr(uint32_t cmd_ofs, int interleave, int
  * It looks too long to be inline, but in the common case it should almost all
  * get optimised away. 
  */
-static inline map_word cfi_build_cmd(u_char cmd, struct map_info *map, struct cfi_private *cfi)
+static inline map_word cfi_build_cmd(u_long cmd, struct map_info *map, struct cfi_private *cfi)
 {
 	map_word val = { {0} };
 	int wordwidth, words_per_bus, chip_mode, chips_per_word;
diff --git a/include/linux/mtd/flashchip.h b/include/linux/mtd/flashchip.h
index c66ba812bf90..e778a1ab23c4 100644
--- a/include/linux/mtd/flashchip.h
+++ b/include/linux/mtd/flashchip.h
@@ -6,7 +6,7 @@
  *
  * (C) 2000 Red Hat. GPLd.
  *
- * $Id: flashchip.h,v 1.15 2004/11/05 22:41:06 nico Exp $
+ * $Id: flashchip.h,v 1.16 2005/02/08 17:11:15 nico Exp $
  *
  */
 
@@ -29,6 +29,7 @@ typedef enum {
 	FL_ERASE_SUSPENDED,
 	FL_WRITING,
 	FL_WRITING_TO_BUFFER,
+	FL_OTP_WRITE,
 	FL_WRITE_SUSPENDING,
 	FL_WRITE_SUSPENDED,
 	FL_PM_SUSPENDED,
diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h
index f0268b99c900..8fc6679aa9b1 100644
--- a/include/linux/mtd/map.h
+++ b/include/linux/mtd/map.h
@@ -1,6 +1,6 @@
 
 /* Overhauled routines for dealing with different mmap regions of flash */
-/* $Id: map.h,v 1.46 2005/01/05 17:09:44 dwmw2 Exp $ */
+/* $Id: map.h,v 1.47 2005/02/08 17:11:15 nico Exp $ */
 
 #ifndef __LINUX_MTD_MAP_H__
 #define __LINUX_MTD_MAP_H__
@@ -263,6 +263,17 @@ static inline map_word map_word_and(struct map_info *map, map_word val1, map_wor
 	return r;
 }
 
+static inline map_word map_word_clr(struct map_info *map, map_word val1, map_word val2)
+{
+	map_word r;
+	int i;
+
+	for (i=0; i<map_words(map); i++) {
+		r.x[i] = val1.x[i] & ~val2.x[i];
+	}
+	return r;
+}
+
 static inline map_word map_word_or(struct map_info *map, map_word val1, map_word val2)
 {
 	map_word r;
@@ -273,6 +284,7 @@ static inline map_word map_word_or(struct map_info *map, map_word val1, map_word
 	}
 	return r;
 }
+
 #define map_word_andequal(m, a, b, z) map_word_equal(m, z, map_word_and(m, a, b))
 
 static inline int map_word_bitsset(struct map_info *map, map_word val1, map_word val2)
@@ -338,6 +350,7 @@ static inline map_word map_word_ff(struct map_info *map)
 	}
 	return r;
 }
+
 static inline map_word inline_map_read(struct map_info *map, unsigned long ofs)
 {
 	map_word r;
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index b3d134392b31..3aab1b8729e0 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -1,5 +1,5 @@
 /* 
- * $Id: mtd.h,v 1.56 2004/08/09 18:46:04 dmarlin Exp $
+ * $Id: mtd.h,v 1.57 2005/02/08 17:11:15 nico Exp $
  *
  * Copyright (C) 1999-2003 David Woodhouse <dwmw2@infradead.org> et al.
  *
@@ -113,12 +113,12 @@ struct mtd_info {
 	 * flash devices. The user data is one time programmable but the
 	 * factory data is read only. 
 	 */
-	int (*read_user_prot_reg) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf);
-
+	int (*get_fact_prot_info) (struct mtd_info *mtd, struct otp_info *buf, size_t len);
 	int (*read_fact_prot_reg) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf);
-
-	/* This function is not yet implemented */
+	int (*get_user_prot_info) (struct mtd_info *mtd, struct otp_info *buf, size_t len);
+	int (*read_user_prot_reg) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf);
 	int (*write_user_prot_reg) (struct mtd_info *mtd, loff_t from, size_t len, size_t *retlen, u_char *buf);
+	int (*lock_user_prot_reg) (struct mtd_info *mtd, loff_t from, size_t len);
 
 	/* kvec-based read/write methods. We need these especially for NAND flash,
 	   with its limited number of write cycles per erase.
diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h
index a76ab898f445..091eb571e993 100644
--- a/include/mtd/mtd-abi.h
+++ b/include/mtd/mtd-abi.h
@@ -1,5 +1,5 @@
 /*
- * $Id: mtd-abi.h,v 1.7 2004/11/23 15:37:32 gleixner Exp $
+ * $Id: mtd-abi.h,v 1.8 2005/02/08 17:11:16 nico Exp $
  *
  * Portions of MTD ABI definition which are shared by kernel and user space 
  */
@@ -80,6 +80,12 @@ struct region_info_user {
 	uint32_t regionindex;
 };
 
+struct otp_info {
+	uint32_t start;
+	uint32_t length;
+	uint32_t locked;
+};
+
 #define MEMGETINFO              _IOR('M', 1, struct mtd_info_user)
 #define MEMERASE                _IOW('M', 2, struct erase_info_user)
 #define MEMWRITEOOB             _IOWR('M', 3, struct mtd_oob_buf)
-- 
cgit v1.2.3-59-g8ed1b


From 8f15fd55f9bf266139b10850947e19c4e3f4e9b7 Mon Sep 17 00:00:00 2001
From: Andrew Victor <andrew@sanpeople.com>
Date: Wed, 9 Feb 2005 09:17:45 +0000
Subject: [JFFS2] Add support for JFFS2-on-Dataflash devices.

For Dataflash, can_mark_obsolete = false and the NAND write buffering
code (wbuf.c) is used.

Since the DataFlash chip will automatically erase pages when writing,
the cleanmarkers are not needed - so cleanmarker_oob = false and
cleanmarker_size = 0

DataFlash page-sizes are not a power of two (they're multiples of 528
bytes).  The SECTOR_ADDR macro (added in the previous core patch) is
replaced with a (slower) div/mod version if CONFIG_JFFS2_FS_DATAFLASH is
selected.

Signed-off-by: Andrew Victor <andrew@sanpeople.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/Kconfig                  |  7 +++++++
 fs/jffs2/Makefile           |  3 ++-
 fs/jffs2/erase.c            | 13 ++++++++++---
 fs/jffs2/fs.c               | 21 ++++++++++++++++++++-
 fs/jffs2/os-linux.h         | 18 ++++++++++++++++--
 fs/jffs2/scan.c             | 11 +++++++----
 fs/jffs2/wbuf.c             | 35 ++++++++++++++++++++++++++++++++---
 include/linux/jffs2_fs_sb.h |  4 ++--
 include/mtd/mtd-abi.h       |  3 ++-
 9 files changed, 98 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index 6a4ad4bb7a54..07835d24c785 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1084,6 +1084,13 @@ config JFFS2_FS_NOR_ECC
           ECC for JFFS2. This type of flash chip is not common, however it is
           available from ST Microelectronics.
 
+config JFFS2_FS_DATAFLASH
+	bool "JFFS2 support for DataFlash (EXPERIMENTAL)"
+	depends on JFFS2_FS && EXPERIMENTAL
+	default n
+	help
+	  This enables the experimental support for JFFS2 on DataFlash devices.
+
 config JFFS2_COMPRESSION_OPTIONS
 	bool "Advanced compression options for JFFS2"
 	depends on JFFS2_FS
diff --git a/fs/jffs2/Makefile b/fs/jffs2/Makefile
index e3c38ccf9c7d..6c2ebe176b40 100644
--- a/fs/jffs2/Makefile
+++ b/fs/jffs2/Makefile
@@ -1,7 +1,7 @@
 #
 # Makefile for the Linux Journalling Flash File System v2 (JFFS2)
 #
-# $Id: Makefile.common,v 1.7 2004/11/03 12:57:38 jwboyer Exp $
+# $Id: Makefile.common,v 1.8 2005/02/09 09:17:40 pavlov Exp $
 #
 
 obj-$(CONFIG_JFFS2_FS) += jffs2.o
@@ -13,6 +13,7 @@ jffs2-y	+= super.o
 
 jffs2-$(CONFIG_JFFS2_FS_NAND)	+= wbuf.o
 jffs2-$(CONFIG_JFFS2_FS_NOR_ECC) += wbuf.o
+jffs2-$(CONFIG_JFFS2_FS_DATAFLASH) += wbuf.o
 jffs2-$(CONFIG_JFFS2_RUBIN)	+= compr_rubin.o
 jffs2-$(CONFIG_JFFS2_RTIME)	+= compr_rtime.o
 jffs2-$(CONFIG_JFFS2_ZLIB)	+= compr_zlib.o
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index ae858f878875..a3c6cc150497 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -7,7 +7,7 @@
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
- * $Id: erase.c,v 1.70 2005/02/09 09:09:01 pavlov Exp $
+ * $Id: erase.c,v 1.71 2005/02/09 09:17:40 pavlov Exp $
  *
  */
 
@@ -310,7 +310,7 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
 	int ret;
 	uint32_t bad_offset;
 
-	if (!jffs2_cleanmarker_oob(c)) {
+	if ((!jffs2_cleanmarker_oob(c)) && (c->cleanmarker_size > 0)) {
 		marker_ref = jffs2_alloc_raw_node_ref();
 		if (!marker_ref) {
 			printk(KERN_WARNING "Failed to allocate raw node ref for clean marker\n");
@@ -351,7 +351,7 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
 					bad_offset += i;
 					printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08x\n", datum, bad_offset);
 				bad: 
-					if (!jffs2_cleanmarker_oob(c))
+					if ((!jffs2_cleanmarker_oob(c)) && (c->cleanmarker_size > 0))
 						jffs2_free_raw_node_ref(marker_ref);
 					kfree(ebuf);
 				bad2:
@@ -383,6 +383,13 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
 			
 		jeb->first_node = jeb->last_node = NULL;
 
+		jeb->free_size = c->sector_size;
+		jeb->used_size = 0;
+		jeb->dirty_size = 0;
+		jeb->wasted_size = 0;
+	} else if (c->cleanmarker_size == 0) {
+		jeb->first_node = jeb->last_node = NULL;
+
 		jeb->free_size = c->sector_size;
 		jeb->used_size = 0;
 		jeb->dirty_size = 0;
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 30ab233fe423..5b7c960a0475 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -7,7 +7,7 @@
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
- * $Id: fs.c,v 1.51 2004/11/28 12:19:37 dedekind Exp $
+ * $Id: fs.c,v 1.52 2005/02/09 09:17:40 pavlov Exp $
  *
  */
 
@@ -456,6 +456,12 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 		return -EINVAL;
 	}
 #endif
+#ifndef CONFIG_JFFS2_FS_DATAFLASH
+	if (c->mtd->type == MTD_DATAFLASH) {
+		printk(KERN_ERR "jffs2: Cannot operate on DataFlash unless jffs2 DataFlash support is compiled in.\n");
+		return -EINVAL;
+	}
+#endif
 
 	c->flash_size = c->mtd->size;
 
@@ -661,6 +667,14 @@ static int jffs2_flash_setup(struct jffs2_sb_info *c) {
 		if (ret)
 			return ret;
 	}
+	
+	/* and DataFlash */
+	if (jffs2_dataflash(c)) {
+		ret = jffs2_dataflash_setup(c);
+		if (ret)
+			return ret;
+	}
+	
 	return ret;
 }
 
@@ -674,4 +688,9 @@ void jffs2_flash_cleanup(struct jffs2_sb_info *c) {
 	if (jffs2_nor_ecc(c)) {
 		jffs2_nor_ecc_flash_cleanup(c);
 	}
+	
+	/* and DataFlash */
+	if (jffs2_dataflash(c)) {
+		jffs2_dataflash_cleanup(c);
+	}
 }
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 0412416d1f2d..af27b84007a1 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -7,7 +7,7 @@
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
- * $Id: os-linux.h,v 1.52 2005/02/09 09:09:01 pavlov Exp $
+ * $Id: os-linux.h,v 1.53 2005/02/09 09:17:41 pavlov Exp $
  *
  */
 
@@ -97,12 +97,16 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 #endif
 }
 
+#ifdef CONFIG_JFFS2_FS_DATAFLASH
+#define SECTOR_ADDR(x) ( ((unsigned long)(x) / (unsigned long)(c->sector_size)) * c->sector_size )
+#else
 #define SECTOR_ADDR(x) ( ((unsigned long)(x) & ~(c->sector_size-1)) )
+#endif
 
 #define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & MS_RDONLY)
 #define jffs2_is_writebuffered(c) (c->wbuf != NULL)
 
-#if (!defined CONFIG_JFFS2_FS_NAND && !defined CONFIG_JFFS2_FS_NOR_ECC)
+#if (!defined CONFIG_JFFS2_FS_NAND && !defined CONFIG_JFFS2_FS_NOR_ECC && !defined CONFIG_JFFS2_FS_DATAFLASH)
 #define jffs2_can_mark_obsolete(c) (1)
 #define jffs2_cleanmarker_oob(c) (0)
 #define jffs2_write_nand_cleanmarker(c,jeb) (-EIO)
@@ -119,6 +123,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 #define jffs2_wbuf_timeout NULL
 #define jffs2_wbuf_process NULL
 #define jffs2_nor_ecc(c) (0)
+#define jffs2_dataflash(c) (0)
 #define jffs2_nor_ecc_flash_setup(c) (0)
 #define jffs2_nor_ecc_flash_cleanup(c) do {} while (0)
 
@@ -154,6 +159,15 @@ void jffs2_nor_ecc_flash_cleanup(struct jffs2_sb_info *c);
 #define jffs2_nor_ecc_flash_setup(c) (0)
 #define jffs2_nor_ecc_flash_cleanup(c) do {} while (0)
 #endif /* NOR ECC */
+#ifdef CONFIG_JFFS2_FS_DATAFLASH
+#define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH)
+int jffs2_dataflash_setup(struct jffs2_sb_info *c);
+void jffs2_dataflash_cleanup(struct jffs2_sb_info *c);
+#else
+#define jffs2_dataflash(c) (0)
+#define jffs2_dataflash_setup(c) (0)
+#define jffs2_dataflash_cleanup(c) do {} while (0)
+#endif /* DATAFLASH */
 #endif /* NAND */
 
 /* erase.c */
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 76859ff53437..e8c43746c82e 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -7,7 +7,7 @@
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
- * $Id: scan.c,v 1.116 2005/02/09 09:09:02 pavlov Exp $
+ * $Id: scan.c,v 1.117 2005/02/09 09:17:41 pavlov Exp $
  *
  */
 #include <linux/kernel.h>
@@ -68,7 +68,7 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
 static inline int min_free(struct jffs2_sb_info *c)
 {
 	uint32_t min = 2 * sizeof(struct jffs2_raw_inode);
-#if defined CONFIG_JFFS2_FS_NAND || defined CONFIG_JFFS2_FS_NOR_ECC
+#if defined CONFIG_JFFS2_FS_NAND || defined CONFIG_JFFS2_FS_NOR_ECC || defined CONFIG_JFFS2_FS_DATAFLASH
 	if (!jffs2_can_mark_obsolete(c) && min < c->wbuf_pagesize)
 		return c->wbuf_pagesize;
 #endif
@@ -228,7 +228,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 		c->dirty_size -= c->nextblock->dirty_size;
 		c->nextblock->dirty_size = 0;
 	}
-#if defined CONFIG_JFFS2_FS_NAND || defined CONFIG_JFFS2_FS_NOR_ECC
+#if defined CONFIG_JFFS2_FS_NAND || defined CONFIG_JFFS2_FS_NOR_ECC || defined CONFIG_JFFS2_FS_DATAFLASH
 	if (!jffs2_can_mark_obsolete(c) && c->nextblock && (c->nextblock->free_size & (c->wbuf_pagesize-1))) {
 		/* If we're going to start writing into a block which already 
 		   contains data, and the end of the data isn't page-aligned,
@@ -351,7 +351,10 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 		}
 #endif
 		D1(printk(KERN_DEBUG "Block at 0x%08x is empty (erased)\n", jeb->offset));
-		return BLK_STATE_ALLFF;	/* OK to erase if all blocks are like this */
+		if (c->cleanmarker_size == 0)
+			return BLK_STATE_CLEANMARKER;	/* don't bother with re-erase */
+		else
+			return BLK_STATE_ALLFF;	/* OK to erase if all blocks are like this */
 	}
 	if (ofs) {
 		D1(printk(KERN_DEBUG "Free space at %08x ends at %08x\n", jeb->offset,
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 894dea88678d..a35e007e5bf8 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -9,7 +9,7 @@
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
- * $Id: wbuf.c,v 1.87 2005/02/09 09:09:02 pavlov Exp $
+ * $Id: wbuf.c,v 1.88 2005/02/09 09:17:41 pavlov Exp $
  *
  */
 
@@ -435,7 +435,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 	   if we have a switch to next page, we will not have
 	   enough remaining space for this. 
 	*/
-	if (pad) {
+	if (pad && !jffs2_dataflash(c)) {
 		c->wbuf_len = PAD(c->wbuf_len);
 
 		/* Pad with JFFS2_DIRTY_BITMASK initially.  this helps out ECC'd NOR
@@ -486,7 +486,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad)
 	spin_lock(&c->erase_completion_lock);
 
 	/* Adjust free size of the block if we padded. */
-	if (pad) {
+	if (pad && !jffs2_dataflash(c)) {
 		struct jffs2_eraseblock *jeb;
 
 		jeb = &c->blocks[c->wbuf_ofs / c->sector_size];
@@ -604,8 +604,14 @@ int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c)
 	return ret;
 }
 
+#ifdef CONFIG_JFFS2_FS_DATAFLASH
+#define PAGE_DIV(x) ( ((unsigned long)(x) / (unsigned long)(c->wbuf_pagesize)) * (unsigned long)(c->wbuf_pagesize) )
+#define PAGE_MOD(x) ( (unsigned long)(x) % (unsigned long)(c->wbuf_pagesize) )
+#else
 #define PAGE_DIV(x) ( (x) & (~(c->wbuf_pagesize - 1)) )
 #define PAGE_MOD(x) ( (x) & (c->wbuf_pagesize - 1) )
+#endif
+
 int jffs2_flash_writev(struct jffs2_sb_info *c, const struct kvec *invecs, unsigned long count, loff_t to, size_t *retlen, uint32_t ino)
 {
 	struct kvec outvecs[3];
@@ -1192,6 +1198,29 @@ void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c)
 	kfree(c->wbuf);
 }
 
+#ifdef CONFIG_JFFS2_FS_DATAFLASH
+int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
+	c->cleanmarker_size = 0;		/* No cleanmarkers needed */
+	
+	/* Initialize write buffer */
+	init_rwsem(&c->wbuf_sem);
+	c->wbuf_pagesize = c->sector_size;
+	c->wbuf_ofs = 0xFFFFFFFF;
+
+	c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
+	if (!c->wbuf)
+		return -ENOMEM;
+
+	printk(KERN_INFO "JFFS2 write-buffering enabled (%i)\n", c->wbuf_pagesize);
+
+	return 0;
+}
+
+void jffs2_dataflash_cleanup(struct jffs2_sb_info *c) {
+	kfree(c->wbuf);
+}
+#endif
+
 #ifdef CONFIG_JFFS2_FS_NOR_ECC
 int jffs2_nor_ecc_flash_setup(struct jffs2_sb_info *c) {
 	/* Cleanmarker is actually larger on the flashes */
diff --git a/include/linux/jffs2_fs_sb.h b/include/linux/jffs2_fs_sb.h
index 4afc8d8c2e9e..faec29559fed 100644
--- a/include/linux/jffs2_fs_sb.h
+++ b/include/linux/jffs2_fs_sb.h
@@ -1,4 +1,4 @@
-/* $Id: jffs2_fs_sb.h,v 1.48 2004/11/20 10:41:12 dwmw2 Exp $ */
+/* $Id: jffs2_fs_sb.h,v 1.49 2005/02/09 09:17:41 pavlov Exp $ */
 
 #ifndef _JFFS2_FS_SB
 #define _JFFS2_FS_SB
@@ -94,7 +94,7 @@ struct jffs2_sb_info {
 	   to an obsoleted node. I don't like this. Alternatives welcomed. */
 	struct semaphore erase_free_sem;
 
-#if defined CONFIG_JFFS2_FS_NAND || defined CONFIG_JFFS2_FS_NOR_ECC
+#if defined CONFIG_JFFS2_FS_NAND || defined CONFIG_JFFS2_FS_NOR_ECC || defined CONFIG_JFFS2_FS_DATAFLASH
 	/* Write-behind buffer for NAND flash */
 	unsigned char *wbuf;
 	uint32_t wbuf_ofs;
diff --git a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h
index c984cb2c9413..cacb9842b195 100644
--- a/include/mtd/mtd-abi.h
+++ b/include/mtd/mtd-abi.h
@@ -1,5 +1,5 @@
 /*
- * $Id: mtd-abi.h,v 1.9 2005/02/08 17:45:52 nico Exp $
+ * $Id: mtd-abi.h,v 1.10 2005/02/09 09:17:42 pavlov Exp $
  *
  * Portions of MTD ABI definition which are shared by kernel and user space 
  */
@@ -29,6 +29,7 @@ struct mtd_oob_buf {
 #define MTD_NORFLASH		3
 #define MTD_NANDFLASH		4
 #define MTD_PEROM		5
+#define MTD_DATAFLASH		6
 #define MTD_OTHER		14
 #define MTD_UNKNOWN		15
 
-- 
cgit v1.2.3-59-g8ed1b


From 2f82ce1eb637c06dfc60f095cd1891ae0ba4894c Mon Sep 17 00:00:00 2001
From: Andrew Victor <andrew@sanpeople.com>
Date: Wed, 9 Feb 2005 09:24:26 +0000
Subject: [JFFS2] Use a single config option for write buffer support

This patch replaces the current CONFIG_JFFS2_FS_NAND, CONFIG_JFFS2_FS_NOR_ECC
and CONFIG_JFFS2_FS_DATAFLASH with a single configuration option -
CONFIG_JFFS2_FS_WRITEBUFFER.

The only functional change of this patch is that the slower div/mod
calculations for SECTOR_ADDR(), PAGE_DIV() and PAGE_MOD() are now always
used when CONFIG_JFFS2_FS_WRITEBUFFER is enabled.

Signed-off-by: Andrew Victor <andrew@sanpeople.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/Kconfig                  | 33 +++++++++------------------------
 fs/jffs2/Makefile           |  6 ++----
 fs/jffs2/fs.c               |  6 ++----
 fs/jffs2/nodelist.h         |  4 ++--
 fs/jffs2/os-linux.h         | 28 ++++++++--------------------
 fs/jffs2/scan.c             | 12 ++++++------
 fs/jffs2/super.c            |  4 ++--
 fs/jffs2/wbuf.c             |  8 ++------
 include/linux/jffs2_fs_sb.h |  4 ++--
 9 files changed, 35 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index 07835d24c785..475769c25d64 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1063,33 +1063,18 @@ config JFFS2_FS_DEBUG
 	  If reporting bugs, please try to have available a full dump of the
 	  messages at debug level 1 while the misbehaviour was occurring.
 
-config JFFS2_FS_NAND
-	bool "JFFS2 support for NAND flash"
+config JFFS2_FS_WRITEBUFFER
+	bool "JFFS2 write-buffering support"
 	depends on JFFS2_FS
-	default n
+	default y
 	help
-	  This enables the support for NAND flash in JFFS2. NAND is a newer
-	  type of flash chip design than the traditional NOR flash, with
-	  higher density but a handful of characteristics which make it more
-	  interesting for the file system to use.
+	  This enables the write-buffering support in JFFS2.
 
-	  Say 'N' unless you have NAND flash.
-
-config JFFS2_FS_NOR_ECC
-        bool "JFFS2 support for ECC'd NOR flash (EXPERIMENTAL)"
-        depends on JFFS2_FS && EXPERIMENTAL
-        default n
-        help
-          This enables the experimental support for NOR flash with transparent
-          ECC for JFFS2. This type of flash chip is not common, however it is
-          available from ST Microelectronics.
-
-config JFFS2_FS_DATAFLASH
-	bool "JFFS2 support for DataFlash (EXPERIMENTAL)"
-	depends on JFFS2_FS && EXPERIMENTAL
-	default n
-	help
-	  This enables the experimental support for JFFS2 on DataFlash devices.
+	  This functionality is required to support JFFS2 on the following
+	  types of flash devices:
+	    - NAND flash
+	    - NOR flash with transparent ECC
+	    - DataFlash
 
 config JFFS2_COMPRESSION_OPTIONS
 	bool "Advanced compression options for JFFS2"
diff --git a/fs/jffs2/Makefile b/fs/jffs2/Makefile
index 6c2ebe176b40..f1afe681ecd6 100644
--- a/fs/jffs2/Makefile
+++ b/fs/jffs2/Makefile
@@ -1,7 +1,7 @@
 #
 # Makefile for the Linux Journalling Flash File System v2 (JFFS2)
 #
-# $Id: Makefile.common,v 1.8 2005/02/09 09:17:40 pavlov Exp $
+# $Id: Makefile.common,v 1.9 2005/02/09 09:23:53 pavlov Exp $
 #
 
 obj-$(CONFIG_JFFS2_FS) += jffs2.o
@@ -11,9 +11,7 @@ jffs2-y	+= read.o nodemgmt.o readinode.o write.o scan.o gc.o
 jffs2-y	+= symlink.o build.o erase.o background.o fs.o writev.o
 jffs2-y	+= super.o
 
-jffs2-$(CONFIG_JFFS2_FS_NAND)	+= wbuf.o
-jffs2-$(CONFIG_JFFS2_FS_NOR_ECC) += wbuf.o
-jffs2-$(CONFIG_JFFS2_FS_DATAFLASH) += wbuf.o
+jffs2-$(CONFIG_JFFS2_FS_WRITEBUFFER)	+= wbuf.o
 jffs2-$(CONFIG_JFFS2_RUBIN)	+= compr_rubin.o
 jffs2-$(CONFIG_JFFS2_RTIME)	+= compr_rtime.o
 jffs2-$(CONFIG_JFFS2_ZLIB)	+= compr_zlib.o
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 5b7c960a0475..c91c66e5e869 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -7,7 +7,7 @@
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
- * $Id: fs.c,v 1.52 2005/02/09 09:17:40 pavlov Exp $
+ * $Id: fs.c,v 1.53 2005/02/09 09:23:53 pavlov Exp $
  *
  */
 
@@ -450,13 +450,11 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 
 	c = JFFS2_SB_INFO(sb);
 
-#ifndef CONFIG_JFFS2_FS_NAND
+#ifndef CONFIG_JFFS2_FS_WRITEBUFFER
 	if (c->mtd->type == MTD_NANDFLASH) {
 		printk(KERN_ERR "jffs2: Cannot operate on NAND flash unless jffs2 NAND support is compiled in.\n");
 		return -EINVAL;
 	}
-#endif
-#ifndef CONFIG_JFFS2_FS_DATAFLASH
 	if (c->mtd->type == MTD_DATAFLASH) {
 		printk(KERN_ERR "jffs2: Cannot operate on DataFlash unless jffs2 DataFlash support is compiled in.\n");
 		return -EINVAL;
diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h
index a4864d05ea92..8c122838bf6d 100644
--- a/fs/jffs2/nodelist.h
+++ b/fs/jffs2/nodelist.h
@@ -7,7 +7,7 @@
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
- * $Id: nodelist.h,v 1.126 2004/11/19 15:06:29 dedekind Exp $
+ * $Id: nodelist.h,v 1.127 2005/02/09 09:23:53 pavlov Exp $
  *
  */
 
@@ -462,7 +462,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c);
 /* erase.c */
 void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count);
 
-#ifdef CONFIG_JFFS2_FS_NAND
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 /* wbuf.c */
 int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino);
 int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c);
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index af27b84007a1..8989cd685e46 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -7,7 +7,7 @@
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
- * $Id: os-linux.h,v 1.53 2005/02/09 09:17:41 pavlov Exp $
+ * $Id: os-linux.h,v 1.54 2005/02/09 09:23:53 pavlov Exp $
  *
  */
 
@@ -97,16 +97,12 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 #endif
 }
 
-#ifdef CONFIG_JFFS2_FS_DATAFLASH
-#define SECTOR_ADDR(x) ( ((unsigned long)(x) / (unsigned long)(c->sector_size)) * c->sector_size )
-#else
-#define SECTOR_ADDR(x) ( ((unsigned long)(x) & ~(c->sector_size-1)) )
-#endif
 
 #define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & MS_RDONLY)
 #define jffs2_is_writebuffered(c) (c->wbuf != NULL)
 
-#if (!defined CONFIG_JFFS2_FS_NAND && !defined CONFIG_JFFS2_FS_NOR_ECC && !defined CONFIG_JFFS2_FS_DATAFLASH)
+#ifndef CONFIG_JFFS2_FS_WRITEBUFFER
+#define SECTOR_ADDR(x) ( ((unsigned long)(x) & ~(c->sector_size-1)) )
 #define jffs2_can_mark_obsolete(c) (1)
 #define jffs2_cleanmarker_oob(c) (0)
 #define jffs2_write_nand_cleanmarker(c,jeb) (-EIO)
@@ -129,6 +125,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 
 #else /* NAND and/or ECC'd NOR support present */
 
+#define SECTOR_ADDR(x) ( ((unsigned long)(x) / (unsigned long)(c->sector_size)) * c->sector_size )
 #define jffs2_can_mark_obsolete(c) ((c->mtd->type == MTD_NORFLASH && !(c->mtd->flags & MTD_ECC)) || c->mtd->type == MTD_RAM)
 #define jffs2_cleanmarker_oob(c) (c->mtd->type == MTD_NANDFLASH)
 
@@ -150,25 +147,16 @@ int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino);
 int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c);
 int jffs2_nand_flash_setup(struct jffs2_sb_info *c);
 void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c);
-#ifdef CONFIG_JFFS2_FS_NOR_ECC
+
 #define jffs2_nor_ecc(c) (c->mtd->type == MTD_NORFLASH && (c->mtd->flags & MTD_ECC))
 int jffs2_nor_ecc_flash_setup(struct jffs2_sb_info *c);
 void jffs2_nor_ecc_flash_cleanup(struct jffs2_sb_info *c);
-#else
-#define jffs2_nor_ecc(c) (0)
-#define jffs2_nor_ecc_flash_setup(c) (0)
-#define jffs2_nor_ecc_flash_cleanup(c) do {} while (0)
-#endif /* NOR ECC */
-#ifdef CONFIG_JFFS2_FS_DATAFLASH
+
 #define jffs2_dataflash(c) (c->mtd->type == MTD_DATAFLASH)
 int jffs2_dataflash_setup(struct jffs2_sb_info *c);
 void jffs2_dataflash_cleanup(struct jffs2_sb_info *c);
-#else
-#define jffs2_dataflash(c) (0)
-#define jffs2_dataflash_setup(c) (0)
-#define jffs2_dataflash_cleanup(c) do {} while (0)
-#endif /* DATAFLASH */
-#endif /* NAND */
+
+#endif /* WRITEBUFFER */
 
 /* erase.c */
 static inline void jffs2_erase_pending_trigger(struct jffs2_sb_info *c)
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index e8c43746c82e..bc6c99980026 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -7,7 +7,7 @@
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
- * $Id: scan.c,v 1.117 2005/02/09 09:17:41 pavlov Exp $
+ * $Id: scan.c,v 1.118 2005/02/09 09:23:53 pavlov Exp $
  *
  */
 #include <linux/kernel.h>
@@ -68,7 +68,7 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
 static inline int min_free(struct jffs2_sb_info *c)
 {
 	uint32_t min = 2 * sizeof(struct jffs2_raw_inode);
-#if defined CONFIG_JFFS2_FS_NAND || defined CONFIG_JFFS2_FS_NOR_ECC || defined CONFIG_JFFS2_FS_DATAFLASH
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 	if (!jffs2_can_mark_obsolete(c) && min < c->wbuf_pagesize)
 		return c->wbuf_pagesize;
 #endif
@@ -228,7 +228,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c)
 		c->dirty_size -= c->nextblock->dirty_size;
 		c->nextblock->dirty_size = 0;
 	}
-#if defined CONFIG_JFFS2_FS_NAND || defined CONFIG_JFFS2_FS_NOR_ECC || defined CONFIG_JFFS2_FS_DATAFLASH
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 	if (!jffs2_can_mark_obsolete(c) && c->nextblock && (c->nextblock->free_size & (c->wbuf_pagesize-1))) {
 		/* If we're going to start writing into a block which already 
 		   contains data, and the end of the data isn't page-aligned,
@@ -294,7 +294,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 	uint32_t hdr_crc, buf_ofs, buf_len;
 	int err;
 	int noise = 0;
-#ifdef CONFIG_JFFS2_FS_NAND
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 	int cleanmarkerfound = 0;
 #endif
 
@@ -303,7 +303,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 
 	D1(printk(KERN_DEBUG "jffs2_scan_eraseblock(): Scanning block at 0x%x\n", ofs));
 
-#ifdef CONFIG_JFFS2_FS_NAND
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 	if (jffs2_cleanmarker_oob(c)) {
 		int ret = jffs2_check_nand_cleanmarker(c, jeb);
 		D2(printk(KERN_NOTICE "jffs_check_nand_cleanmarker returned %d\n",ret));
@@ -338,7 +338,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
 		ofs += 4;
 
 	if (ofs == EMPTY_SCAN_SIZE(c->sector_size)) {
-#ifdef CONFIG_JFFS2_FS_NAND
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 		if (jffs2_cleanmarker_oob(c)) {
 			/* scan oob, take care of cleanmarker */
 			int ret = jffs2_check_oob_empty(c, jeb, cleanmarkerfound);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 6b2a441d2766..3bfc121a4674 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -7,7 +7,7 @@
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
- * $Id: super.c,v 1.104 2004/11/23 15:37:31 gleixner Exp $
+ * $Id: super.c,v 1.105 2005/02/09 09:23:54 pavlov Exp $
  *
  */
 
@@ -309,7 +309,7 @@ static int __init init_jffs2_fs(void)
 	int ret;
 
 	printk(KERN_INFO "JFFS2 version 2.2."
-#ifdef CONFIG_JFFS2_FS_NAND
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 	       " (NAND)"
 #endif
 	       " (C) 2001-2003 Red Hat, Inc.\n");
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index a35e007e5bf8..890258505a7f 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -9,7 +9,7 @@
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
- * $Id: wbuf.c,v 1.88 2005/02/09 09:17:41 pavlov Exp $
+ * $Id: wbuf.c,v 1.89 2005/02/09 09:23:54 pavlov Exp $
  *
  */
 
@@ -604,7 +604,7 @@ int jffs2_flush_wbuf_pad(struct jffs2_sb_info *c)
 	return ret;
 }
 
-#ifdef CONFIG_JFFS2_FS_DATAFLASH
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 #define PAGE_DIV(x) ( ((unsigned long)(x) / (unsigned long)(c->wbuf_pagesize)) * (unsigned long)(c->wbuf_pagesize) )
 #define PAGE_MOD(x) ( (unsigned long)(x) % (unsigned long)(c->wbuf_pagesize) )
 #else
@@ -1198,7 +1198,6 @@ void jffs2_nand_flash_cleanup(struct jffs2_sb_info *c)
 	kfree(c->wbuf);
 }
 
-#ifdef CONFIG_JFFS2_FS_DATAFLASH
 int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
 	c->cleanmarker_size = 0;		/* No cleanmarkers needed */
 	
@@ -1219,9 +1218,7 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
 void jffs2_dataflash_cleanup(struct jffs2_sb_info *c) {
 	kfree(c->wbuf);
 }
-#endif
 
-#ifdef CONFIG_JFFS2_FS_NOR_ECC
 int jffs2_nor_ecc_flash_setup(struct jffs2_sb_info *c) {
 	/* Cleanmarker is actually larger on the flashes */
 	c->cleanmarker_size = 16;
@@ -1241,4 +1238,3 @@ int jffs2_nor_ecc_flash_setup(struct jffs2_sb_info *c) {
 void jffs2_nor_ecc_flash_cleanup(struct jffs2_sb_info *c) {
 	kfree(c->wbuf);
 }
-#endif
diff --git a/include/linux/jffs2_fs_sb.h b/include/linux/jffs2_fs_sb.h
index faec29559fed..1bd6cdfb7d78 100644
--- a/include/linux/jffs2_fs_sb.h
+++ b/include/linux/jffs2_fs_sb.h
@@ -1,4 +1,4 @@
-/* $Id: jffs2_fs_sb.h,v 1.49 2005/02/09 09:17:41 pavlov Exp $ */
+/* $Id: jffs2_fs_sb.h,v 1.50 2005/02/09 09:23:55 pavlov Exp $ */
 
 #ifndef _JFFS2_FS_SB
 #define _JFFS2_FS_SB
@@ -94,7 +94,7 @@ struct jffs2_sb_info {
 	   to an obsoleted node. I don't like this. Alternatives welcomed. */
 	struct semaphore erase_free_sem;
 
-#if defined CONFIG_JFFS2_FS_NAND || defined CONFIG_JFFS2_FS_NOR_ECC || defined CONFIG_JFFS2_FS_DATAFLASH
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 	/* Write-behind buffer for NAND flash */
 	unsigned char *wbuf;
 	uint32_t wbuf_ofs;
-- 
cgit v1.2.3-59-g8ed1b


From 0040bf382c77414739c933e4d2ee35ff817d0b99 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 9 Feb 2005 12:20:00 +0000
Subject: [MTD] NAND: Skip bad block table scan on request

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/nand/nand_base.c | 6 +++++-
 include/linux/mtd/nand.h     | 5 +++--
 2 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 7094dd5716dc..99abd615a467 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -59,7 +59,7 @@
  *	The AG-AND chips have nice features for speed improvement,
  *	which are not supported yet. Read / program 4 pages in one go.
  *
- * $Id: nand_base.c,v 1.130 2005/01/24 03:07:43 dmarlin Exp $
+ * $Id: nand_base.c,v 1.131 2005/02/09 12:19:56 gleixner Exp $
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -2631,6 +2631,10 @@ int nand_scan (struct mtd_info *mtd, int maxchips)
 	memcpy(&mtd->oobinfo, this->autooob, sizeof(mtd->oobinfo));
 
 	mtd->owner = THIS_MODULE;
+	
+	/* Check, if we should skip the bad block table scan */
+	if (this->options & NAND_SKIP_BBTSCAN)
+		return 0;
 
 	/* Build bad block table */
 	return this->scan_bbt (mtd);
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index cf52f20c6de2..cf25c7cfd0ba 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -5,7 +5,7 @@
  *                     Steven J. Hill <sjhill@realitydiluted.com>
  *		       Thomas Gleixner <tglx@linutronix.de>
  *
- * $Id: nand.h,v 1.70 2005/01/24 03:07:42 dmarlin Exp $
+ * $Id: nand.h,v 1.71 2005/02/09 12:12:59 gleixner Exp $
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -223,7 +223,8 @@ extern int nand_read_raw (struct mtd_info *mtd, uint8_t *buf, loff_t from, size_
  * This can only work if we have the ecc bytes directly behind the 
  * data bytes. Applies for DOC and AG-AND Renesas HW Reed Solomon generators */
 #define NAND_HWECC_SYNDROME	0x00020000
-
+/* This option skips the bbt scan during initialization. */
+#define NAND_SKIP_BBTSCAN	0x00040000
 
 /* Options set by nand scan */
 /* Nand scan has allocated oob_buf */
-- 
cgit v1.2.3-59-g8ed1b


From f16407d73effc59e1e9f88e45a3dc53cacbb8264 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@cam.org>
Date: Wed, 16 Feb 2005 15:55:03 +0000
Subject: [MTD] Quiet unused variable warning

Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/mtd/map.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h
index 8fc6679aa9b1..115b14a634da 100644
--- a/include/linux/mtd/map.h
+++ b/include/linux/mtd/map.h
@@ -1,6 +1,6 @@
 
 /* Overhauled routines for dealing with different mmap regions of flash */
-/* $Id: map.h,v 1.47 2005/02/08 17:11:15 nico Exp $ */
+/* $Id: map.h,v 1.48 2005/02/16 15:54:59 nico Exp $ */
 
 #ifndef __LINUX_MTD_MAP_H__
 #define __LINUX_MTD_MAP_H__
@@ -418,7 +418,7 @@ extern void simple_map_init(struct map_info *);
 
 
 #define simple_map_init(map) BUG_ON(!map_bankwidth_supported((map)->bankwidth))
-#define map_is_linear(map) (1)
+#define map_is_linear(map) ({ (void)(map); 1; })
 
 #endif /* !CONFIG_MTD_COMPLEX_MAPPINGS */
 
-- 
cgit v1.2.3-59-g8ed1b


From 31fbdf7aa5aac8a2a34f180a25deb157297a10c9 Mon Sep 17 00:00:00 2001
From: "Artem B. Bityuckiy" <dedekind@infradead.org>
Date: Mon, 28 Feb 2005 08:21:09 +0000
Subject: [JFFS2] Fix NOR specific scan BUG

Fix a fairly sad NOR-specific bug: during FS building ic->scan_dents
isn't zero, but jffs2_mark_node_obsolete() might be called, and it tries
to find the ic corresponding to ref - this requires ic->scan_dents = 0.

Signed-off-by: Artem B. Bityuckiy <dedekind@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/jffs2/build.c            |  9 ++++++---
 fs/jffs2/nodemgmt.c         | 11 ++++++-----
 include/linux/jffs2_fs_sb.h |  5 +++--
 3 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index a01dd5fdbb95..3dd5394921c9 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -7,7 +7,7 @@
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
- * $Id: build.c,v 1.69 2004/12/16 20:22:18 dmarlin Exp $
+ * $Id: build.c,v 1.70 2005/02/28 08:21:05 dedekind Exp $
  *
  */
 
@@ -97,14 +97,16 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
 	/* First, scan the medium and build all the inode caches with
 	   lists of physical nodes */
 
-	c->flags |= JFFS2_SB_FLAG_MOUNTING;
+	c->flags |= JFFS2_SB_FLAG_SCANNING;
 	ret = jffs2_scan_medium(c);
+	c->flags &= ~JFFS2_SB_FLAG_SCANNING;
 	if (ret)
 		goto exit;
 
 	D1(printk(KERN_DEBUG "Scanned flash completely\n"));
 	D2(jffs2_dump_block_lists(c));
 
+	c->flags |= JFFS2_SB_FLAG_BUILDING;
 	/* Now scan the directory tree, increasing nlink according to every dirent found. */
 	for_each_inode(i, c, ic) {
 		D1(printk(KERN_DEBUG "Pass 1: ino #%u\n", ic->ino));
@@ -116,7 +118,6 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
 			cond_resched();
 		}
 	}
-	c->flags &= ~JFFS2_SB_FLAG_MOUNTING;
 
 	D1(printk(KERN_DEBUG "Pass 1 complete\n"));
 
@@ -164,6 +165,8 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
 		ic->scan_dents = NULL;
 		cond_resched();
 	}
+	c->flags &= ~JFFS2_SB_FLAG_BUILDING;
+	
 	D1(printk(KERN_DEBUG "Pass 3 complete\n"));
 	D2(jffs2_dump_block_lists(c));
 
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index f9dcac1415ac..456adf020f22 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -7,7 +7,7 @@
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
- * $Id: nodemgmt.c,v 1.118 2005/02/27 23:01:32 dwmw2 Exp $
+ * $Id: nodemgmt.c,v 1.119 2005/02/28 08:21:05 dedekind Exp $
  *
  */
 
@@ -403,7 +403,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 	jeb = &c->blocks[blocknr];
 
 	if (jffs2_can_mark_obsolete(c) && !jffs2_is_readonly(c) &&
-	    !(c->flags & JFFS2_SB_FLAG_MOUNTING)) {
+	    !(c->flags & (JFFS2_SB_FLAG_SCANNING | JFFS2_SB_FLAG_BUILDING))) {
 		/* Hm. This may confuse static lock analysis. If any of the above 
 		   three conditions is false, we're going to return from this 
 		   function without actually obliterating any nodes or freeing
@@ -470,8 +470,8 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 
 	D1(ACCT_PARANOIA_CHECK(jeb));
 
-	if (c->flags & JFFS2_SB_FLAG_MOUNTING) {
-		/* Mount in progress. Don't muck about with the block
+	if (c->flags & JFFS2_SB_FLAG_SCANNING) {
+		/* Flash scanning is in progress. Don't muck about with the block
 		   lists because they're not ready yet, and don't actually
 		   obliterate nodes that look obsolete. If they weren't 
 		   marked obsolete on the flash at the time they _became_
@@ -530,7 +530,8 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref
 
 	spin_unlock(&c->erase_completion_lock);
 
-	if (!jffs2_can_mark_obsolete(c) || jffs2_is_readonly(c)) {
+	if (!jffs2_can_mark_obsolete(c) || jffs2_is_readonly(c) ||
+		(c->flags & JFFS2_SB_FLAG_BUILDING)) {
 		/* We didn't lock the erase_free_sem */
 		return;
 	}
diff --git a/include/linux/jffs2_fs_sb.h b/include/linux/jffs2_fs_sb.h
index 1bd6cdfb7d78..350b82bd6529 100644
--- a/include/linux/jffs2_fs_sb.h
+++ b/include/linux/jffs2_fs_sb.h
@@ -1,4 +1,4 @@
-/* $Id: jffs2_fs_sb.h,v 1.50 2005/02/09 09:23:55 pavlov Exp $ */
+/* $Id: jffs2_fs_sb.h,v 1.51 2005/02/28 08:21:06 dedekind Exp $ */
 
 #ifndef _JFFS2_FS_SB
 #define _JFFS2_FS_SB
@@ -14,7 +14,8 @@
 #include <linux/rwsem.h>
 
 #define JFFS2_SB_FLAG_RO 1
-#define JFFS2_SB_FLAG_MOUNTING 2
+#define JFFS2_SB_FLAG_SCANNING 2 /* Flash scanning is in progress */
+#define JFFS2_SB_FLAG_BUILDING 4 /* File system building is in progress */
 
 struct jffs2_inodirty;
 
-- 
cgit v1.2.3-59-g8ed1b


From 0514cd938009de1d6b3239d98c3cf2a67b620103 Mon Sep 17 00:00:00 2001
From: Ben Dooks <ben@simtec.co.uk>
Date: Mon, 14 Mar 2005 18:27:18 +0000
Subject: [MTD] Fixed signed 1bit bitfield

Signed-off-by: Ben Dooks <ben@simtec.co.uk>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/mtd/flashchip.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/flashchip.h b/include/linux/mtd/flashchip.h
index e778a1ab23c4..675776fa3e27 100644
--- a/include/linux/mtd/flashchip.h
+++ b/include/linux/mtd/flashchip.h
@@ -6,7 +6,7 @@
  *
  * (C) 2000 Red Hat. GPLd.
  *
- * $Id: flashchip.h,v 1.16 2005/02/08 17:11:15 nico Exp $
+ * $Id: flashchip.h,v 1.17 2005/03/14 18:27:15 bjd Exp $
  *
  */
 
@@ -63,8 +63,8 @@ struct flchip {
 	flstate_t state;
 	flstate_t oldstate;
 
-	int write_suspended:1;
-	int erase_suspended:1;
+	unsigned int write_suspended:1;
+	unsigned int erase_suspended:1;
 	unsigned long in_progress_block_addr;
 
 	spinlock_t *mutex;
-- 
cgit v1.2.3-59-g8ed1b


From c927cd3a226bed5cf063cdf04de13cef51144cef Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 15 Mar 2005 19:03:16 +0000
Subject: [MTD] Add the reverse operation of cfi_build_cmd()

This is necessary to fix the broken status check in cfi_cmdset_0001

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/mtd/cfi.h | 65 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 64 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index 76255474a27c..66e0a32efbac 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -1,7 +1,7 @@
 
 /* Common Flash Interface structures 
  * See http://support.intel.com/design/flash/technote/index.htm
- * $Id: cfi.h,v 1.52 2005/02/08 17:11:15 nico Exp $
+ * $Id: cfi.h,v 1.53 2005/03/15 19:03:13 gleixner Exp $
  */
 
 #ifndef __MTD_CFI_H__
@@ -315,6 +315,69 @@ static inline map_word cfi_build_cmd(u_long cmd, struct map_info *map, struct cf
 }
 #define CMD(x)  cfi_build_cmd((x), map, cfi)
 
+
+static inline unsigned char cfi_merge_status(map_word val, struct map_info *map, 
+					   struct cfi_private *cfi)
+{
+	int wordwidth, words_per_bus, chip_mode, chips_per_word;
+	unsigned long onestat, res = 0;
+	int i;
+
+	/* We do it this way to give the compiler a fighting chance 
+	   of optimising away all the crap for 'bankwidth' larger than
+	   an unsigned long, in the common case where that support is
+	   disabled */
+	if (map_bankwidth_is_large(map)) {
+		wordwidth = sizeof(unsigned long);
+		words_per_bus = (map_bankwidth(map)) / wordwidth; // i.e. normally 1
+	} else {
+		wordwidth = map_bankwidth(map);
+		words_per_bus = 1;
+	}
+	
+	chip_mode = map_bankwidth(map) / cfi_interleave(cfi);
+	chips_per_word = wordwidth * cfi_interleave(cfi) / map_bankwidth(map);
+
+	onestat = val.x[0];
+	/* Or all status words together */
+	for (i=1; i < words_per_bus; i++) {
+		onestat |= val.x[i];
+	}
+
+	res = onestat;
+	switch(chips_per_word) {
+	default: BUG();
+#if BITS_PER_LONG >= 64
+	case 8:
+		res |= (onestat >> (chip_mode * 32));
+#endif
+	case 4:
+		res |= (onestat >> (chip_mode * 16));
+	case 2:
+		res |= (onestat >> (chip_mode * 8));
+	case 1:
+		;
+	}
+
+	/* Last, determine what the bit-pattern should be for a single
+	   device, according to chip mode and endianness... */
+	switch (chip_mode) {
+	case 1:
+		break;
+	case 2:
+		res = cfi16_to_cpu(res);
+		break;
+	case 4:
+		res = cfi32_to_cpu(res);
+		break;
+	default: BUG();
+	}
+	return res;
+}
+
+#define MERGESTATUS(x) cfi_merge_status((x), map, cfi)
+
+
 /*
  * Sends a CFI command to a bank of flash for the given geometry.
  *
-- 
cgit v1.2.3-59-g8ed1b


From 963a6fb0a0d336d0513083b7e4b5c3ff9d6d2061 Mon Sep 17 00:00:00 2001
From: Nicolas Pitre <nico@cam.org>
Date: Fri, 1 Apr 2005 02:59:56 +0100
Subject: [MTD] Add reboot notifier to Intel NOR flash driver

to make sure the flash is in array mode whenever we're about to
reboot. This is especially useful to allow "soft" reboot to work
which consists of branching back into the bootloader.

Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/chips/cfi_cmdset_0001.c | 45 +++++++++++++++++++++++++++++++++++--
 include/linux/mtd/mtd.h             |  5 ++++-
 2 files changed, 47 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c
index b482a4e48e48..dc257eb6932f 100644
--- a/drivers/mtd/chips/cfi_cmdset_0001.c
+++ b/drivers/mtd/chips/cfi_cmdset_0001.c
@@ -4,7 +4,7 @@
  *
  * (C) 2000 Red Hat. GPL'd
  *
- * $Id: cfi_cmdset_0001.c,v 1.173 2005/03/30 23:57:30 tpoynor Exp $
+ * $Id: cfi_cmdset_0001.c,v 1.174 2005/04/01 01:59:52 nico Exp $
  *
  * 
  * 10/10/2000	Nicolas Pitre <nico@cam.org>
@@ -29,6 +29,7 @@
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/reboot.h>
 #include <linux/mtd/xip.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/mtd.h>
@@ -66,6 +67,7 @@ static int cfi_intelext_get_user_prot_info (struct mtd_info *,
 #endif
 static int cfi_intelext_suspend (struct mtd_info *);
 static void cfi_intelext_resume (struct mtd_info *);
+static int cfi_intelext_reboot (struct notifier_block *, unsigned long, void *);
 
 static void cfi_intelext_destroy(struct mtd_info *);
 
@@ -333,7 +335,9 @@ struct mtd_info *cfi_cmdset_0001(struct map_info *map, int primary)
 	mtd->resume  = cfi_intelext_resume;
 	mtd->flags   = MTD_CAP_NORFLASH;
 	mtd->name    = map->name;
-	
+
+	mtd->reboot_notifier.notifier_call = cfi_intelext_reboot;
+
 	if (cfi->cfi_mode == CFI_MODE_CFI) {
 		/* 
 		 * It's a real CFI chip, not one for which the probe
@@ -446,6 +450,7 @@ static struct mtd_info *cfi_intelext_setup(struct mtd_info *mtd)
 		goto setup_err;
 
 	__module_get(THIS_MODULE);
+	register_reboot_notifier(&mtd->reboot_notifier);
 	return mtd;
 
  setup_err:
@@ -2301,10 +2306,46 @@ static void cfi_intelext_resume(struct mtd_info *mtd)
 	}
 }
 
+static int cfi_intelext_reset(struct mtd_info *mtd)
+{
+	struct map_info *map = mtd->priv;
+	struct cfi_private *cfi = map->fldrv_priv;
+	int i, ret;
+
+	for (i=0; i < cfi->numchips; i++) {
+		struct flchip *chip = &cfi->chips[i];
+
+		/* force the completion of any ongoing operation
+		   and switch to array mode so any bootloader in 
+		   flash is accessible for soft reboot. */
+		spin_lock(chip->mutex);
+		ret = get_chip(map, chip, chip->start, FL_SYNCING);
+		if (!ret) {
+			map_write(map, CMD(0xff), chip->start);
+			chip->state = FL_READY;
+		}
+		spin_unlock(chip->mutex);
+	}
+
+	return 0;
+}
+
+static int cfi_intelext_reboot(struct notifier_block *nb, unsigned long val,
+			       void *v)
+{
+	struct mtd_info *mtd;
+
+	mtd = container_of(nb, struct mtd_info, reboot_notifier);
+	cfi_intelext_reset(mtd);
+	return NOTIFY_DONE;
+}
+
 static void cfi_intelext_destroy(struct mtd_info *mtd)
 {
 	struct map_info *map = mtd->priv;
 	struct cfi_private *cfi = map->fldrv_priv;
+	cfi_intelext_reset(mtd);
+	unregister_reboot_notifier(&mtd->reboot_notifier);
 	kfree(cfi->cmdset_priv);
 	kfree(cfi->cfiq);
 	kfree(cfi->chips[0].priv);
diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index 3aab1b8729e0..f574cd498816 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -1,5 +1,5 @@
 /* 
- * $Id: mtd.h,v 1.57 2005/02/08 17:11:15 nico Exp $
+ * $Id: mtd.h,v 1.58 2005/04/01 01:59:54 nico Exp $
  *
  * Copyright (C) 1999-2003 David Woodhouse <dwmw2@infradead.org> et al.
  *
@@ -18,6 +18,7 @@
 #include <linux/types.h>
 #include <linux/module.h>
 #include <linux/uio.h>
+#include <linux/notifier.h>
 
 #include <linux/mtd/compatmac.h>
 #include <mtd/mtd-abi.h>
@@ -147,6 +148,8 @@ struct mtd_info {
 	int (*block_isbad) (struct mtd_info *mtd, loff_t ofs);
 	int (*block_markbad) (struct mtd_info *mtd, loff_t ofs);
 
+	struct notifier_block reboot_notifier;  /* default mode before reboot */
+
 	void *priv;
 
 	struct module *owner;
-- 
cgit v1.2.3-59-g8ed1b


From 65c6e0a657012d104fe42be5f01a7b9b451b687c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 11 Apr 2005 11:19:05 +0100
Subject: [MTD] Fix broken user ABI

Move kernel data where it belongs. Previous change broke user abi.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/mtd/mtd.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
index f574cd498816..c50c3f3927d9 100644
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -1,5 +1,5 @@
 /* 
- * $Id: mtd.h,v 1.58 2005/04/01 01:59:54 nico Exp $
+ * $Id: mtd.h,v 1.59 2005/04/11 10:19:02 gleixner Exp $
  *
  * Copyright (C) 1999-2003 David Woodhouse <dwmw2@infradead.org> et al.
  *
@@ -70,7 +70,6 @@ struct mtd_info {
 
 	u_int32_t oobblock;  // Size of OOB blocks (e.g. 512)
 	u_int32_t oobsize;   // Amount of OOB data per block (e.g. 16)
-	u_int32_t oobavail;  // Number of bytes in OOB area available for fs 
 	u_int32_t ecctype;
 	u_int32_t eccsize;
 	
@@ -81,6 +80,7 @@ struct mtd_info {
 
 	// oobinfo is a nand_oobinfo structure, which can be set by iotcl (MEMSETOOBINFO)
 	struct nand_oobinfo oobinfo;
+	u_int32_t oobavail;  // Number of bytes in OOB area available for fs 
 
 	/* Data for variable erase regions. If numeraseregions is zero,
 	 * it means that the whole device has erasesize as given above. 
-- 
cgit v1.2.3-59-g8ed1b


From fff7afd791f6a685b3ddedb8cfb152aed85f3cf8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 19 May 2005 17:18:11 +0100
Subject: [JFFS2] Convert thread start semaphore to completion

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/jffs2/background.c       | 8 ++++----
 include/linux/jffs2_fs_sb.h | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 1be6de27dd81..5548749bacb6 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -7,7 +7,7 @@
  *
  * For licensing information, see the file 'LICENCE' in this directory.
  *
- * $Id: background.c,v 1.50 2004/11/16 20:36:10 dwmw2 Exp $
+ * $Id: background.c,v 1.52 2005/05/19 16:18:08 gleixner Exp $
  *
  */
 
@@ -37,7 +37,7 @@ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c)
 	if (c->gc_task)
 		BUG();
 
-	init_MUTEX_LOCKED(&c->gc_thread_start);
+	init_completion(&c->gc_thread_start);
 	init_completion(&c->gc_thread_exit);
 
 	pid = kernel_thread(jffs2_garbage_collect_thread, c, CLONE_FS|CLONE_FILES);
@@ -48,7 +48,7 @@ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c)
 	} else {
 		/* Wait for it... */
 		D1(printk(KERN_DEBUG "JFFS2: Garbage collect thread is pid %d\n", pid));
-		down(&c->gc_thread_start);
+		wait_for_completion(&c->gc_thread_start);
 	}
  
 	return ret;
@@ -75,7 +75,7 @@ static int jffs2_garbage_collect_thread(void *_c)
 	allow_signal(SIGCONT);
 
 	c->gc_task = current;
-	up(&c->gc_thread_start);
+	complete(&c->gc_thread_start);
 
 	set_user_nice(current, 10);
 
diff --git a/include/linux/jffs2_fs_sb.h b/include/linux/jffs2_fs_sb.h
index 350b82bd6529..1e21546622de 100644
--- a/include/linux/jffs2_fs_sb.h
+++ b/include/linux/jffs2_fs_sb.h
@@ -1,4 +1,4 @@
-/* $Id: jffs2_fs_sb.h,v 1.51 2005/02/28 08:21:06 dedekind Exp $ */
+/* $Id: jffs2_fs_sb.h,v 1.52 2005/05/19 16:12:17 gleixner Exp $ */
 
 #ifndef _JFFS2_FS_SB
 #define _JFFS2_FS_SB
@@ -32,7 +32,7 @@ struct jffs2_sb_info {
 	unsigned int flags;
 
 	struct task_struct *gc_task;	/* GC task struct */
-	struct semaphore gc_thread_start; /* GC thread start mutex */
+	struct completion gc_thread_start; /* GC thread start completion */
 	struct completion gc_thread_exit; /* GC thread exit completion port */
 
 	struct semaphore alloc_sem;	/* Used to protect all the following 
-- 
cgit v1.2.3-59-g8ed1b


From 3a3ab48c68de656736f091c6ed768fa8c110a7ab Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@tglx.tec.linutronix.de>
Date: Tue, 24 May 2005 20:50:18 +0200
Subject: [MTD] Make map_word_ff aware of the flash buswidth

map_word_ff() was setting the mapword to ~0UL regardless of the
buswidth of the mapped flash chip. The read_map functions are
buswidth aware, and therefore the map_word_equal function failed.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/mtd/map.h | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h
index 115b14a634da..dd36d9433f00 100644
--- a/include/linux/mtd/map.h
+++ b/include/linux/mtd/map.h
@@ -1,6 +1,6 @@
 
 /* Overhauled routines for dealing with different mmap regions of flash */
-/* $Id: map.h,v 1.48 2005/02/16 15:54:59 nico Exp $ */
+/* $Id: map.h,v 1.49 2005/05/24 18:45:15 gleixner Exp $ */
 
 #ifndef __LINUX_MTD_MAP_H__
 #define __LINUX_MTD_MAP_H__
@@ -340,13 +340,22 @@ static inline map_word map_word_load_partial(struct map_info *map, map_word orig
 	return orig;
 }
 
+#if BITS_PER_LONG < 64
+#define MAP_FF_LIMIT 4
+#else
+#define MAP_FF_LIMIT 8
+#endif
+
 static inline map_word map_word_ff(struct map_info *map)
 {
 	map_word r;
 	int i;
-
-	for (i=0; i<map_words(map); i++) {
-		r.x[i] = ~0UL;
+	
+	if (map_bank_width(map) < MAP_FF_LIMIT) {
+		r.x[0] = (1 << (8*map_bank_width(map))) - 1;
+	} else {
+		for (i=0; i<map_words(map); i++)
+			r.x[i] = ~0UL;
 	}
 	return r;
 }
-- 
cgit v1.2.3-59-g8ed1b


From b0435695cb3b3e0542c9a3f921f40b216ec37580 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@tglx.tec.linutronix.de>
Date: Wed, 25 May 2005 12:20:29 +0200
Subject: [MTD] map.h Use the correct macro and fix the resulting compiler
 warning

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/mtd/map.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h
index dd36d9433f00..dbd7b9b510d3 100644
--- a/include/linux/mtd/map.h
+++ b/include/linux/mtd/map.h
@@ -1,6 +1,6 @@
 
 /* Overhauled routines for dealing with different mmap regions of flash */
-/* $Id: map.h,v 1.49 2005/05/24 18:45:15 gleixner Exp $ */
+/* $Id: map.h,v 1.51 2005/05/25 10:15:29 gleixner Exp $ */
 
 #ifndef __LINUX_MTD_MAP_H__
 #define __LINUX_MTD_MAP_H__
@@ -351,8 +351,9 @@ static inline map_word map_word_ff(struct map_info *map)
 	map_word r;
 	int i;
 	
-	if (map_bank_width(map) < MAP_FF_LIMIT) {
-		r.x[0] = (1 << (8*map_bank_width(map))) - 1;
+	if (map_bankwidth(map) < MAP_FF_LIMIT) {
+		int bw = 8 * map_bankwidth;
+		r.x[0] = (1 << bw) - 1;
 	} else {
 		for (i=0; i<map_words(map); i++)
 			r.x[i] = ~0UL;
-- 
cgit v1.2.3-59-g8ed1b


From a8aff8ab981cc8ef170e89d85094da722644a7fd Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@tglx.tec.linutronix.de>
Date: Wed, 25 May 2005 12:32:37 +0200
Subject: [MTD] Fix it really

tglx declares himself to be the idiot of the day.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/mtd/map.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h
index dbd7b9b510d3..142963f01d29 100644
--- a/include/linux/mtd/map.h
+++ b/include/linux/mtd/map.h
@@ -1,6 +1,6 @@
 
 /* Overhauled routines for dealing with different mmap regions of flash */
-/* $Id: map.h,v 1.51 2005/05/25 10:15:29 gleixner Exp $ */
+/* $Id: map.h,v 1.52 2005/05/25 10:29:41 gleixner Exp $ */
 
 #ifndef __LINUX_MTD_MAP_H__
 #define __LINUX_MTD_MAP_H__
@@ -352,7 +352,7 @@ static inline map_word map_word_ff(struct map_info *map)
 	int i;
 	
 	if (map_bankwidth(map) < MAP_FF_LIMIT) {
-		int bw = 8 * map_bankwidth;
+		int bw = 8 * map_bankwidth(map);
 		r.x[0] = (1 << bw) - 1;
 	} else {
 		for (i=0; i<map_words(map); i++)
-- 
cgit v1.2.3-59-g8ed1b


From f1f67a9874f1a4bba1adff6d694aa52e5f52ff1a Mon Sep 17 00:00:00 2001
From: "Nicolas S. Dade" <daden@symbol.com>
Date: Tue, 24 May 2005 01:46:34 -0700
Subject: [MTD] NAND: Add Hynix to manufacturer list

Signed-off-by: Nicolas S. Dade <daden@symbol.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/nand/nand_ids.c | 1 +
 include/linux/mtd/nand.h    | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c
index 79945e6ce2b9..4b2bfae6f501 100644
--- a/drivers/mtd/nand/nand_ids.c
+++ b/drivers/mtd/nand/nand_ids.c
@@ -116,6 +116,7 @@ struct nand_manufacturers nand_manuf_ids[] = {
 	{NAND_MFR_NATIONAL, "National"},
 	{NAND_MFR_RENESAS, "Renesas"},
 	{NAND_MFR_STMICRO, "ST Micro"},
+        {NAND_MFR_HYNIX, "Hynix"},
 	{0x0, "Unknown"}
 };
 
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index cf25c7cfd0ba..bee78969cb21 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -384,6 +384,7 @@ struct nand_chip {
 #define NAND_MFR_NATIONAL	0x8f
 #define NAND_MFR_RENESAS	0x07
 #define NAND_MFR_STMICRO	0x20
+#define NAND_MFR_HYNIX          0xad
 
 /**
  * struct nand_flash_dev - NAND Flash Device ID Structure
-- 
cgit v1.2.3-59-g8ed1b


From 024ac44c701d43f5e2d34bd6a35b2813a36e6010 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Sun, 29 May 2005 02:26:31 -0500
Subject: Input: This patch implements compat_ioctl for joydev.

       I've tested it with a Logitech WingMan Rumblepad on an x86-64
       machine, and on an ia32 machine to make sure I didn't break
       anything.

Signed-off-by: Jeremy Fitzhardinge <jeremy@goop.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Vojtech Pavlik <vojtech@suse.cz>
Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/joydev.c   | 116 +++++++++++++++++++++++++++++++++++++----------
 include/linux/joystick.h |  33 ++++++++++----
 2 files changed, 116 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c
index 627d343dfba1..816a585a0e6b 100644
--- a/drivers/input/joydev.c
+++ b/drivers/input/joydev.c
@@ -285,48 +285,33 @@ static unsigned int joydev_poll(struct file *file, poll_table *wait)
 		(POLLIN | POLLRDNORM) : 0) | (list->joydev->exist ? 0 : (POLLHUP | POLLERR));
 }
 
-static int joydev_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg)
+static int joydev_ioctl_common(struct joydev *joydev, unsigned int cmd, void __user *argp)
 {
-	struct joydev_list *list = file->private_data;
-	struct joydev *joydev = list->joydev;
 	struct input_dev *dev = joydev->handle.dev;
-	void __user *argp = (void __user *)arg;
 	int i, j;
 
-	if (!joydev->exist) return -ENODEV;
-
 	switch (cmd) {
 
 		case JS_SET_CAL:
 			return copy_from_user(&joydev->glue.JS_CORR, argp,
-				sizeof(struct JS_DATA_TYPE)) ? -EFAULT : 0;
+				sizeof(joydev->glue.JS_CORR)) ? -EFAULT : 0;
 		case JS_GET_CAL:
 			return copy_to_user(argp, &joydev->glue.JS_CORR,
-				sizeof(struct JS_DATA_TYPE)) ? -EFAULT : 0;
+				sizeof(joydev->glue.JS_CORR)) ? -EFAULT : 0;
 		case JS_SET_TIMEOUT:
-			return get_user(joydev->glue.JS_TIMEOUT, (int __user *) arg);
+			return get_user(joydev->glue.JS_TIMEOUT, (s32 __user *) argp);
 		case JS_GET_TIMEOUT:
-			return put_user(joydev->glue.JS_TIMEOUT, (int __user *) arg);
-		case JS_SET_TIMELIMIT:
-			return get_user(joydev->glue.JS_TIMELIMIT, (long __user *) arg);
-		case JS_GET_TIMELIMIT:
-			return put_user(joydev->glue.JS_TIMELIMIT, (long __user *) arg);
-		case JS_SET_ALL:
-			return copy_from_user(&joydev->glue, argp,
-						sizeof(struct JS_DATA_SAVE_TYPE)) ? -EFAULT : 0;
-		case JS_GET_ALL:
-			return copy_to_user(argp, &joydev->glue,
-						sizeof(struct JS_DATA_SAVE_TYPE)) ? -EFAULT : 0;
+			return put_user(joydev->glue.JS_TIMEOUT, (s32 __user *) argp);
 
 		case JSIOCGVERSION:
-			return put_user(JS_VERSION, (__u32 __user *) arg);
+			return put_user(JS_VERSION, (__u32 __user *) argp);
 		case JSIOCGAXES:
-			return put_user(joydev->nabs, (__u8 __user *) arg);
+			return put_user(joydev->nabs, (__u8 __user *) argp);
 		case JSIOCGBUTTONS:
-			return put_user(joydev->nkey, (__u8 __user *) arg);
+			return put_user(joydev->nkey, (__u8 __user *) argp);
 		case JSIOCSCORR:
 			if (copy_from_user(joydev->corr, argp,
-				      sizeof(struct js_corr) * joydev->nabs))
+				      sizeof(joydev->corr[0]) * joydev->nabs))
 			    return -EFAULT;
 			for (i = 0; i < joydev->nabs; i++) {
 				j = joydev->abspam[i];
@@ -335,7 +320,7 @@ static int joydev_ioctl(struct inode *inode, struct file *file, unsigned int cmd
 			return 0;
 		case JSIOCGCORR:
 			return copy_to_user(argp, joydev->corr,
-						sizeof(struct js_corr) * joydev->nabs) ? -EFAULT : 0;
+						sizeof(joydev->corr[0]) * joydev->nabs) ? -EFAULT : 0;
 		case JSIOCSAXMAP:
 			if (copy_from_user(joydev->abspam, argp, sizeof(__u8) * (ABS_MAX + 1)))
 				return -EFAULT;
@@ -371,6 +356,84 @@ static int joydev_ioctl(struct inode *inode, struct file *file, unsigned int cmd
 	return -EINVAL;
 }
 
+#ifdef CONFIG_COMPAT
+static long joydev_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct joydev_list *list = file->private_data;
+	struct joydev *joydev = list->joydev;
+	void __user *argp = (void __user *)arg;
+	s32 tmp32;
+	struct JS_DATA_SAVE_TYPE_32 ds32;
+	int err;
+
+	if (!joydev->exist) return -ENODEV;
+	switch(cmd) {
+	case JS_SET_TIMELIMIT:
+		err = get_user(tmp32, (s32 __user *) arg);
+		if (err == 0)
+			joydev->glue.JS_TIMELIMIT = tmp32;
+		break;
+	case JS_GET_TIMELIMIT:
+		tmp32 = joydev->glue.JS_TIMELIMIT;
+		err = put_user(tmp32, (s32 __user *) arg);
+		break;
+
+	case JS_SET_ALL:
+		err = copy_from_user(&ds32, argp,
+				     sizeof(ds32)) ? -EFAULT : 0;
+		if (err == 0) {
+			joydev->glue.JS_TIMEOUT    = ds32.JS_TIMEOUT;
+			joydev->glue.BUSY          = ds32.BUSY;
+			joydev->glue.JS_EXPIRETIME = ds32.JS_EXPIRETIME;
+			joydev->glue.JS_TIMELIMIT  = ds32.JS_TIMELIMIT;
+			joydev->glue.JS_SAVE       = ds32.JS_SAVE;
+			joydev->glue.JS_CORR       = ds32.JS_CORR;
+		}
+		break;
+
+	case JS_GET_ALL:
+		ds32.JS_TIMEOUT    = joydev->glue.JS_TIMEOUT;
+		ds32.BUSY          = joydev->glue.BUSY;
+		ds32.JS_EXPIRETIME = joydev->glue.JS_EXPIRETIME;
+		ds32.JS_TIMELIMIT  = joydev->glue.JS_TIMELIMIT;
+		ds32.JS_SAVE       = joydev->glue.JS_SAVE;
+		ds32.JS_CORR       = joydev->glue.JS_CORR;
+
+		err = copy_to_user(argp, &ds32,
+					  sizeof(ds32)) ? -EFAULT : 0;
+		break;
+
+	default:
+		err = joydev_ioctl_common(joydev, cmd, argp);
+	}
+	return err;
+}
+#endif /* CONFIG_COMPAT */
+
+static int joydev_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct joydev_list *list = file->private_data;
+	struct joydev *joydev = list->joydev;
+	void __user *argp = (void __user *)arg;
+
+	if (!joydev->exist) return -ENODEV;
+
+	switch(cmd) {
+		case JS_SET_TIMELIMIT:
+			return get_user(joydev->glue.JS_TIMELIMIT, (long __user *) arg);
+		case JS_GET_TIMELIMIT:
+			return put_user(joydev->glue.JS_TIMELIMIT, (long __user *) arg);
+		case JS_SET_ALL:
+			return copy_from_user(&joydev->glue, argp,
+						sizeof(joydev->glue)) ? -EFAULT : 0;
+		case JS_GET_ALL:
+			return copy_to_user(argp, &joydev->glue,
+						sizeof(joydev->glue)) ? -EFAULT : 0;
+		default:
+			return joydev_ioctl_common(joydev, cmd, argp);
+	}
+}
+
 static struct file_operations joydev_fops = {
 	.owner =	THIS_MODULE,
 	.read =		joydev_read,
@@ -379,6 +442,9 @@ static struct file_operations joydev_fops = {
 	.open =		joydev_open,
 	.release =	joydev_release,
 	.ioctl =	joydev_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl =	joydev_compat_ioctl,
+#endif
 	.fasync =	joydev_fasync,
 };
 
diff --git a/include/linux/joystick.h b/include/linux/joystick.h
index b7e0ab622cd7..06b9af77eb7f 100644
--- a/include/linux/joystick.h
+++ b/include/linux/joystick.h
@@ -111,18 +111,35 @@ struct js_corr {
 #define JS_SET_ALL		8
 
 struct JS_DATA_TYPE {
-	int buttons;
-	int x;
-	int y;
+	__s32 buttons;
+	__s32 x;
+	__s32 y;
 };
 
-struct JS_DATA_SAVE_TYPE {
-	int JS_TIMEOUT;
-	int BUSY;
-	long JS_EXPIRETIME;
-	long JS_TIMELIMIT;
+struct JS_DATA_SAVE_TYPE_32 {
+	__s32 JS_TIMEOUT;
+	__s32 BUSY;
+	__s32 JS_EXPIRETIME;
+	__s32 JS_TIMELIMIT;
 	struct JS_DATA_TYPE JS_SAVE;
 	struct JS_DATA_TYPE JS_CORR;
 };
 
+struct JS_DATA_SAVE_TYPE_64 {
+	__s32 JS_TIMEOUT;
+	__s32 BUSY;
+	__s64 JS_EXPIRETIME;
+	__s64 JS_TIMELIMIT;
+	struct JS_DATA_TYPE JS_SAVE;
+	struct JS_DATA_TYPE JS_CORR;
+};
+
+#if BITS_PER_LONG == 64
+#define JS_DATA_SAVE_TYPE JS_DATA_SAVE_TYPE_64
+#elif BITS_PER_LONG == 32
+#define JS_DATA_SAVE_TYPE JS_DATA_SAVE_TYPE_32
+#else
+#error Unexpected BITS_PER_LONG
+#endif
+
 #endif /* _LINUX_JOYSTICK_H */
-- 
cgit v1.2.3-59-g8ed1b


From 0fbf87caf70acec0c435233fbc39c7bd0aca3ca6 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor_core@ameritech.net>
Date: Sun, 29 May 2005 02:29:25 -0500
Subject: Input: add semaphore and user count to input_dev structure;
 serialize open and close calls and ensure that device's open and close
 methods are only called when first user opens it or last user closes
 it.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/input.c | 33 ++++++++++++++++++++++++++++-----
 include/linux/input.h |  4 ++++
 2 files changed, 32 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/input.c b/drivers/input/input.c
index 3385dd03abfc..1885f369e3e2 100644
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -219,10 +219,24 @@ void input_release_device(struct input_handle *handle)
 
 int input_open_device(struct input_handle *handle)
 {
+	struct input_dev *dev = handle->dev;
+	int err;
+
+	err = down_interruptible(&dev->sem);
+	if (err)
+		return err;
+
 	handle->open++;
-	if (handle->dev->open)
-		return handle->dev->open(handle->dev);
-	return 0;
+
+	if (!dev->users++ && dev->open)
+		err = dev->open(dev);
+
+	if (err)
+		handle->open--;
+
+	up(&dev->sem);
+
+	return err;
 }
 
 int input_flush_device(struct input_handle* handle, struct file* file)
@@ -235,10 +249,17 @@ int input_flush_device(struct input_handle* handle, struct file* file)
 
 void input_close_device(struct input_handle *handle)
 {
+	struct input_dev *dev = handle->dev;
+
 	input_release_device(handle);
-	if (handle->dev->close)
-		handle->dev->close(handle->dev);
+
+	down(&dev->sem);
+
+	if (!--dev->users && dev->close)
+		dev->close(dev);
 	handle->open--;
+
+	up(&dev->sem);
 }
 
 static void input_link_handle(struct input_handle *handle)
@@ -415,6 +436,8 @@ void input_register_device(struct input_dev *dev)
 
 	set_bit(EV_SYN, dev->evbit);
 
+	init_MUTEX(&dev->sem);
+
 	/*
 	 * If delay and period are pre-set by the driver, then autorepeating
 	 * is handled by the driver itself and we don't do it in input.c.
diff --git a/include/linux/input.h b/include/linux/input.h
index 72731d7d189e..43e8ecec602b 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -859,6 +859,10 @@ struct input_dev {
 	int (*erase_effect)(struct input_dev *dev, int effect_id);
 
 	struct input_handle *grab;
+
+	struct semaphore sem;	/* serializes open and close operations */
+	unsigned int users;
+
 	struct device *dev;
 
 	struct list_head	h_list;
-- 
cgit v1.2.3-59-g8ed1b


From c611763d048990de5cdf848d97af6392f8fa7430 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor_core@ameritech.net>
Date: Wed, 1 Jun 2005 02:39:51 -0500
Subject: Input: add ps2_drain() to libps2 to allow reading and discarding
 a given number of bytes from device. Change ps2_command to allow using
 0 as command ID and actually pass it to the device instead of working
 as a drain.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/mouse/alps.c   |  3 +--
 drivers/input/serio/libps2.c | 46 ++++++++++++++++++++++++++++++++++++--------
 include/linux/libps2.h       |  1 +
 3 files changed, 40 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/mouse/alps.c b/drivers/input/mouse/alps.c
index 2679a165d399..ffdc82313192 100644
--- a/drivers/input/mouse/alps.c
+++ b/drivers/input/mouse/alps.c
@@ -270,7 +270,6 @@ static struct alps_model_info *alps_get_model(struct psmouse *psmouse, int *vers
 static int alps_passthrough_mode(struct psmouse *psmouse, int enable)
 {
 	struct ps2dev *ps2dev = &psmouse->ps2dev;
-	unsigned char param[3];
 	int cmd = enable ? PSMOUSE_CMD_SETSCALE21 : PSMOUSE_CMD_SETSCALE11;
 
 	if (ps2_command(ps2dev, NULL, cmd) ||
@@ -280,7 +279,7 @@ static int alps_passthrough_mode(struct psmouse *psmouse, int enable)
 		return -1;
 
 	/* we may get 3 more bytes, just ignore them */
-	ps2_command(ps2dev, param, 0x0300);
+	ps2_drain(ps2dev, 3, 100);
 
 	return 0;
 }
diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c
index c978657068c5..92b92ee03791 100644
--- a/drivers/input/serio/libps2.c
+++ b/drivers/input/serio/libps2.c
@@ -29,6 +29,7 @@ MODULE_LICENSE("GPL");
 
 EXPORT_SYMBOL(ps2_init);
 EXPORT_SYMBOL(ps2_sendbyte);
+EXPORT_SYMBOL(ps2_drain);
 EXPORT_SYMBOL(ps2_command);
 EXPORT_SYMBOL(ps2_schedule_command);
 EXPORT_SYMBOL(ps2_handle_ack);
@@ -45,11 +46,11 @@ struct ps2work {
 
 
 /*
- * ps2_sendbyte() sends a byte to the mouse, and waits for acknowledge.
- * It doesn't handle retransmission, though it could - because when there would
- * be need for retransmissions, the mouse has to be replaced anyway.
+ * ps2_sendbyte() sends a byte to the device and waits for acknowledge.
+ * It doesn't handle retransmission, though it could - because if there
+ * is a need for retransmissions device has to be replaced anyway.
  *
- * ps2_sendbyte() can only be called from a process context
+ * ps2_sendbyte() can only be called from a process context.
  */
 
 int ps2_sendbyte(struct ps2dev *ps2dev, unsigned char byte, int timeout)
@@ -71,6 +72,31 @@ int ps2_sendbyte(struct ps2dev *ps2dev, unsigned char byte, int timeout)
 	return -ps2dev->nak;
 }
 
+/*
+ * ps2_drain() waits for device to transmit requested number of bytes
+ * and discards them.
+ */
+
+void ps2_drain(struct ps2dev *ps2dev, int maxbytes, int timeout)
+{
+	if (maxbytes > sizeof(ps2dev->cmdbuf)) {
+		WARN_ON(1);
+		maxbytes = sizeof(ps2dev->cmdbuf);
+	}
+
+	down(&ps2dev->cmd_sem);
+
+	serio_pause_rx(ps2dev->serio);
+	ps2dev->flags = PS2_FLAG_CMD;
+	ps2dev->cmdcnt = maxbytes;
+	serio_continue_rx(ps2dev->serio);
+
+	wait_event_timeout(ps2dev->wait,
+			   !(ps2dev->flags & PS2_FLAG_CMD),
+			   msecs_to_jiffies(timeout));
+	up(&ps2dev->cmd_sem);
+}
+
 /*
  * ps2_command() sends a command and its parameters to the mouse,
  * then waits for the response and puts it in the param array.
@@ -86,6 +112,11 @@ int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command)
 	int rc = -1;
 	int i;
 
+	if (receive > sizeof(ps2dev->cmdbuf)) {
+		WARN_ON(1);
+		return -1;
+	}
+
 	down(&ps2dev->cmd_sem);
 
 	serio_pause_rx(ps2dev->serio);
@@ -101,10 +132,9 @@ int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command)
 	 * ACKing the reset command, and so it can take a long
 	 * time before the ACK arrrives.
 	 */
-	if (command & 0xff)
-		if (ps2_sendbyte(ps2dev, command & 0xff,
-			command == PS2_CMD_RESET_BAT ? 1000 : 200))
-			goto out;
+	if (ps2_sendbyte(ps2dev, command & 0xff,
+			 command == PS2_CMD_RESET_BAT ? 1000 : 200))
+		goto out;
 
 	for (i = 0; i < send; i++)
 		if (ps2_sendbyte(ps2dev, param[i], 200))
diff --git a/include/linux/libps2.h b/include/linux/libps2.h
index 923bdbc6d9e4..a710bddda4eb 100644
--- a/include/linux/libps2.h
+++ b/include/linux/libps2.h
@@ -41,6 +41,7 @@ struct ps2dev {
 
 void ps2_init(struct ps2dev *ps2dev, struct serio *serio);
 int ps2_sendbyte(struct ps2dev *ps2dev, unsigned char byte, int timeout);
+void ps2_drain(struct ps2dev *ps2dev, int maxbytes, int timeout);
 int ps2_command(struct ps2dev *ps2dev, unsigned char *param, int command);
 int ps2_schedule_command(struct ps2dev *ps2dev, unsigned char *param, int command);
 int ps2_handle_ack(struct ps2dev *ps2dev, unsigned char data);
-- 
cgit v1.2.3-59-g8ed1b


From dbf4ccd6043e58ed32fbf253fb3f0a9991e4c13a Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor_core@ameritech.net>
Date: Wed, 1 Jun 2005 02:40:01 -0500
Subject: Input: psmouse - export protocol as a sysfs per-device attribute
 to allow easy switching at run-time.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/mouse/psmouse-base.c | 291 +++++++++++++++++++++++++++++++------
 drivers/input/mouse/psmouse.h      |   1 +
 drivers/input/serio/serio.c        |  18 ++-
 include/linux/serio.h              |   6 +
 4 files changed, 266 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/mouse/psmouse-base.c b/drivers/input/mouse/psmouse-base.c
index 259e6b70544b..19785a6c5abd 100644
--- a/drivers/input/mouse/psmouse-base.c
+++ b/drivers/input/mouse/psmouse-base.c
@@ -32,15 +32,14 @@ MODULE_AUTHOR("Vojtech Pavlik <vojtech@suse.cz>");
 MODULE_DESCRIPTION(DRIVER_DESC);
 MODULE_LICENSE("GPL");
 
-static unsigned int psmouse_max_proto = -1U;
+static unsigned int psmouse_max_proto = PSMOUSE_AUTO;
 static int psmouse_set_maxproto(const char *val, struct kernel_param *kp);
 static int psmouse_get_maxproto(char *buffer, struct kernel_param *kp);
-static char *psmouse_proto_abbrev[] = { NULL, "bare", NULL, NULL, NULL, "imps", "exps", NULL, NULL, "lifebook" };
 #define param_check_proto_abbrev(name, p)	__param_check(name, p, unsigned int)
 #define param_set_proto_abbrev			psmouse_set_maxproto
 #define param_get_proto_abbrev			psmouse_get_maxproto
 module_param_named(proto, psmouse_max_proto, proto_abbrev, 0644);
-MODULE_PARM_DESC(proto, "Highest protocol extension to probe (bare, imps, exps, lifebook, any). Useful for KVM switches.");
+MODULE_PARM_DESC(proto, "Highest protocol extension to probe (bare, imps, exps, any). Useful for KVM switches.");
 
 static unsigned int psmouse_resolution = 200;
 module_param_named(resolution, psmouse_resolution, uint, 0644);
@@ -58,6 +57,7 @@ static unsigned int psmouse_resetafter;
 module_param_named(resetafter, psmouse_resetafter, uint, 0644);
 MODULE_PARM_DESC(resetafter, "Reset device after so many bad packets (0 = never).");
 
+PSMOUSE_DEFINE_ATTR(protocol);
 PSMOUSE_DEFINE_ATTR(rate);
 PSMOUSE_DEFINE_ATTR(resolution);
 PSMOUSE_DEFINE_ATTR(resetafter);
@@ -77,7 +77,14 @@ __obsolete_setup("psmouse_rate=");
  */
 static DECLARE_MUTEX(psmouse_sem);
 
-static char *psmouse_protocols[] = { "None", "PS/2", "PS2++", "ThinkPS/2", "GenPS/2", "ImPS/2", "ImExPS/2", "SynPS/2", "AlpsPS/2", "LBPS/2" };
+struct psmouse_protocol {
+	enum psmouse_type type;
+	char *name;
+	char *alias;
+	int maxproto;
+	int (*detect)(struct psmouse *, int);
+	int (*init)(struct psmouse *);
+};
 
 /*
  * psmouse_process_byte() analyzes the PS/2 data stream and reports
@@ -417,12 +424,15 @@ static int thinking_detect(struct psmouse *psmouse, int set_properties)
  */
 static int ps2bare_detect(struct psmouse *psmouse, int set_properties)
 {
-	if (!psmouse->vendor) psmouse->vendor = "Generic";
-	if (!psmouse->name) psmouse->name = "Mouse";
+	if (set_properties) {
+		if (!psmouse->vendor) psmouse->vendor = "Generic";
+		if (!psmouse->name) psmouse->name = "Mouse";
+	}
 
 	return 0;
 }
 
+
 /*
  * psmouse_extensions() probes for any extensions to the basic PS/2 protocol
  * the mouse may have.
@@ -437,9 +447,7 @@ static int psmouse_extensions(struct psmouse *psmouse,
  * We always check for lifebook because it does not disturb mouse
  * (it only checks DMI information).
  */
-	if (lifebook_detect(psmouse, set_properties) == 0 ||
-	    max_proto == PSMOUSE_LIFEBOOK) {
-
+	if (lifebook_detect(psmouse, set_properties) == 0) {
 		if (max_proto > PSMOUSE_IMEX) {
 			if (!set_properties || lifebook_init(psmouse) == 0)
 				return PSMOUSE_LIFEBOOK;
@@ -529,6 +537,103 @@ static int psmouse_extensions(struct psmouse *psmouse,
 	return PSMOUSE_PS2;
 }
 
+static struct psmouse_protocol psmouse_protocols[] = {
+	{
+		.type		= PSMOUSE_PS2,
+		.name		= "PS/2",
+		.alias		= "bare",
+		.maxproto	= 1,
+		.detect		= ps2bare_detect,
+	},
+	{
+		.type		= PSMOUSE_PS2PP,
+		.name		= "PS2++",
+		.alias		= "logitech",
+		.detect		= ps2pp_init,
+	},
+	{
+		.type		= PSMOUSE_THINKPS,
+		.name		= "ThinkPS/2",
+		.alias		= "thinkps",
+		.detect		= thinking_detect,
+	},
+	{
+		.type		= PSMOUSE_GENPS,
+		.name		= "GenPS/2",
+		.alias		= "genius",
+		.detect		= genius_detect,
+	},
+	{
+		.type		= PSMOUSE_IMPS,
+		.name		= "ImPS/2",
+		.alias		= "imps",
+		.maxproto	= 1,
+		.detect		= intellimouse_detect,
+	},
+	{
+		.type		= PSMOUSE_IMEX,
+		.name		= "ImExPS/2",
+		.alias		= "exps",
+		.maxproto	= 1,
+		.detect		= im_explorer_detect,
+	},
+	{
+		.type		= PSMOUSE_SYNAPTICS,
+		.name		= "SynPS/2",
+		.alias		= "synaptics",
+		.detect		= synaptics_detect,
+		.init		= synaptics_init,
+	},
+	{
+		.type		= PSMOUSE_ALPS,
+		.name		= "AlpsPS/2",
+		.alias		= "alps",
+		.detect		= alps_detect,
+		.init		= alps_init,
+	},
+	{
+		.type		= PSMOUSE_LIFEBOOK,
+		.name		= "LBPS/2",
+		.alias		= "lifebook",
+		.init		= lifebook_init,
+	},
+	{
+		.type		= PSMOUSE_AUTO,
+		.name		= "auto",
+		.alias		= "any",
+		.maxproto	= 1,
+	},
+};
+
+static struct psmouse_protocol *psmouse_protocol_by_type(enum psmouse_type type)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(psmouse_protocols); i++)
+		if (psmouse_protocols[i].type == type)
+			return &psmouse_protocols[i];
+
+	WARN_ON(1);
+	return &psmouse_protocols[0];
+}
+
+static struct psmouse_protocol *psmouse_protocol_by_name(const char *name, size_t len)
+{
+	struct psmouse_protocol *p;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(psmouse_protocols); i++) {
+		p = &psmouse_protocols[i];
+
+		if ((strlen(p->name) == len && !strncmp(p->name, name, len)) ||
+		    (strlen(p->alias) == len && !strncmp(p->alias, name, len)))
+			return &psmouse_protocols[i];
+	}
+
+	return NULL;
+}
+
+
 /*
  * psmouse_probe() probes for a PS/2 mouse.
  */
@@ -680,6 +785,7 @@ static void psmouse_disconnect(struct serio *serio)
 
 	psmouse = serio_get_drvdata(serio);
 
+	device_remove_file(&serio->dev, &psmouse_attr_protocol);
 	device_remove_file(&serio->dev, &psmouse_attr_rate);
 	device_remove_file(&serio->dev, &psmouse_attr_resolution);
 	device_remove_file(&serio->dev, &psmouse_attr_resetafter);
@@ -712,6 +818,49 @@ static void psmouse_disconnect(struct serio *serio)
 	up(&psmouse_sem);
 }
 
+static int psmouse_switch_protocol(struct psmouse *psmouse, struct psmouse_protocol *proto)
+{
+	memset(&psmouse->dev, 0, sizeof(struct input_dev));
+
+	init_input_dev(&psmouse->dev);
+
+	psmouse->dev.private = psmouse;
+	psmouse->dev.dev = &psmouse->ps2dev.serio->dev;
+
+	psmouse->dev.evbit[0] = BIT(EV_KEY) | BIT(EV_REL);
+	psmouse->dev.keybit[LONG(BTN_MOUSE)] = BIT(BTN_LEFT) | BIT(BTN_MIDDLE) | BIT(BTN_RIGHT);
+	psmouse->dev.relbit[0] = BIT(REL_X) | BIT(REL_Y);
+
+	psmouse->set_rate = psmouse_set_rate;
+	psmouse->set_resolution = psmouse_set_resolution;
+	psmouse->protocol_handler = psmouse_process_byte;
+	psmouse->pktsize = 3;
+
+	if (proto && (proto->detect || proto->init)) {
+		if (proto->detect && proto->detect(psmouse, 1) < 0)
+			return -1;
+
+		if (proto->init && proto->init(psmouse) < 0)
+			return -1;
+
+		psmouse->type = proto->type;
+	}
+	else
+		psmouse->type = psmouse_extensions(psmouse, psmouse_max_proto, 1);
+
+	sprintf(psmouse->devname, "%s %s %s",
+		psmouse_protocol_by_type(psmouse->type)->name, psmouse->vendor, psmouse->name);
+
+	psmouse->dev.name = psmouse->devname;
+	psmouse->dev.phys = psmouse->phys;
+	psmouse->dev.id.bustype = BUS_I8042;
+	psmouse->dev.id.vendor = 0x0002;
+	psmouse->dev.id.product = psmouse->type;
+	psmouse->dev.id.version = psmouse->model;
+
+	return 0;
+}
+
 /*
  * psmouse_connect() is a callback from the serio module when
  * an unhandled serio port is found.
@@ -739,11 +888,7 @@ static int psmouse_connect(struct serio *serio, struct serio_driver *drv)
 
 	ps2_init(&psmouse->ps2dev, serio);
 	sprintf(psmouse->phys, "%s/input0", serio->phys);
-	psmouse->dev.evbit[0] = BIT(EV_KEY) | BIT(EV_REL);
-	psmouse->dev.keybit[LONG(BTN_MOUSE)] = BIT(BTN_LEFT) | BIT(BTN_MIDDLE) | BIT(BTN_RIGHT);
-	psmouse->dev.relbit[0] = BIT(REL_X) | BIT(REL_Y);
-	psmouse->dev.private = psmouse;
-	psmouse->dev.dev = &serio->dev;
+
 	psmouse_set_state(psmouse, PSMOUSE_INITIALIZING);
 
 	serio_set_drvdata(serio, psmouse);
@@ -767,25 +912,10 @@ static int psmouse_connect(struct serio *serio, struct serio_driver *drv)
 	psmouse->resolution = psmouse_resolution;
 	psmouse->resetafter = psmouse_resetafter;
 	psmouse->smartscroll = psmouse_smartscroll;
-	psmouse->set_rate = psmouse_set_rate;
-	psmouse->set_resolution = psmouse_set_resolution;
-	psmouse->protocol_handler = psmouse_process_byte;
-	psmouse->pktsize = 3;
-
-	psmouse->type = psmouse_extensions(psmouse, psmouse_max_proto, 1);
-
-	sprintf(psmouse->devname, "%s %s %s",
-		psmouse_protocols[psmouse->type], psmouse->vendor, psmouse->name);
 
-	psmouse->dev.name = psmouse->devname;
-	psmouse->dev.phys = psmouse->phys;
-	psmouse->dev.id.bustype = BUS_I8042;
-	psmouse->dev.id.vendor = 0x0002;
-	psmouse->dev.id.product = psmouse->type;
-	psmouse->dev.id.version = psmouse->model;
+	psmouse_switch_protocol(psmouse, NULL);
 
 	input_register_device(&psmouse->dev);
-
 	printk(KERN_INFO "input: %s on %s\n", psmouse->devname, serio->phys);
 
 	psmouse_set_state(psmouse, PSMOUSE_CMD_MODE);
@@ -795,6 +925,7 @@ static int psmouse_connect(struct serio *serio, struct serio_driver *drv)
 	if (parent && parent->pt_activate)
 		parent->pt_activate(parent);
 
+	device_create_file(&serio->dev, &psmouse_attr_protocol);
 	device_create_file(&serio->dev, &psmouse_attr_rate);
 	device_create_file(&serio->dev, &psmouse_attr_resolution);
 	device_create_file(&serio->dev, &psmouse_attr_resetafter);
@@ -946,11 +1077,14 @@ ssize_t psmouse_attr_set_helper(struct device *dev, const char *buf, size_t coun
 		parent = serio_get_drvdata(serio->parent);
 		psmouse_deactivate(parent);
 	}
+
 	psmouse_deactivate(psmouse);
 
 	retval = handler(psmouse, buf, count);
 
-	psmouse_activate(psmouse);
+	if (retval != -ENODEV)
+		psmouse_activate(psmouse);
+
 	if (parent)
 		psmouse_activate(parent);
 
@@ -961,6 +1095,75 @@ ssize_t psmouse_attr_set_helper(struct device *dev, const char *buf, size_t coun
 	return retval;
 }
 
+static ssize_t psmouse_attr_show_protocol(struct psmouse *psmouse, char *buf)
+{
+	return sprintf(buf, "%s\n", psmouse_protocol_by_type(psmouse->type)->name);
+}
+
+static ssize_t psmouse_attr_set_protocol(struct psmouse *psmouse, const char *buf, size_t count)
+{
+	struct serio *serio = psmouse->ps2dev.serio;
+	struct psmouse *parent = NULL;
+	struct psmouse_protocol *proto;
+	int retry = 0;
+
+	if (!(proto = psmouse_protocol_by_name(buf, count)))
+		return -EINVAL;
+
+	if (psmouse->type == proto->type)
+		return count;
+
+	while (serio->child) {
+		if (++retry > 3) {
+			printk(KERN_WARNING "psmouse: failed to destroy child port, protocol change aborted.\n");
+			return -EIO;
+		}
+
+		up(&psmouse_sem);
+		serio_unpin_driver(serio);
+		serio_unregister_child_port(serio);
+		serio_pin_driver_uninterruptible(serio);
+		down(&psmouse_sem);
+
+		if (serio->drv != &psmouse_drv)
+			return -ENODEV;
+
+		if (psmouse->type == proto->type)
+			return count; /* switched by other thread */
+	}
+
+	if (serio->parent && serio->id.type == SERIO_PS_PSTHRU) {
+		parent = serio_get_drvdata(serio->parent);
+		if (parent->pt_deactivate)
+			parent->pt_deactivate(parent);
+	}
+
+	if (psmouse->disconnect)
+		psmouse->disconnect(psmouse);
+
+	psmouse_set_state(psmouse, PSMOUSE_IGNORE);
+	input_unregister_device(&psmouse->dev);
+
+	psmouse_set_state(psmouse, PSMOUSE_INITIALIZING);
+
+	if (psmouse_switch_protocol(psmouse, proto) < 0) {
+		psmouse_reset(psmouse);
+		/* default to PSMOUSE_PS2 */
+		psmouse_switch_protocol(psmouse, &psmouse_protocols[0]);
+	}
+
+	psmouse_initialize(psmouse);
+	psmouse_set_state(psmouse, PSMOUSE_CMD_MODE);
+
+	input_register_device(&psmouse->dev);
+	printk(KERN_INFO "input: %s on %s\n", psmouse->devname, serio->phys);
+
+	if (parent && parent->pt_activate)
+		parent->pt_activate(parent);
+
+	return count;
+}
+
 static ssize_t psmouse_attr_show_rate(struct psmouse *psmouse, char *buf)
 {
 	return sprintf(buf, "%d\n", psmouse->rate);
@@ -1017,34 +1220,26 @@ static ssize_t psmouse_attr_set_resetafter(struct psmouse *psmouse, const char *
 
 static int psmouse_set_maxproto(const char *val, struct kernel_param *kp)
 {
-	int i;
+	struct psmouse_protocol *proto;
 
 	if (!val)
 		return -EINVAL;
 
-	if (!strncmp(val, "any", 3)) {
-		*((unsigned int *)kp->arg) = -1U;
-		return 0;
-	}
+	proto = psmouse_protocol_by_name(val, strlen(val));
 
-	for (i = 0; i < ARRAY_SIZE(psmouse_proto_abbrev); i++) {
-		if (!psmouse_proto_abbrev[i])
-			continue;
+	if (!proto || !proto->maxproto)
+		return -EINVAL;
 
-		if (!strncmp(val, psmouse_proto_abbrev[i], strlen(psmouse_proto_abbrev[i]))) {
-			*((unsigned int *)kp->arg) = i;
-			return 0;
-		}
-	}
+	*((unsigned int *)kp->arg) = proto->type;
 
-	return -EINVAL;					\
+	return 0;					\
 }
 
 static int psmouse_get_maxproto(char *buffer, struct kernel_param *kp)
 {
-	return sprintf(buffer, "%s\n",
-			psmouse_max_proto < ARRAY_SIZE(psmouse_proto_abbrev) ?
-				psmouse_proto_abbrev[psmouse_max_proto] : "any");
+	int type = *((unsigned int *)kp->arg);
+
+	return sprintf(buffer, "%s\n", psmouse_protocol_by_type(type)->name);
 }
 
 static int __init psmouse_init(void)
diff --git a/drivers/input/mouse/psmouse.h b/drivers/input/mouse/psmouse.h
index 4848be627a6f..dc8e9ae07f32 100644
--- a/drivers/input/mouse/psmouse.h
+++ b/drivers/input/mouse/psmouse.h
@@ -78,6 +78,7 @@ enum psmouse_type {
 	PSMOUSE_SYNAPTICS,
 	PSMOUSE_ALPS,
 	PSMOUSE_LIFEBOOK,
+	PSMOUSE_AUTO		/* This one should always be last */
 };
 
 int psmouse_sliced_command(struct psmouse *psmouse, unsigned char command);
diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c
index b82815a0b65b..615bf62ad468 100644
--- a/drivers/input/serio/serio.c
+++ b/drivers/input/serio/serio.c
@@ -42,6 +42,7 @@ MODULE_LICENSE("GPL");
 EXPORT_SYMBOL(serio_interrupt);
 EXPORT_SYMBOL(__serio_register_port);
 EXPORT_SYMBOL(serio_unregister_port);
+EXPORT_SYMBOL(serio_unregister_child_port);
 EXPORT_SYMBOL(__serio_unregister_port_delayed);
 EXPORT_SYMBOL(__serio_register_driver);
 EXPORT_SYMBOL(serio_unregister_driver);
@@ -179,12 +180,12 @@ static void serio_queue_event(void *object, struct module *owner,
 	spin_lock_irqsave(&serio_event_lock, flags);
 
 	/*
- 	 * Scan event list for the other events for the same serio port,
+	 * Scan event list for the other events for the same serio port,
 	 * starting with the most recent one. If event is the same we
 	 * do not need add new one. If event is of different type we
 	 * need to add this event and should not look further because
 	 * we need to preseve sequence of distinct events.
- 	 */
+	 */
 	list_for_each_entry_reverse(event, &serio_event_list, node) {
 		if (event->object == object) {
 			if (event->type == event_type)
@@ -653,6 +654,19 @@ void serio_unregister_port(struct serio *serio)
 	up(&serio_sem);
 }
 
+/*
+ * Safely unregisters child port if one is present.
+ */
+void serio_unregister_child_port(struct serio *serio)
+{
+	down(&serio_sem);
+	if (serio->child) {
+		serio_disconnect_port(serio->child);
+		serio_destroy_port(serio->child);
+	}
+	up(&serio_sem);
+}
+
 /*
  * Submits register request to kseriod for subsequent execution.
  * Can be used when it is not obvious whether the serio_sem is
diff --git a/include/linux/serio.h b/include/linux/serio.h
index a2d3b9ae06f4..aa4d6493a034 100644
--- a/include/linux/serio.h
+++ b/include/linux/serio.h
@@ -83,6 +83,7 @@ static inline void serio_register_port(struct serio *serio)
 }
 
 void serio_unregister_port(struct serio *serio);
+void serio_unregister_child_port(struct serio *serio);
 void __serio_unregister_port_delayed(struct serio *serio, struct module *owner);
 static inline void serio_unregister_port_delayed(struct serio *serio)
 {
@@ -153,6 +154,11 @@ static inline int serio_pin_driver(struct serio *serio)
 	return down_interruptible(&serio->drv_sem);
 }
 
+static inline void serio_pin_driver_uninterruptible(struct serio *serio)
+{
+	down(&serio->drv_sem);
+}
+
 static inline void serio_unpin_driver(struct serio *serio)
 {
 	up(&serio->drv_sem);
-- 
cgit v1.2.3-59-g8ed1b


From 986a80d5c154808cc78170584670324a22fd8219 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <juhl-lkml@dif.dk>
Date: Thu, 16 Jun 2005 15:14:00 -0700
Subject: [PATCH] avoid signed vs unsigned comparison in efi_range_is_wc()

warning when building with gcc -W :

 include/linux/efi.h: In function `efi_range_is_wc':
 include/linux/efi.h:320: warning: comparison between signed and unsigned

It looks to me like a sufficiently large 'len' passed in could cause the
loop to never end. Isn't it safer to make 'i' an unsigned long as well?
Like this little patch below (which of course also kills the warning):

Signed-off-by: Jesper Juhl <juhl-lkml@dif.dk>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 include/linux/efi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/efi.h b/include/linux/efi.h
index 047e7222df7a..73781ec165b4 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -315,7 +315,7 @@ extern struct efi_memory_map memmap;
  */
 static inline int efi_range_is_wc(unsigned long start, unsigned long len)
 {
-	int i;
+	unsigned long i;
 
 	for (i = 0; i < len; i += (1UL << EFI_PAGE_SHIFT)) {
 		unsigned long paddr = __pa(start + i);
-- 
cgit v1.2.3-59-g8ed1b


From b3d5496ea5915fa4848fe307af9f7097f312e932 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Sat, 2 Apr 2005 20:31:02 +0200
Subject: [PATCH] I2C: Kill address ranges in non-sensors i2c chip drivers

Some months ago, you killed the address ranges mechanism from all
sensors i2c chip drivers (both the module parameters and the in-code
address lists). I think it was a very good move, as the ranges can
easily be replaced by individual addresses, and this allowed for
significant cleanups in the i2c core (let alone the impressive size
shrink for all these drivers).

Unfortunately you did not do the same for non-sensors i2c chip drivers.
These need the address ranges even less, so we could get rid of the
ranges here as well for another significant i2c core cleanup. Here comes
a patch which does just that. Since the process is exactly the same as
what you already did for the other set of drivers, I did not split this
one into parts.

A documentation update is included.

The change saves 308 bytes in the i2c core, and an average of 1382 bytes
for chip drivers which use I2C_CLIENT_INSMOD, 126 bytes for those which
do not.

This change is required if we want to merge the sensors and non-sensors
i2c code (and we want to do this).

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

Index: gregkh-2.6/Documentation/i2c/writing-clients
===================================================================
---
 Documentation/i2c/writing-clients       | 62 +++++++--------------------------
 drivers/acorn/char/pcf8583.c            |  3 --
 drivers/i2c/chips/isp1301_omap.c        |  1 -
 drivers/i2c/chips/m41t00.c              |  3 --
 drivers/i2c/chips/rtc8564.c             |  3 --
 drivers/i2c/i2c-core.c                  | 35 -------------------
 drivers/macintosh/therm_windtunnel.c    |  6 ++--
 drivers/media/video/adv7170.c           |  6 ----
 drivers/media/video/adv7175.c           |  6 ----
 drivers/media/video/bt819.c             |  6 ----
 drivers/media/video/bt832.c             |  4 +--
 drivers/media/video/bt856.c             |  6 ----
 drivers/media/video/msp3400.c           |  1 -
 drivers/media/video/saa5246a.c          |  1 -
 drivers/media/video/saa5249.c           |  1 -
 drivers/media/video/saa7110.c           |  6 ----
 drivers/media/video/saa7111.c           |  6 ----
 drivers/media/video/saa7114.c           |  6 ----
 drivers/media/video/saa7134/saa6752hs.c |  1 -
 drivers/media/video/saa7185.c           |  6 ----
 drivers/media/video/tda7432.c           |  1 -
 drivers/media/video/tda9840.c           |  1 -
 drivers/media/video/tda9875.c           |  1 -
 drivers/media/video/tda9887.c           |  1 -
 drivers/media/video/tea6415c.c          |  1 -
 drivers/media/video/tea6420.c           |  1 -
 drivers/media/video/tuner-3036.c        | 13 +++----
 drivers/media/video/tuner-core.c        | 11 +++---
 drivers/media/video/tvaudio.c           |  1 -
 drivers/media/video/tveeprom.c          |  1 -
 drivers/media/video/vpx3220.c           |  6 ----
 drivers/video/matrox/matroxfb_maven.c   |  1 -
 include/linux/i2c.h                     | 12 -------
 33 files changed, 27 insertions(+), 193 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/i2c/writing-clients b/Documentation/i2c/writing-clients
index ad27511e3c7d..f482dae81de3 100644
--- a/Documentation/i2c/writing-clients
+++ b/Documentation/i2c/writing-clients
@@ -171,45 +171,31 @@ The following lists are used internally:
 
   normal_i2c: filled in by the module writer. 
      A list of I2C addresses which should normally be examined.
-   normal_i2c_range: filled in by the module writer.
-     A list of pairs of I2C addresses, each pair being an inclusive range of
-     addresses which should normally be examined.
    probe: insmod parameter. 
      A list of pairs. The first value is a bus number (-1 for any I2C bus), 
      the second is the address. These addresses are also probed, as if they 
      were in the 'normal' list.
-   probe_range: insmod parameter. 
-     A list of triples. The first value is a bus number (-1 for any I2C bus), 
-     the second and third are addresses.  These form an inclusive range of 
-     addresses that are also probed, as if they were in the 'normal' list.
    ignore: insmod parameter.
      A list of pairs. The first value is a bus number (-1 for any I2C bus), 
      the second is the I2C address. These addresses are never probed. 
      This parameter overrules 'normal' and 'probe', but not the 'force' lists.
-   ignore_range: insmod parameter. 
-     A list of triples. The first value is a bus number (-1 for any I2C bus), 
-     the second and third are addresses. These form an inclusive range of 
-     I2C addresses that are never probed.
-     This parameter overrules 'normal' and 'probe', but not the 'force' lists.
    force: insmod parameter. 
      A list of pairs. The first value is a bus number (-1 for any I2C bus),
      the second is the I2C address. A device is blindly assumed to be on
      the given address, no probing is done. 
 
-Fortunately, as a module writer, you just have to define the `normal' 
-and/or `normal_range' parameters. The complete declaration could look
-like this:
+Fortunately, as a module writer, you just have to define the `normal_i2c' 
+parameter. The complete declaration could look like this:
 
-  /* Scan 0x20 to 0x2f, 0x37, and 0x40 to 0x4f */
-  static unsigned short normal_i2c[] = { 0x37,I2C_CLIENT_END }; 
-  static unsigned short normal_i2c_range[] = { 0x20, 0x2f, 0x40, 0x4f, 
-                                               I2C_CLIENT_END };
+  /* Scan 0x37, and 0x48 to 0x4f */
+  static unsigned short normal_i2c[] = { 0x37, 0x48, 0x49, 0x4a, 0x4b, 0x4c,
+                                         0x4d, 0x4e, 0x4f, I2C_CLIENT_END };
 
   /* Magic definition of all other variables and things */
   I2C_CLIENT_INSMOD;
 
-Note that you *have* to call the two defined variables `normal_i2c' and
-`normal_i2c_range', without any prefix!
+Note that you *have* to call the defined variable `normal_i2c',
+without any prefix!
 
 
 Probing classes (sensors)
@@ -223,39 +209,17 @@ The following lists are used internally. They are all lists of integers.
 
    normal_i2c: filled in by the module writer. Terminated by SENSORS_I2C_END.
      A list of I2C addresses which should normally be examined.
-   normal_i2c_range: filled in by the module writer. Terminated by 
-     SENSORS_I2C_END
-     A list of pairs of I2C addresses, each pair being an inclusive range of
-     addresses which should normally be examined.
    normal_isa: filled in by the module writer. Terminated by SENSORS_ISA_END.
      A list of ISA addresses which should normally be examined.
-   normal_isa_range: filled in by the module writer. Terminated by 
-     SENSORS_ISA_END
-     A list of triples. The first two elements are ISA addresses, being an
-     range of addresses which should normally be examined. The third is the
-     modulo parameter: only addresses which are 0 module this value relative
-     to the first address of the range are actually considered.
    probe: insmod parameter. Initialize this list with SENSORS_I2C_END values.
      A list of pairs. The first value is a bus number (SENSORS_ISA_BUS for
      the ISA bus, -1 for any I2C bus), the second is the address. These
      addresses are also probed, as if they were in the 'normal' list.
-   probe_range: insmod parameter. Initialize this list with SENSORS_I2C_END 
-     values.
-     A list of triples. The first value is a bus number (SENSORS_ISA_BUS for
-     the ISA bus, -1 for any I2C bus), the second and third are addresses. 
-     These form an inclusive range of addresses that are also probed, as
-     if they were in the 'normal' list.
    ignore: insmod parameter. Initialize this list with SENSORS_I2C_END values.
      A list of pairs. The first value is a bus number (SENSORS_ISA_BUS for
      the ISA bus, -1 for any I2C bus), the second is the I2C address. These
      addresses are never probed. This parameter overrules 'normal' and 
      'probe', but not the 'force' lists.
-   ignore_range: insmod parameter. Initialize this list with SENSORS_I2C_END 
-      values.
-     A list of triples. The first value is a bus number (SENSORS_ISA_BUS for
-     the ISA bus, -1 for any I2C bus), the second and third are addresses. 
-     These form an inclusive range of I2C addresses that are never probed.
-     This parameter overrules 'normal' and 'probe', but not the 'force' lists.
 
 Also used is a list of pointers to sensors_force_data structures:
    force_data: insmod parameters. A list, ending with an element of which
@@ -269,16 +233,14 @@ Also used is a list of pointers to sensors_force_data structures:
 So we have a generic insmod variabled `force', and chip-specific variables
 `force_CHIPNAME'.
 
-Fortunately, as a module writer, you just have to define the `normal' 
-and/or `normal_range' parameters, and define what chip names are used. 
+Fortunately, as a module writer, you just have to define the `normal_i2c' 
+and `normal_isa' parameters, and define what chip names are used. 
 The complete declaration could look like this:
-  /* Scan i2c addresses 0x20 to 0x2f, 0x37, and 0x40 to 0x4f
-  static unsigned short normal_i2c[] = {0x37,SENSORS_I2C_END};
-  static unsigned short normal_i2c_range[] = {0x20,0x2f,0x40,0x4f,
-                                              SENSORS_I2C_END};
+  /* Scan i2c addresses 0x37, and 0x48 to 0x4f */
+  static unsigned short normal_i2c[] = { 0x37, 0x48, 0x49, 0x4a, 0x4b, 0x4c,
+                                         0x4d, 0x4e, 0x4f, I2C_CLIENT_END };
   /* Scan ISA address 0x290 */
   static unsigned int normal_isa[] = {0x0290,SENSORS_ISA_END};
-  static unsigned int normal_isa_range[] = {SENSORS_ISA_END};
 
   /* Define chips foo and bar, as well as all module parameters and things */
   SENSORS_INSMOD_2(foo,bar);
diff --git a/drivers/acorn/char/pcf8583.c b/drivers/acorn/char/pcf8583.c
index ad7ae7ab8920..141b4c237a50 100644
--- a/drivers/acorn/char/pcf8583.c
+++ b/drivers/acorn/char/pcf8583.c
@@ -26,11 +26,8 @@ static unsigned short normal_addr[] = { 0x50, I2C_CLIENT_END };
 
 static struct i2c_client_address_data addr_data = {
 	.normal_i2c		= normal_addr,
-	.normal_i2c_range	= ignore,
 	.probe			= ignore,
-	.probe_range		= ignore,
 	.ignore			= ignore,
-	.ignore_range		= ignore,
 	.force			= ignore,
 };
 
diff --git a/drivers/i2c/chips/isp1301_omap.c b/drivers/i2c/chips/isp1301_omap.c
index 7f29a8aff165..354a26295672 100644
--- a/drivers/i2c/chips/isp1301_omap.c
+++ b/drivers/i2c/chips/isp1301_omap.c
@@ -145,7 +145,6 @@ static inline void notresponding(struct isp1301 *isp)
 static unsigned short normal_i2c[] = {
 	ISP_BASE, ISP_BASE + 1,
 	I2C_CLIENT_END };
-static unsigned short normal_i2c_range[] = { I2C_CLIENT_END };
 
 I2C_CLIENT_INSMOD;
 
diff --git a/drivers/i2c/chips/m41t00.c b/drivers/i2c/chips/m41t00.c
index e771566dffa8..5e463c47bfbc 100644
--- a/drivers/i2c/chips/m41t00.c
+++ b/drivers/i2c/chips/m41t00.c
@@ -40,11 +40,8 @@ static unsigned short normal_addr[] = { 0x68, I2C_CLIENT_END };
 
 static struct i2c_client_address_data addr_data = {
 	.normal_i2c		= normal_addr,
-	.normal_i2c_range	= ignore,
 	.probe			= ignore,
-	.probe_range		= ignore,
 	.ignore			= ignore,
-	.ignore_range		= ignore,
 	.force			= ignore,
 };
 
diff --git a/drivers/i2c/chips/rtc8564.c b/drivers/i2c/chips/rtc8564.c
index 5a9deddb626b..30f553e73700 100644
--- a/drivers/i2c/chips/rtc8564.c
+++ b/drivers/i2c/chips/rtc8564.c
@@ -66,11 +66,8 @@ static unsigned short normal_addr[] = { 0x51, I2C_CLIENT_END };
 
 static struct i2c_client_address_data addr_data = {
 	.normal_i2c		= normal_addr,
-	.normal_i2c_range	= ignore,
 	.probe			= ignore,
-	.probe_range		= ignore,
 	.ignore			= ignore,
-	.ignore_range		= ignore,
 	.force			= ignore,
 };
 
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index a22e53badacb..4cc8c9f7211c 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -742,18 +742,6 @@ int i2c_probe(struct i2c_adapter *adapter,
 				found = 1;
 			}
 		}
-		for (i = 0;
-		     !found && (address_data->ignore_range[i] != I2C_CLIENT_END);
-		     i += 3) {
-			if (((adap_id == address_data->ignore_range[i]) ||
-			    ((address_data->ignore_range[i]==ANY_I2C_BUS))) &&
-			    (addr >= address_data->ignore_range[i+1]) &&
-			    (addr <= address_data->ignore_range[i+2])) {
-				dev_dbg(&adapter->dev, "found ignore_range parameter for adapter %d, "
-					"addr %04x\n", adap_id,addr);
-				found = 1;
-			}
-		}
 		if (found) 
 			continue;
 
@@ -769,17 +757,6 @@ int i2c_probe(struct i2c_adapter *adapter,
 			}
 		}
 
-		for (i = 0;
-		     !found && (address_data->normal_i2c_range[i] != I2C_CLIENT_END);
-		     i += 2) {
-			if ((addr >= address_data->normal_i2c_range[i]) &&
-			    (addr <= address_data->normal_i2c_range[i+1])) {
-				found = 1;
-				dev_dbg(&adapter->dev, "found normal i2c_range entry for adapter %d, "
-					"addr %04x\n", adap_id,addr);
-			}
-		}
-
 		for (i = 0;
 		     !found && (address_data->probe[i] != I2C_CLIENT_END);
 		     i += 2) {
@@ -791,18 +768,6 @@ int i2c_probe(struct i2c_adapter *adapter,
 					"addr %04x\n", adap_id,addr);
 			}
 		}
-		for (i = 0;
-		     !found && (address_data->probe_range[i] != I2C_CLIENT_END);
-		     i += 3) {
-			if (((adap_id == address_data->probe_range[i]) ||
-			   (address_data->probe_range[i] == ANY_I2C_BUS)) &&
-			   (addr >= address_data->probe_range[i+1]) &&
-			   (addr <= address_data->probe_range[i+2])) {
-				found = 1;
-				dev_dbg(&adapter->dev, "found probe_range parameter for adapter %d, "
-					"addr %04x\n", adap_id,addr);
-			}
-		}
 		if (!found) 
 			continue;
 
diff --git a/drivers/macintosh/therm_windtunnel.c b/drivers/macintosh/therm_windtunnel.c
index 0bdb47f08c2a..61400f04015e 100644
--- a/drivers/macintosh/therm_windtunnel.c
+++ b/drivers/macintosh/therm_windtunnel.c
@@ -51,8 +51,10 @@
 static int 			do_probe( struct i2c_adapter *adapter, int addr, int kind);
 
 /* scan 0x48-0x4f (DS1775) and 0x2c-2x2f (ADM1030) */
-static unsigned short		normal_i2c[] = { 0x49, 0x2c, I2C_CLIENT_END };
-static unsigned short		normal_i2c_range[] = { 0x48, 0x4f, 0x2c, 0x2f, I2C_CLIENT_END };
+static unsigned short		normal_i2c[] = { 0x48, 0x49, 0x4a, 0x4b,
+						 0x4c, 0x4d, 0x4e, 0x4f,
+						 0x2c, 0x2d, 0x2e, 0x2f,
+						 I2C_CLIENT_END };
 
 I2C_CLIENT_INSMOD;
 
diff --git a/drivers/media/video/adv7170.c b/drivers/media/video/adv7170.c
index 80254caa444c..e9bf3394296a 100644
--- a/drivers/media/video/adv7170.c
+++ b/drivers/media/video/adv7170.c
@@ -384,21 +384,15 @@ static unsigned short normal_i2c[] =
 	I2C_ADV7171 >> 1, (I2C_ADV7171 >> 1) + 1,
 	I2C_CLIENT_END
 };
-static unsigned short normal_i2c_range[] = { I2C_CLIENT_END };
 
 static unsigned short probe[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short probe_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short ignore[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short ignore_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short force[2] = { I2C_CLIENT_END , I2C_CLIENT_END };
                                                                                 
 static struct i2c_client_address_data addr_data = {
 	.normal_i2c		= normal_i2c,
-	.normal_i2c_range	= normal_i2c_range,
 	.probe			= probe,
-	.probe_range		= probe_range,
 	.ignore			= ignore,
-	.ignore_range		= ignore_range,
 	.force			= force
 };
 
diff --git a/drivers/media/video/adv7175.c b/drivers/media/video/adv7175.c
index 95d0974b0ab5..2d5fa44fcd4d 100644
--- a/drivers/media/video/adv7175.c
+++ b/drivers/media/video/adv7175.c
@@ -434,21 +434,15 @@ static unsigned short normal_i2c[] =
 	I2C_ADV7176 >> 1, (I2C_ADV7176 >> 1) + 1,
 	I2C_CLIENT_END
 };
-static unsigned short normal_i2c_range[] = { I2C_CLIENT_END };
 
 static unsigned short probe[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short probe_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short ignore[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short ignore_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short force[2] = { I2C_CLIENT_END , I2C_CLIENT_END };
                                                                                 
 static struct i2c_client_address_data addr_data = {
 	.normal_i2c		= normal_i2c,
-	.normal_i2c_range	= normal_i2c_range,
 	.probe			= probe,
-	.probe_range		= probe_range,
 	.ignore			= ignore,
-	.ignore_range		= ignore_range,
 	.force			= force
 };
 
diff --git a/drivers/media/video/bt819.c b/drivers/media/video/bt819.c
index cf0db2554a80..31d51851bb44 100644
--- a/drivers/media/video/bt819.c
+++ b/drivers/media/video/bt819.c
@@ -500,21 +500,15 @@ static unsigned short normal_i2c[] = {
 	I2C_BT819 >> 1,
 	I2C_CLIENT_END,
 };
-static unsigned short normal_i2c_range[] = { I2C_CLIENT_END };
 
 static unsigned short probe[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short probe_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short ignore[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short ignore_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short force[2] = { I2C_CLIENT_END , I2C_CLIENT_END };
                                                                                 
 static struct i2c_client_address_data addr_data = {
 	.normal_i2c		= normal_i2c,
-	.normal_i2c_range	= normal_i2c_range,
 	.probe			= probe,
-	.probe_range		= probe_range,
 	.ignore			= ignore,
-	.ignore_range		= ignore_range,
 	.force			= force
 };
 
diff --git a/drivers/media/video/bt832.c b/drivers/media/video/bt832.c
index efe605a113a1..07f72f64c5f7 100644
--- a/drivers/media/video/bt832.c
+++ b/drivers/media/video/bt832.c
@@ -39,8 +39,8 @@
 MODULE_LICENSE("GPL");
 
 /* Addresses to scan */
-static unsigned short normal_i2c[] = {I2C_CLIENT_END};
-static unsigned short normal_i2c_range[] = {I2C_BT832_ALT1>>1,I2C_BT832_ALT2>>1,I2C_CLIENT_END};
+static unsigned short normal_i2c[] = { I2C_BT832_ALT1>>1, I2C_BT832_ALT2>>1,
+				       I2C_CLIENT_END };
 I2C_CLIENT_INSMOD;
 
 /* ---------------------------------------------------------------------- */
diff --git a/drivers/media/video/bt856.c b/drivers/media/video/bt856.c
index 72c7eb0f8c24..59121a0ec816 100644
--- a/drivers/media/video/bt856.c
+++ b/drivers/media/video/bt856.c
@@ -288,21 +288,15 @@ bt856_command (struct i2c_client *client,
  * concerning the addresses: i2c wants 7 bit (without the r/w bit), so '>>1'
  */
 static unsigned short normal_i2c[] = { I2C_BT856 >> 1, I2C_CLIENT_END };
-static unsigned short normal_i2c_range[] = { I2C_CLIENT_END };
 
 static unsigned short probe[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short probe_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short ignore[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short ignore_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short force[2] = { I2C_CLIENT_END , I2C_CLIENT_END };
                                                                                 
 static struct i2c_client_address_data addr_data = {
 	.normal_i2c		= normal_i2c,
-	.normal_i2c_range	= normal_i2c_range,
 	.probe			= probe,
-	.probe_range		= probe_range,
 	.ignore			= ignore,
-	.ignore_range		= ignore_range,
 	.force			= force
 };
 
diff --git a/drivers/media/video/msp3400.c b/drivers/media/video/msp3400.c
index 7fbb8581a87d..09464d624a6b 100644
--- a/drivers/media/video/msp3400.c
+++ b/drivers/media/video/msp3400.c
@@ -147,7 +147,6 @@ static unsigned short normal_i2c[] = {
 	I2C_MSP3400C_ALT  >> 1,
 	I2C_CLIENT_END
 };
-static unsigned short normal_i2c_range[] = {I2C_CLIENT_END,I2C_CLIENT_END};
 I2C_CLIENT_INSMOD;
 
 /* ----------------------------------------------------------------------- */
diff --git a/drivers/media/video/saa5246a.c b/drivers/media/video/saa5246a.c
index ba69f09cbdd1..b8054da31ffd 100644
--- a/drivers/media/video/saa5246a.c
+++ b/drivers/media/video/saa5246a.c
@@ -64,7 +64,6 @@ static struct video_device saa_template;	/* Declared near bottom */
 
 /* Addresses to scan */
 static unsigned short normal_i2c[]	 = { I2C_ADDRESS, I2C_CLIENT_END };
-static unsigned short normal_i2c_range[] = { I2C_CLIENT_END };
 I2C_CLIENT_INSMOD;
 
 static struct i2c_client client_template;
diff --git a/drivers/media/video/saa5249.c b/drivers/media/video/saa5249.c
index d74caa139f0a..7ffa2e9a9bf3 100644
--- a/drivers/media/video/saa5249.c
+++ b/drivers/media/video/saa5249.c
@@ -132,7 +132,6 @@ static struct video_device saa_template;	/* Declared near bottom */
 
 /* Addresses to scan */
 static unsigned short normal_i2c[] = {34>>1,I2C_CLIENT_END};
-static unsigned short normal_i2c_range[] = {I2C_CLIENT_END};
 I2C_CLIENT_INSMOD;
 
 static struct i2c_client client_template;
diff --git a/drivers/media/video/saa7110.c b/drivers/media/video/saa7110.c
index 64273b438530..90b0a0b34f38 100644
--- a/drivers/media/video/saa7110.c
+++ b/drivers/media/video/saa7110.c
@@ -463,21 +463,15 @@ static unsigned short normal_i2c[] = {
 	(I2C_SAA7110 >> 1) + 1,
 	I2C_CLIENT_END
 };
-static unsigned short normal_i2c_range[] = { I2C_CLIENT_END };
 
 static unsigned short probe[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short probe_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short ignore[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short ignore_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short force[2] = { I2C_CLIENT_END , I2C_CLIENT_END };
                                                                                 
 static struct i2c_client_address_data addr_data = {
 	.normal_i2c		= normal_i2c,
-	.normal_i2c_range	= normal_i2c_range,
 	.probe			= probe,
-	.probe_range		= probe_range,
 	.ignore			= ignore,
-	.ignore_range		= ignore_range,
 	.force			= force
 };
 
diff --git a/drivers/media/video/saa7111.c b/drivers/media/video/saa7111.c
index 0a873112ae23..e305a89f7cd7 100644
--- a/drivers/media/video/saa7111.c
+++ b/drivers/media/video/saa7111.c
@@ -482,21 +482,15 @@ saa7111_command (struct i2c_client *client,
  * concerning the addresses: i2c wants 7 bit (without the r/w bit), so '>>1'
  */
 static unsigned short normal_i2c[] = { I2C_SAA7111 >> 1, I2C_CLIENT_END };
-static unsigned short normal_i2c_range[] = { I2C_CLIENT_END };
 
 static unsigned short probe[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short probe_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short ignore[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short ignore_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short force[2] = { I2C_CLIENT_END , I2C_CLIENT_END };
                                                                                 
 static struct i2c_client_address_data addr_data = {
 	.normal_i2c		= normal_i2c,
-	.normal_i2c_range	= normal_i2c_range,
 	.probe			= probe,
-	.probe_range		= probe_range,
 	.ignore			= ignore,
-	.ignore_range		= ignore_range,
 	.force			= force
 };
 
diff --git a/drivers/media/video/saa7114.c b/drivers/media/video/saa7114.c
index e73023695e58..1ca4e70fed76 100644
--- a/drivers/media/video/saa7114.c
+++ b/drivers/media/video/saa7114.c
@@ -820,21 +820,15 @@ saa7114_command (struct i2c_client *client,
  */
 static unsigned short normal_i2c[] =
     { I2C_SAA7114 >> 1, I2C_SAA7114A >> 1, I2C_CLIENT_END };
-static unsigned short normal_i2c_range[] = { I2C_CLIENT_END };
 
 static unsigned short probe[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short probe_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short ignore[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short ignore_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short force[2] = { I2C_CLIENT_END , I2C_CLIENT_END };
                                                                                 
 static struct i2c_client_address_data addr_data = {
 	.normal_i2c		= normal_i2c,
-	.normal_i2c_range	= normal_i2c_range,
 	.probe			= probe,
-	.probe_range		= probe_range,
 	.ignore			= ignore,
-	.ignore_range		= ignore_range,
 	.force			= force
 };
 
diff --git a/drivers/media/video/saa7134/saa6752hs.c b/drivers/media/video/saa7134/saa6752hs.c
index 1db022682980..42c2b565c9fe 100644
--- a/drivers/media/video/saa7134/saa6752hs.c
+++ b/drivers/media/video/saa7134/saa6752hs.c
@@ -22,7 +22,6 @@
 
 /* Addresses to scan */
 static unsigned short normal_i2c[] = {0x20, I2C_CLIENT_END};
-static unsigned short normal_i2c_range[] = {I2C_CLIENT_END};
 I2C_CLIENT_INSMOD;
 
 MODULE_DESCRIPTION("device driver for saa6752hs MPEG2 encoder");
diff --git a/drivers/media/video/saa7185.c b/drivers/media/video/saa7185.c
index 5f0b224c3cb6..5c623fadc8fe 100644
--- a/drivers/media/video/saa7185.c
+++ b/drivers/media/video/saa7185.c
@@ -380,21 +380,15 @@ saa7185_command (struct i2c_client *client,
  * concerning the addresses: i2c wants 7 bit (without the r/w bit), so '>>1'
  */
 static unsigned short normal_i2c[] = { I2C_SAA7185 >> 1, I2C_CLIENT_END };
-static unsigned short normal_i2c_range[] = { I2C_CLIENT_END };
 
 static unsigned short probe[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short probe_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short ignore[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short ignore_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short force[2] = { I2C_CLIENT_END , I2C_CLIENT_END };
                                                                                 
 static struct i2c_client_address_data addr_data = {
 	.normal_i2c		= normal_i2c,
-	.normal_i2c_range	= normal_i2c_range,
 	.probe			= probe,
-	.probe_range		= probe_range,
 	.ignore			= ignore,
-	.ignore_range		= ignore_range,
 	.force			= force
 };
 
diff --git a/drivers/media/video/tda7432.c b/drivers/media/video/tda7432.c
index 376a4a439e9b..07ba6d3ed08c 100644
--- a/drivers/media/video/tda7432.c
+++ b/drivers/media/video/tda7432.c
@@ -74,7 +74,6 @@ static unsigned short normal_i2c[] = {
 	I2C_TDA7432 >> 1,
 	I2C_CLIENT_END,
 };
-static unsigned short normal_i2c_range[] = { I2C_CLIENT_END, I2C_CLIENT_END };
 I2C_CLIENT_INSMOD;
 
 /* Structure of address and subaddresses for the tda7432 */
diff --git a/drivers/media/video/tda9840.c b/drivers/media/video/tda9840.c
index b5177c6f54f6..c29bdfc3244e 100644
--- a/drivers/media/video/tda9840.c
+++ b/drivers/media/video/tda9840.c
@@ -43,7 +43,6 @@ MODULE_PARM_DESC(debug, "Turn on/off device debugging (default:off).");
 
 /* addresses to scan, found only at 0x42 (7-Bit) */
 static unsigned short normal_i2c[] = { I2C_TDA9840, I2C_CLIENT_END };
-static unsigned short normal_i2c_range[] = { I2C_CLIENT_END };
 
 /* magic definition of all other variables and things */
 I2C_CLIENT_INSMOD;
diff --git a/drivers/media/video/tda9875.c b/drivers/media/video/tda9875.c
index 4f1114c033a1..97b113e070f3 100644
--- a/drivers/media/video/tda9875.c
+++ b/drivers/media/video/tda9875.c
@@ -44,7 +44,6 @@ static unsigned short normal_i2c[] =  {
     I2C_TDA9875 >> 1,
     I2C_CLIENT_END
 };
-static unsigned short normal_i2c_range[] = {I2C_CLIENT_END};
 I2C_CLIENT_INSMOD;
 
 /* This is a superset of the TDA9875 */
diff --git a/drivers/media/video/tda9887.c b/drivers/media/video/tda9887.c
index debef1910c37..7e6e6dd966a2 100644
--- a/drivers/media/video/tda9887.c
+++ b/drivers/media/video/tda9887.c
@@ -33,7 +33,6 @@ static unsigned short normal_i2c[] = {
 	0x96 >>1,
 	I2C_CLIENT_END,
 };
-static unsigned short normal_i2c_range[] = {I2C_CLIENT_END,I2C_CLIENT_END};
 I2C_CLIENT_INSMOD;
 
 /* insmod options */
diff --git a/drivers/media/video/tea6415c.c b/drivers/media/video/tea6415c.c
index 3ec39550bf46..b44db8a7b94d 100644
--- a/drivers/media/video/tea6415c.c
+++ b/drivers/media/video/tea6415c.c
@@ -43,7 +43,6 @@ MODULE_PARM_DESC(debug, "Turn on/off device debugging (default:off).");
 
 /* addresses to scan, found only at 0x03 and/or 0x43 (7-bit) */
 static unsigned short normal_i2c[] = { I2C_TEA6415C_1, I2C_TEA6415C_2, I2C_CLIENT_END };
-static unsigned short normal_i2c_range[] = { I2C_CLIENT_END };
 
 /* magic definition of all other variables and things */
 I2C_CLIENT_INSMOD;
diff --git a/drivers/media/video/tea6420.c b/drivers/media/video/tea6420.c
index bd10710fd909..48d4db7d507b 100644
--- a/drivers/media/video/tea6420.c
+++ b/drivers/media/video/tea6420.c
@@ -40,7 +40,6 @@ MODULE_PARM_DESC(debug, "Turn on/off device debugging (default:off).");
 
 /* addresses to scan, found only at 0x4c and/or 0x4d (7-Bit) */
 static unsigned short normal_i2c[] = { I2C_TEA6420_1, I2C_TEA6420_2, I2C_CLIENT_END };
-static unsigned short normal_i2c_range[] = { I2C_CLIENT_END };
 
 /* magic definition of all other variables and things */
 I2C_CLIENT_INSMOD;
diff --git a/drivers/media/video/tuner-3036.c b/drivers/media/video/tuner-3036.c
index 6b20aa902a8f..bedb15e2f233 100644
--- a/drivers/media/video/tuner-3036.c
+++ b/drivers/media/video/tuner-3036.c
@@ -34,19 +34,16 @@ static int this_adap;
 static struct i2c_client client_template;
 
 /* Addresses to scan */
-static unsigned short normal_i2c[] = {I2C_CLIENT_END};
-static unsigned short normal_i2c_range[] = {0x60, 0x61, I2C_CLIENT_END};
+static unsigned short normal_i2c[] = { 0x60, 0x61, I2C_CLIENT_END };
 static unsigned short probe[2]        = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short probe_range[2]  = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short ignore[2]       = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short ignore_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short force[2]        = { I2C_CLIENT_END, I2C_CLIENT_END };
 
 static struct i2c_client_address_data addr_data = {
-	normal_i2c, normal_i2c_range, 
-	probe, probe_range, 
-	ignore, ignore_range, 
-	force
+	.normal_i2c	= normal_i2c,
+	.probe		= probe,
+	.ignore		= ignore,
+	.force		= force,
 };
 
 /* ---------------------------------------------------------------------- */
diff --git a/drivers/media/video/tuner-core.c b/drivers/media/video/tuner-core.c
index 6212388edb75..81882ddab859 100644
--- a/drivers/media/video/tuner-core.c
+++ b/drivers/media/video/tuner-core.c
@@ -28,10 +28,8 @@
 /* standard i2c insmod options */
 static unsigned short normal_i2c[] = {
 	0x4b, /* tda8290 */
-	I2C_CLIENT_END
-};
-static unsigned short normal_i2c_range[] = {
-	0x60, 0x6f,
+	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
 	I2C_CLIENT_END
 };
 I2C_CLIENT_INSMOD;
@@ -225,9 +223,8 @@ static int tuner_attach(struct i2c_adapter *adap, int addr, int kind)
 static int tuner_probe(struct i2c_adapter *adap)
 {
 	if (0 != addr) {
-		normal_i2c[0]       = addr;
-		normal_i2c_range[0] = addr;
-		normal_i2c_range[1] = addr;
+		normal_i2c[0] = addr;
+		normal_i2c[1] = I2C_CLIENT_END;
 	}
 	this_adap = 0;
 
diff --git a/drivers/media/video/tvaudio.c b/drivers/media/video/tvaudio.c
index 80dc34f18c2c..41b635e0d3c6 100644
--- a/drivers/media/video/tvaudio.c
+++ b/drivers/media/video/tvaudio.c
@@ -148,7 +148,6 @@ static unsigned short normal_i2c[] = {
 	I2C_TDA9874   >> 1,
 	I2C_PIC16C54  >> 1,
 	I2C_CLIENT_END };
-static unsigned short normal_i2c_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 I2C_CLIENT_INSMOD;
 
 static struct i2c_driver driver;
diff --git a/drivers/media/video/tveeprom.c b/drivers/media/video/tveeprom.c
index e1443a0937e3..3d216973798c 100644
--- a/drivers/media/video/tveeprom.c
+++ b/drivers/media/video/tveeprom.c
@@ -482,7 +482,6 @@ static unsigned short normal_i2c[] = {
 	0xa0 >> 1,
 	I2C_CLIENT_END,
 };
-static unsigned short normal_i2c_range[] = { I2C_CLIENT_END };
 I2C_CLIENT_INSMOD;
 
 struct i2c_driver i2c_driver_tveeprom;
diff --git a/drivers/media/video/vpx3220.c b/drivers/media/video/vpx3220.c
index 0fd6c9a70917..b97036910fa9 100644
--- a/drivers/media/video/vpx3220.c
+++ b/drivers/media/video/vpx3220.c
@@ -569,21 +569,15 @@ static unsigned short normal_i2c[] =
     { I2C_VPX3220 >> 1, (I2C_VPX3220 >> 1) + 4,
 	I2C_CLIENT_END
 };
-static unsigned short normal_i2c_range[] = { I2C_CLIENT_END };
 
 static unsigned short probe[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short probe_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short ignore[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
-static unsigned short ignore_range[2] = { I2C_CLIENT_END, I2C_CLIENT_END };
 static unsigned short force[2] = { I2C_CLIENT_END , I2C_CLIENT_END };
                                                                                 
 static struct i2c_client_address_data addr_data = {
 	.normal_i2c		= normal_i2c,
-	.normal_i2c_range	= normal_i2c_range,
 	.probe			= probe,
-	.probe_range		= probe_range,
 	.ignore			= ignore,
-	.ignore_range		= ignore_range,
 	.force			= force
 };
 
diff --git a/drivers/video/matrox/matroxfb_maven.c b/drivers/video/matrox/matroxfb_maven.c
index e529841cd83d..67f85344f0cc 100644
--- a/drivers/video/matrox/matroxfb_maven.c
+++ b/drivers/video/matrox/matroxfb_maven.c
@@ -1230,7 +1230,6 @@ static int maven_shutdown_client(struct i2c_client* clnt) {
 }
 
 static unsigned short normal_i2c[] = { MAVEN_I2CID, I2C_CLIENT_END };
-static unsigned short normal_i2c_range[] = { MAVEN_I2CID, MAVEN_I2CID, I2C_CLIENT_END };
 I2C_CLIENT_INSMOD;
 
 static struct i2c_driver maven_driver;
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index ebcd745f4cd6..be837b13f297 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -290,11 +290,8 @@ static inline void i2c_set_adapdata (struct i2c_adapter *dev, void *data)
  */
 struct i2c_client_address_data {
 	unsigned short *normal_i2c;
-	unsigned short *normal_i2c_range;
 	unsigned short *probe;
-	unsigned short *probe_range;
 	unsigned short *ignore;
-	unsigned short *ignore_range;
 	unsigned short *force;
 };
 
@@ -563,24 +560,15 @@ union i2c_smbus_data {
 #define I2C_CLIENT_INSMOD \
   I2C_CLIENT_MODULE_PARM(probe, \
                       "List of adapter,address pairs to scan additionally"); \
-  I2C_CLIENT_MODULE_PARM(probe_range, \
-                      "List of adapter,start-addr,end-addr triples to scan " \
-                      "additionally"); \
   I2C_CLIENT_MODULE_PARM(ignore, \
                       "List of adapter,address pairs not to scan"); \
-  I2C_CLIENT_MODULE_PARM(ignore_range, \
-                      "List of adapter,start-addr,end-addr triples not to " \
-                      "scan"); \
   I2C_CLIENT_MODULE_PARM(force, \
                       "List of adapter,address pairs to boldly assume " \
                       "to be present"); \
 	static struct i2c_client_address_data addr_data = {		\
 			.normal_i2c = 		normal_i2c,		\
-			.normal_i2c_range =	normal_i2c_range,	\
 			.probe =		probe,			\
-			.probe_range =		probe_range,		\
 			.ignore =		ignore,			\
-			.ignore_range =		ignore_range,		\
 			.force =		force,			\
 		}
 
-- 
cgit v1.2.3-59-g8ed1b


From 3886246a257e828248ce1e72ced00408a3557f0d Mon Sep 17 00:00:00 2001
From: Sebastian Witt <se.witt@gmx.net>
Date: Wed, 13 Apr 2005 22:25:39 +0200
Subject: [PATCH] I2C: i2c-vid.h: Support for VID to reg conversion

Adds conversion from VID (mV) to register value. Used by the atxp1 I2C module.
Removed unneeded switch case.

Signed-off-by: Sebastian Witt <se.witt@gmx.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/i2c-vid.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c-vid.h b/include/linux/i2c-vid.h
index 974835e3530f..41d0635e0ba9 100644
--- a/include/linux/i2c-vid.h
+++ b/include/linux/i2c-vid.h
@@ -97,3 +97,15 @@ static inline int vid_from_reg(int val, int vrm)
 		                     2050 - (val) * 50);
 	}
 }
+
+static inline int vid_to_reg(int val, int vrm)
+{
+	switch (vrm) {
+	case 91:		/* VRM 9.1 */
+	case 90:		/* VRM 9.0 */
+		return ((val >= 1100) && (val <= 1850) ?
+			((18499 - val * 10) / 25 + 5) / 10 : -1);
+	default:
+		return -1;
+	}
+}
-- 
cgit v1.2.3-59-g8ed1b


From 10c08f8100ee2c4d27b862635574cdf4ef439e67 Mon Sep 17 00:00:00 2001
From: Jean Delvare <khali@linux-fr.org>
Date: Mon, 6 Jun 2005 19:34:45 +0200
Subject: [PATCH] I2C: rename i2c-sysfs.h to hwmon-sysfs.h

This patch renames the new linux/i2c-sysfs.h header file to
linux/hwmon-sysfs.h. This name seems to be more appropriate since this
file defines macros and structures not related to i2c but to hardware
monitoring drivers. The patch also updates the five hardware monitoring
drivers which include that header file already.

Signed-off-by: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/i2c/chips/adm1026.c |  2 +-
 drivers/i2c/chips/it87.c    |  2 +-
 drivers/i2c/chips/lm63.c    |  2 +-
 drivers/i2c/chips/lm83.c    |  2 +-
 drivers/i2c/chips/lm90.c    |  2 +-
 include/linux/hwmon-sysfs.h | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/i2c-sysfs.h   | 36 ------------------------------------
 7 files changed, 41 insertions(+), 41 deletions(-)
 create mode 100644 include/linux/hwmon-sysfs.h
 delete mode 100644 include/linux/i2c-sysfs.h

(limited to 'include/linux')

diff --git a/drivers/i2c/chips/adm1026.c b/drivers/i2c/chips/adm1026.c
index ddbc01505ed3..3c85fe150cd7 100644
--- a/drivers/i2c/chips/adm1026.c
+++ b/drivers/i2c/chips/adm1026.c
@@ -29,8 +29,8 @@
 #include <linux/jiffies.h>
 #include <linux/i2c.h>
 #include <linux/i2c-sensor.h>
-#include <linux/i2c-sysfs.h>
 #include <linux/i2c-vid.h>
+#include <linux/hwmon-sysfs.h>
 
 /* Addresses to scan */
 static unsigned short normal_i2c[] = { 0x2c, 0x2d, 0x2e, I2C_CLIENT_END };
diff --git a/drivers/i2c/chips/it87.c b/drivers/i2c/chips/it87.c
index 6a9b65a10bbc..db20c9e47393 100644
--- a/drivers/i2c/chips/it87.c
+++ b/drivers/i2c/chips/it87.c
@@ -37,8 +37,8 @@
 #include <linux/jiffies.h>
 #include <linux/i2c.h>
 #include <linux/i2c-sensor.h>
-#include <linux/i2c-sysfs.h>
 #include <linux/i2c-vid.h>
+#include <linux/hwmon-sysfs.h>
 #include <asm/io.h>
 
 
diff --git a/drivers/i2c/chips/lm63.c b/drivers/i2c/chips/lm63.c
index a1fd12bd615f..7c6f9ea5a254 100644
--- a/drivers/i2c/chips/lm63.c
+++ b/drivers/i2c/chips/lm63.c
@@ -43,7 +43,7 @@
 #include <linux/jiffies.h>
 #include <linux/i2c.h>
 #include <linux/i2c-sensor.h>
-#include <linux/i2c-sysfs.h>
+#include <linux/hwmon-sysfs.h>
 
 /*
  * Addresses to scan
diff --git a/drivers/i2c/chips/lm83.c b/drivers/i2c/chips/lm83.c
index 0e0eae4dceaa..a49008b444c8 100644
--- a/drivers/i2c/chips/lm83.c
+++ b/drivers/i2c/chips/lm83.c
@@ -33,7 +33,7 @@
 #include <linux/jiffies.h>
 #include <linux/i2c.h>
 #include <linux/i2c-sensor.h>
-#include <linux/i2c-sysfs.h>
+#include <linux/hwmon-sysfs.h>
 
 /*
  * Addresses to scan
diff --git a/drivers/i2c/chips/lm90.c b/drivers/i2c/chips/lm90.c
index ebd99dfbf9c7..a67dcadf7cb0 100644
--- a/drivers/i2c/chips/lm90.c
+++ b/drivers/i2c/chips/lm90.c
@@ -76,7 +76,7 @@
 #include <linux/jiffies.h>
 #include <linux/i2c.h>
 #include <linux/i2c-sensor.h>
-#include <linux/i2c-sysfs.h>
+#include <linux/hwmon-sysfs.h>
 
 /*
  * Addresses to scan
diff --git a/include/linux/hwmon-sysfs.h b/include/linux/hwmon-sysfs.h
new file mode 100644
index 000000000000..1b5018a965f5
--- /dev/null
+++ b/include/linux/hwmon-sysfs.h
@@ -0,0 +1,36 @@
+/*
+ *  hwmon-sysfs.h - hardware monitoring chip driver sysfs defines
+ *
+ *  Copyright (C) 2005 Yani Ioannou <yani.ioannou@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _LINUX_HWMON_SYSFS_H
+#define _LINUX_HWMON_SYSFS_H
+
+struct sensor_device_attribute{
+	struct device_attribute dev_attr;
+	int index;
+};
+#define to_sensor_dev_attr(_dev_attr) \
+	container_of(_dev_attr, struct sensor_device_attribute, dev_attr)
+
+#define SENSOR_DEVICE_ATTR(_name,_mode,_show,_store,_index)	\
+struct sensor_device_attribute sensor_dev_attr_##_name = {	\
+	.dev_attr =	__ATTR(_name,_mode,_show,_store),	\
+	.index =	_index,					\
+}
+
+#endif /* _LINUX_HWMON_SYSFS_H */
diff --git a/include/linux/i2c-sysfs.h b/include/linux/i2c-sysfs.h
deleted file mode 100644
index d7bf6ce11679..000000000000
--- a/include/linux/i2c-sysfs.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- *  i2c-sysfs.h - i2c chip driver sysfs defines
- *
- *  Copyright (C) 2005 Yani Ioannou <yani.ioannou@gmail.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
- *  (at your option) any later version.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#ifndef _LINUX_I2C_SYSFS_H
-#define _LINUX_I2C_SYSFS_H
-
-struct sensor_device_attribute{
-	struct device_attribute dev_attr;
-	int index;
-};
-#define to_sensor_dev_attr(_dev_attr) \
-	container_of(_dev_attr, struct sensor_device_attribute, dev_attr)
-
-#define SENSOR_DEVICE_ATTR(_name,_mode,_show,_store,_index)	\
-struct sensor_device_attribute sensor_dev_attr_##_name = {	\
-	.dev_attr =	__ATTR(_name,_mode,_show,_store),	\
-	.index =	_index,					\
-}
-
-#endif /* _LINUX_I2C_SYSFS_H */
-- 
cgit v1.2.3-59-g8ed1b


From c124a78d8c7475ecc43f385f34112b638c4228d9 Mon Sep 17 00:00:00 2001
From: Randy Vinson <rvinson@mvista.com>
Date: Fri, 3 Jun 2005 14:36:06 -0700
Subject: [PATCH] I2C: Add support for Maxim/Dallas DS1374 Real-Time Clock Chip
 (1/2)

Add support for Maxim/Dallas DS1374 Real-Time Clock Chip

This change adds support for the Maxim/Dallas DS1374 RTC chip. This chip
is an I2C-based RTC that maintains a simple 32-bit binary seconds count
with battery backup support.

Signed-off-by: Randy Vinson <rvinson@mvista.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/i2c/chips/Kconfig  |  11 ++
 drivers/i2c/chips/Makefile |   1 +
 drivers/i2c/chips/ds1374.c | 266 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/i2c-id.h     |   1 +
 4 files changed, 279 insertions(+)
 create mode 100644 drivers/i2c/chips/ds1374.c

(limited to 'include/linux')

diff --git a/drivers/i2c/chips/Kconfig b/drivers/i2c/chips/Kconfig
index 33de80afd6c6..a0982da09803 100644
--- a/drivers/i2c/chips/Kconfig
+++ b/drivers/i2c/chips/Kconfig
@@ -417,6 +417,17 @@ config SENSORS_DS1337
 	  This driver can also be built as a module.  If so, the module
 	  will be called ds1337.
 
+config SENSORS_DS1374
+	tristate "Maxim/Dallas Semiconductor DS1374 Real Time Clock"
+	depends on I2C && EXPERIMENTAL
+	select I2C_SENSOR
+	help
+	  If you say yes here you get support for Dallas Semiconductor
+	  DS1374 real-time clock chips.
+
+	  This driver can also be built as a module.  If so, the module
+	  will be called ds1374.
+
 config SENSORS_EEPROM
 	tristate "EEPROM reader"
 	depends on I2C && EXPERIMENTAL
diff --git a/drivers/i2c/chips/Makefile b/drivers/i2c/chips/Makefile
index 6bebdc104166..b5e6d2f84f97 100644
--- a/drivers/i2c/chips/Makefile
+++ b/drivers/i2c/chips/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_SENSORS_ADM1031)	+= adm1031.o
 obj-$(CONFIG_SENSORS_ADM9240)	+= adm9240.o
 obj-$(CONFIG_SENSORS_ATXP1)	+= atxp1.o
 obj-$(CONFIG_SENSORS_DS1337)	+= ds1337.o
+obj-$(CONFIG_SENSORS_DS1374)	+= ds1374.o
 obj-$(CONFIG_SENSORS_DS1621)	+= ds1621.o
 obj-$(CONFIG_SENSORS_EEPROM)	+= eeprom.o
 obj-$(CONFIG_SENSORS_FSCHER)	+= fscher.o
diff --git a/drivers/i2c/chips/ds1374.c b/drivers/i2c/chips/ds1374.c
new file mode 100644
index 000000000000..1278d979db2b
--- /dev/null
+++ b/drivers/i2c/chips/ds1374.c
@@ -0,0 +1,266 @@
+/*
+ * drivers/i2c/chips/ds1374.c
+ *
+ * I2C client/driver for the Maxim/Dallas DS1374 Real-Time Clock
+ *
+ * Author: Randy Vinson <rvinson@mvista.com>
+ *
+ * Based on the m41t00.c by Mark Greer <mgreer@mvista.com>
+ *
+ * 2005 (c) MontaVista Software, Inc. This file is licensed under
+ * the terms of the GNU General Public License version 2. This program
+ * is licensed "as is" without any warranty of any kind, whether express
+ * or implied.
+ */
+/*
+ * This i2c client/driver wedges between the drivers/char/genrtc.c RTC
+ * interface and the SMBus interface of the i2c subsystem.
+ * It would be more efficient to use i2c msgs/i2c_transfer directly but, as
+ * recommended in .../Documentation/i2c/writing-clients section
+ * "Sending and receiving", using SMBus level communication is preferred.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/i2c.h>
+#include <linux/rtc.h>
+#include <linux/bcd.h>
+
+#include <asm/time.h>
+#include <asm/rtc.h>
+
+#define DS1374_REG_TOD0		0x00
+#define DS1374_REG_TOD1		0x01
+#define DS1374_REG_TOD2		0x02
+#define DS1374_REG_TOD3		0x03
+#define DS1374_REG_WDALM0	0x04
+#define DS1374_REG_WDALM1	0x05
+#define DS1374_REG_WDALM2	0x06
+#define DS1374_REG_CR		0x07
+#define DS1374_REG_SR		0x08
+#define DS1374_REG_SR_OSF	0x80
+#define DS1374_REG_TCR		0x09
+
+#define	DS1374_DRV_NAME		"ds1374"
+
+static DECLARE_MUTEX(ds1374_mutex);
+
+static struct i2c_driver ds1374_driver;
+static struct i2c_client *save_client;
+
+static unsigned short ignore[] = { I2C_CLIENT_END };
+static unsigned short normal_addr[] = { 0x68, I2C_CLIENT_END };
+
+static struct i2c_client_address_data addr_data = {
+	.normal_i2c = normal_addr,
+	.normal_i2c_range = ignore,
+	.probe = ignore,
+	.probe_range = ignore,
+	.ignore = ignore,
+	.ignore_range = ignore,
+	.force = ignore,
+};
+
+static ulong ds1374_read_rtc(void)
+{
+	ulong time = 0;
+	int reg = DS1374_REG_WDALM0;
+
+	while (reg--) {
+		s32 tmp;
+		if ((tmp = i2c_smbus_read_byte_data(save_client, reg)) < 0) {
+			dev_warn(&save_client->dev,
+				 "can't read from rtc chip\n");
+			return 0;
+		}
+		time = (time << 8) | (tmp & 0xff);
+	}
+	return time;
+}
+
+static void ds1374_write_rtc(ulong time)
+{
+	int reg;
+
+	for (reg = DS1374_REG_TOD0; reg < DS1374_REG_WDALM0; reg++) {
+		if (i2c_smbus_write_byte_data(save_client, reg, time & 0xff)
+		    < 0) {
+			dev_warn(&save_client->dev,
+				 "can't write to rtc chip\n");
+			break;
+		}
+		time = time >> 8;
+	}
+}
+
+static void ds1374_check_rtc_status(void)
+{
+	s32 tmp;
+
+	tmp = i2c_smbus_read_byte_data(save_client, DS1374_REG_SR);
+	if (tmp < 0) {
+		dev_warn(&save_client->dev,
+			 "can't read status from rtc chip\n");
+		return;
+	}
+	if (tmp & DS1374_REG_SR_OSF) {
+		dev_warn(&save_client->dev,
+			 "oscillator discontinuity flagged, time unreliable\n");
+		tmp &= ~DS1374_REG_SR_OSF;
+		tmp = i2c_smbus_write_byte_data(save_client, DS1374_REG_SR,
+						tmp & 0xff);
+		if (tmp < 0)
+			dev_warn(&save_client->dev,
+				 "can't clear discontinuity notification\n");
+	}
+}
+
+ulong ds1374_get_rtc_time(void)
+{
+	ulong t1, t2;
+	int limit = 10;		/* arbitrary retry limit */
+
+	down(&ds1374_mutex);
+
+	/*
+	 * Since the reads are being performed one byte at a time using
+	 * the SMBus vs a 4-byte i2c transfer, there is a chance that a
+	 * carry will occur during the read. To detect this, 2 reads are
+	 * performed and compared.
+	 */
+	do {
+		t1 = ds1374_read_rtc();
+		t2 = ds1374_read_rtc();
+	} while (t1 != t2 && limit--);
+
+	up(&ds1374_mutex);
+
+	if (t1 != t2) {
+		dev_warn(&save_client->dev,
+			 "can't get consistent time from rtc chip\n");
+		t1 = 0;
+	}
+
+	return t1;
+}
+
+static void ds1374_set_tlet(ulong arg)
+{
+	ulong t1, t2;
+	int limit = 10;		/* arbitrary retry limit */
+
+	t1 = *(ulong *) arg;
+
+	down(&ds1374_mutex);
+
+	/*
+	 * Since the writes are being performed one byte at a time using
+	 * the SMBus vs a 4-byte i2c transfer, there is a chance that a
+	 * carry will occur during the write. To detect this, the write
+	 * value is read back and compared.
+	 */
+	do {
+		ds1374_write_rtc(t1);
+		t2 = ds1374_read_rtc();
+	} while (t1 != t2 && limit--);
+
+	up(&ds1374_mutex);
+
+	if (t1 != t2)
+		dev_warn(&save_client->dev,
+			 "can't confirm time set from rtc chip\n");
+}
+
+ulong new_time;
+
+DECLARE_TASKLET_DISABLED(ds1374_tasklet, ds1374_set_tlet, (ulong) & new_time);
+
+int ds1374_set_rtc_time(ulong nowtime)
+{
+	new_time = nowtime;
+
+	if (in_interrupt())
+		tasklet_schedule(&ds1374_tasklet);
+	else
+		ds1374_set_tlet((ulong) & new_time);
+
+	return 0;
+}
+
+/*
+ *****************************************************************************
+ *
+ *	Driver Interface
+ *
+ *****************************************************************************
+ */
+static int ds1374_probe(struct i2c_adapter *adap, int addr, int kind)
+{
+	struct i2c_client *client;
+	int rc;
+
+	client = kmalloc(sizeof(struct i2c_client), GFP_KERNEL);
+	if (!client)
+		return -ENOMEM;
+
+	memset(client, 0, sizeof(struct i2c_client));
+	strncpy(client->name, DS1374_DRV_NAME, I2C_NAME_SIZE);
+	client->flags = I2C_DF_NOTIFY;
+	client->addr = addr;
+	client->adapter = adap;
+	client->driver = &ds1374_driver;
+
+	if ((rc = i2c_attach_client(client)) != 0) {
+		kfree(client);
+		return rc;
+	}
+
+	save_client = client;
+
+	ds1374_check_rtc_status();
+
+	return 0;
+}
+
+static int ds1374_attach(struct i2c_adapter *adap)
+{
+	return i2c_probe(adap, &addr_data, ds1374_probe);
+}
+
+static int ds1374_detach(struct i2c_client *client)
+{
+	int rc;
+
+	if ((rc = i2c_detach_client(client)) == 0) {
+		kfree(i2c_get_clientdata(client));
+		tasklet_kill(&ds1374_tasklet);
+	}
+	return rc;
+}
+
+static struct i2c_driver ds1374_driver = {
+	.owner = THIS_MODULE,
+	.name = DS1374_DRV_NAME,
+	.id = I2C_DRIVERID_DS1374,
+	.flags = I2C_DF_NOTIFY,
+	.attach_adapter = ds1374_attach,
+	.detach_client = ds1374_detach,
+};
+
+static int __init ds1374_init(void)
+{
+	return i2c_add_driver(&ds1374_driver);
+}
+
+static void __exit ds1374_exit(void)
+{
+	i2c_del_driver(&ds1374_driver);
+}
+
+module_init(ds1374_init);
+module_exit(ds1374_exit);
+
+MODULE_AUTHOR("Randy Vinson <rvinson@mvista.com>");
+MODULE_DESCRIPTION("Maxim/Dallas DS1374 RTC I2C Client Driver");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/i2c-id.h b/include/linux/i2c-id.h
index 89270ce51470..33f08258f22b 100644
--- a/include/linux/i2c-id.h
+++ b/include/linux/i2c-id.h
@@ -108,6 +108,7 @@
 #define I2C_DRIVERID_TDA7313	62	/* TDA7313 audio processor	*/
 #define I2C_DRIVERID_MAX6900	63	/* MAX6900 real-time clock	*/
 #define I2C_DRIVERID_SAA7114H	64	/* video decoder		*/
+#define I2C_DRIVERID_DS1374	65	/* DS1374 real time clock	*/
 
 
 #define I2C_DRIVERID_EXP0	0xF0	/* experimental use id's	*/
-- 
cgit v1.2.3-59-g8ed1b


From dd7f0b80926befc8c70a873b5b0c0c7b5fd1e7b9 Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Wed, 22 Jun 2005 12:38:33 -0700
Subject: [NETFILTER]: Fix "iptables -D" rule deletion with ipt_CLUSTERIP
 target.

The patch just changes the order of structure members.

Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ipt_CLUSTERIP.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h
index baa83e757156..d9bceedfb3dc 100644
--- a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h
+++ b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h
@@ -18,7 +18,6 @@ struct clusterip_config;
 struct ipt_clusterip_tgt_info {
 
 	u_int32_t flags;
-	struct clusterip_config *config;
 	
 	/* only relevant for new ones */
 	u_int8_t clustermac[6];
@@ -27,6 +26,8 @@ struct ipt_clusterip_tgt_info {
 	u_int16_t local_nodes[CLUSTERIP_MAX_NODES];
 	enum clusterip_hashmode hash_mode;
 	u_int32_t hash_initval;
+
+	struct clusterip_config *config;
 };
 
 #endif /*_IPT_CLUSTERIP_H_target*/
-- 
cgit v1.2.3-59-g8ed1b


From 5ee0ed7d3ab620a764740fb018f469d45f561931 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:20 +0000
Subject: [PATCH] RPC: Make rpc_create_client() probe server for RPC
 program+version support

 Ensure that we don't create an RPC client without checking that the server
 does indeed support the RPC program + version that we are trying to set up.

 This enables us to immediately return an error to "mount" if it turns out
 that the server is only supporting NFSv2, when we requested NFSv3 or NFSv4.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/host.c             |  4 +--
 fs/lockd/mon.c              |  2 +-
 include/linux/sunrpc/clnt.h |  4 +++
 net/sunrpc/clnt.c           | 59 ++++++++++++++++++++++++++++++++++++++++++++-
 net/sunrpc/pmap_clnt.c      |  2 +-
 5 files changed, 66 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 90a62f27914c..82c77df81c5f 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -189,6 +189,8 @@ nlm_bind_host(struct nlm_host *host)
 			goto forgetit;
 
 		xprt_set_timeout(&xprt->timeout, 5, nlmsvc_timeout);
+		xprt->nocong = 1;	/* No congestion control for NLM */
+		xprt->resvport = 1;	/* NLM requires a reserved port */
 
 		/* Existing NLM servers accept AUTH_UNIX only */
 		clnt = rpc_create_client(xprt, host->h_name, &nlm_program,
@@ -196,8 +198,6 @@ nlm_bind_host(struct nlm_host *host)
 		if (IS_ERR(clnt))
 			goto forgetit;
 		clnt->cl_autobind = 1;	/* turn on pmap queries */
-		xprt->nocong = 1;	/* No congestion control for NLM */
-		xprt->resvport = 1;	/* NLM requires a reserved port */
 
 		host->h_rpcclnt = clnt;
 	}
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 81b5e7778d70..2d144abe84ad 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -115,6 +115,7 @@ nsm_create(void)
 	xprt = xprt_create_proto(IPPROTO_UDP, &sin, NULL);
 	if (IS_ERR(xprt))
 		return (struct rpc_clnt *)xprt;
+	xprt->resvport = 1;	/* NSM requires a reserved port */
 
 	clnt = rpc_create_client(xprt, "localhost",
 				&nsm_program, SM_VERSION,
@@ -124,7 +125,6 @@ nsm_create(void)
 	clnt->cl_softrtry = 1;
 	clnt->cl_chatty   = 1;
 	clnt->cl_oneshot  = 1;
-	xprt->resvport = 1;	/* NSM requires a reserved port */
 	return clnt;
 
 out_err:
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 2709caf4d128..d25e80f77ff5 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -111,6 +111,9 @@ struct rpc_procinfo {
 struct rpc_clnt *rpc_create_client(struct rpc_xprt *xprt, char *servname,
 				struct rpc_program *info,
 				u32 version, rpc_authflavor_t authflavor);
+struct rpc_clnt *rpc_new_client(struct rpc_xprt *xprt, char *servname,
+				struct rpc_program *info,
+				u32 version, rpc_authflavor_t authflavor);
 struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
 int		rpc_shutdown_client(struct rpc_clnt *);
 int		rpc_destroy_client(struct rpc_clnt *);
@@ -129,6 +132,7 @@ void		rpc_clnt_sigmask(struct rpc_clnt *clnt, sigset_t *oldset);
 void		rpc_clnt_sigunmask(struct rpc_clnt *clnt, sigset_t *oldset);
 void		rpc_setbufsize(struct rpc_clnt *, unsigned int, unsigned int);
 size_t		rpc_max_payload(struct rpc_clnt *);
+int		rpc_ping(struct rpc_clnt *clnt, int flags);
 
 static __inline__
 int rpc_call(struct rpc_clnt *clnt, u32 proc, void *argp, void *resp, int flags)
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 99515d7727a6..b36797ad8083 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -97,7 +97,7 @@ rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name)
  * made to sleep too long.
  */
 struct rpc_clnt *
-rpc_create_client(struct rpc_xprt *xprt, char *servname,
+rpc_new_client(struct rpc_xprt *xprt, char *servname,
 		  struct rpc_program *program, u32 vers,
 		  rpc_authflavor_t flavor)
 {
@@ -182,6 +182,36 @@ out_err:
 	return ERR_PTR(err);
 }
 
+/**
+ * rpc_create_client - Create an RPC client
+ * @xprt: pointer to xprt struct
+ * @servname: name of server
+ * @info: rpc_program
+ * @version: rpc_program version
+ * @authflavor: rpc_auth flavour to use
+ *
+ * Creates an RPC client structure, then pings the server in order to
+ * determine if it is up, and if it supports this program and version.
+ *
+ * This function should never be called by asynchronous tasks such as
+ * the portmapper.
+ */
+struct rpc_clnt *rpc_create_client(struct rpc_xprt *xprt, char *servname,
+		struct rpc_program *info, u32 version, rpc_authflavor_t authflavor)
+{
+	struct rpc_clnt *clnt;
+	int err;
+	
+	clnt = rpc_new_client(xprt, servname, info, version, authflavor);
+	if (IS_ERR(clnt))
+		return clnt;
+	err = rpc_ping(clnt, RPC_TASK_SOFT|RPC_TASK_NOINTR);
+	if (err == 0)
+		return clnt;
+	rpc_shutdown_client(clnt);
+	return ERR_PTR(err);
+}
+
 /*
  * This function clones the RPC client structure. It allows us to share the
  * same transport while varying parameters such as the authentication
@@ -1086,3 +1116,30 @@ out_overflow:
 	printk(KERN_WARNING "RPC %s: server reply was truncated.\n", __FUNCTION__);
 	goto out_retry;
 }
+
+static int rpcproc_encode_null(void *rqstp, u32 *data, void *obj)
+{
+	return 0;
+}
+
+static int rpcproc_decode_null(void *rqstp, u32 *data, void *obj)
+{
+	return 0;
+}
+
+static struct rpc_procinfo rpcproc_null = {
+	.p_encode = rpcproc_encode_null,
+	.p_decode = rpcproc_decode_null,
+};
+
+int rpc_ping(struct rpc_clnt *clnt, int flags)
+{
+	struct rpc_message msg = {
+		.rpc_proc = &rpcproc_null,
+	};
+	int err;
+	msg.rpc_cred = authnull_ops.lookup_cred(NULL, NULL, 0);
+	err = rpc_call_sync(clnt, &msg, flags);
+	put_rpccred(msg.rpc_cred);
+	return err;
+}
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index 97c420ff1ee0..df4d84c9020d 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -207,7 +207,7 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto)
 	xprt->addr.sin_port = htons(RPC_PMAP_PORT);
 
 	/* printk("pmap: create clnt\n"); */
-	clnt = rpc_create_client(xprt, hostname,
+	clnt = rpc_new_client(xprt, hostname,
 				&pmap_program, RPC_PMAP_VERSION,
 				RPC_AUTH_UNIX);
 	if (!IS_ERR(clnt)) {
-- 
cgit v1.2.3-59-g8ed1b


From 4ce79717ce32a9f88c1ddce4b9658556cb59d37a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:21 +0000
Subject: [PATCH] NFS: Header file cleanup...

 - Move NFSv4 state definitions into a private header file.
 - Clean up gunk in nfs_fs.h

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback.c      |   1 +
 fs/nfs/callback_proc.c |   1 +
 fs/nfs/callback_xdr.c  |   1 +
 fs/nfs/delegation.c    |   1 +
 fs/nfs/dir.c           |   1 +
 fs/nfs/idmap.c         |   1 +
 fs/nfs/inode.c         |   1 +
 fs/nfs/nfs4_fs.h       | 250 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/nfs4proc.c      |   5 +-
 fs/nfs/nfs4renewd.c    |   1 +
 fs/nfs/nfs4state.c     |  12 +--
 fs/nfs/nfs4xdr.c       |   8 +-
 include/linux/nfs_fs.h | 241 -----------------------------------------------
 13 files changed, 264 insertions(+), 260 deletions(-)
 create mode 100644 fs/nfs/nfs4_fs.h

(limited to 'include/linux')

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 560d6175dd58..f2ca782aba33 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -14,6 +14,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcsock.h>
 #include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
 #include "callback.h"
 
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index ece27e42b93b..65f1e19e4d19 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -8,6 +8,7 @@
 #include <linux/config.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
 #include "callback.h"
 #include "delegation.h"
 
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index d271df9df2b2..c99677ec58f8 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -10,6 +10,7 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
 #include "callback.h"
 
 #define CB_OP_TAGLEN_MAXSZ	(512)
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 5b9c60f97791..d7f7eb669d03 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -16,6 +16,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_xdr.h>
 
+#include "nfs4_fs.h"
 #include "delegation.h"
 
 static struct nfs_delegation *nfs_alloc_delegation(void)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ff6155f5e8d9..9ccb15e86967 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -32,6 +32,7 @@
 #include <linux/smp_lock.h>
 #include <linux/namei.h>
 
+#include "nfs4_fs.h"
 #include "delegation.h"
 
 #define NFS_PARANOIA 1
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 87f4f9aeac86..ffb8df91dc34 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -50,6 +50,7 @@
 #include <linux/nfs_fs.h>
 
 #include <linux/nfs_idmap.h>
+#include "nfs4_fs.h"
 
 #define IDMAP_HASH_SZ          128
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 32ddcf69e9ac..c80a81ff59c6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -39,6 +39,7 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 
+#include "nfs4_fs.h"
 #include "delegation.h"
 
 #define NFSDBG_FACILITY		NFSDBG_VFS
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
new file mode 100644
index 000000000000..85cf3bd36921
--- /dev/null
+++ b/fs/nfs/nfs4_fs.h
@@ -0,0 +1,250 @@
+/*
+ * linux/fs/nfs/nfs4_fs.h
+ *
+ * Copyright (C) 2005 Trond Myklebust
+ *
+ * NFSv4-specific filesystem definitions and declarations
+ */
+
+#ifndef __LINUX_FS_NFS_NFS4_FS_H
+#define __LINUX_FS_NFS_NFS4_FS_H
+
+#ifdef CONFIG_NFS_V4
+
+struct idmap;
+
+/*
+ * In a seqid-mutating op, this macro controls which error return
+ * values trigger incrementation of the seqid.
+ *
+ * from rfc 3010:
+ * The client MUST monotonically increment the sequence number for the
+ * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE
+ * operations.  This is true even in the event that the previous
+ * operation that used the sequence number received an error.  The only
+ * exception to this rule is if the previous operation received one of
+ * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID,
+ * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR,
+ * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE.
+ *
+ */
+#define seqid_mutating_err(err)       \
+(((err) != NFSERR_STALE_CLIENTID) &&  \
+ ((err) != NFSERR_STALE_STATEID)  &&  \
+ ((err) != NFSERR_BAD_STATEID)    &&  \
+ ((err) != NFSERR_BAD_SEQID)      &&  \
+ ((err) != NFSERR_BAD_XDR)        &&  \
+ ((err) != NFSERR_RESOURCE)       &&  \
+ ((err) != NFSERR_NOFILEHANDLE))
+
+enum nfs4_client_state {
+	NFS4CLNT_OK  = 0,
+};
+
+/*
+ * The nfs4_client identifies our client state to the server.
+ */
+struct nfs4_client {
+	struct list_head	cl_servers;	/* Global list of servers */
+	struct in_addr		cl_addr;	/* Server identifier */
+	u64			cl_clientid;	/* constant */
+	nfs4_verifier		cl_confirm;
+	unsigned long		cl_state;
+
+	u32			cl_lockowner_id;
+
+	/*
+	 * The following rwsem ensures exclusive access to the server
+	 * while we recover the state following a lease expiration.
+	 */
+	struct rw_semaphore	cl_sem;
+
+	struct list_head	cl_delegations;
+	struct list_head	cl_state_owners;
+	struct list_head	cl_unused;
+	int			cl_nunused;
+	spinlock_t		cl_lock;
+	atomic_t		cl_count;
+
+	struct rpc_clnt *	cl_rpcclient;
+	struct rpc_cred *	cl_cred;
+
+	struct list_head	cl_superblocks;	/* List of nfs_server structs */
+
+	unsigned long		cl_lease_time;
+	unsigned long		cl_last_renewal;
+	struct work_struct	cl_renewd;
+	struct work_struct	cl_recoverd;
+
+	wait_queue_head_t	cl_waitq;
+	struct rpc_wait_queue	cl_rpcwaitq;
+
+	/* used for the setclientid verifier */
+	struct timespec		cl_boot_time;
+
+	/* idmapper */
+	struct idmap *		cl_idmap;
+
+	/* Our own IP address, as a null-terminated string.
+	 * This is used to generate the clientid, and the callback address.
+	 */
+	char			cl_ipaddr[16];
+	unsigned char		cl_id_uniquifier;
+};
+
+/*
+ * NFS4 state_owners and lock_owners are simply labels for ordered
+ * sequences of RPC calls. Their sole purpose is to provide once-only
+ * semantics by allowing the server to identify replayed requests.
+ *
+ * The ->so_sema is held during all state_owner seqid-mutating operations:
+ * OPEN, OPEN_DOWNGRADE, and CLOSE. Its purpose is to properly serialize
+ * so_seqid.
+ */
+struct nfs4_state_owner {
+	struct list_head     so_list;	 /* per-clientid list of state_owners */
+	struct nfs4_client   *so_client;
+	u32                  so_id;      /* 32-bit identifier, unique */
+	struct semaphore     so_sema;
+	u32                  so_seqid;   /* protected by so_sema */
+	atomic_t	     so_count;
+
+	struct rpc_cred	     *so_cred;	 /* Associated cred */
+	struct list_head     so_states;
+	struct list_head     so_delegations;
+};
+
+/*
+ * struct nfs4_state maintains the client-side state for a given
+ * (state_owner,inode) tuple (OPEN) or state_owner (LOCK).
+ *
+ * OPEN:
+ * In order to know when to OPEN_DOWNGRADE or CLOSE the state on the server,
+ * we need to know how many files are open for reading or writing on a
+ * given inode. This information too is stored here.
+ *
+ * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
+ */
+
+struct nfs4_lock_state {
+	struct list_head	ls_locks;	/* Other lock stateids */
+	fl_owner_t		ls_owner;	/* POSIX lock owner */
+#define NFS_LOCK_INITIALIZED 1
+	int			ls_flags;
+	u32			ls_seqid;
+	u32			ls_id;
+	nfs4_stateid		ls_stateid;
+	atomic_t		ls_count;
+};
+
+/* bits for nfs4_state->flags */
+enum {
+	LK_STATE_IN_USE,
+	NFS_DELEGATED_STATE,
+};
+
+struct nfs4_state {
+	struct list_head open_states;	/* List of states for the same state_owner */
+	struct list_head inode_states;	/* List of states for the same inode */
+	struct list_head lock_states;	/* List of subservient lock stateids */
+
+	struct nfs4_state_owner *owner;	/* Pointer to the open owner */
+	struct inode *inode;		/* Pointer to the inode */
+
+	unsigned long flags;		/* Do we hold any locks? */
+	struct semaphore lock_sema;	/* Serializes file locking operations */
+	rwlock_t state_lock;		/* Protects the lock_states list */
+
+	nfs4_stateid stateid;
+
+	unsigned int nreaders;
+	unsigned int nwriters;
+	int state;			/* State on the server (R,W, or RW) */
+	atomic_t count;
+};
+
+
+struct nfs4_exception {
+	long timeout;
+	int retry;
+};
+
+struct nfs4_state_recovery_ops {
+	int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
+	int (*recover_lock)(struct nfs4_state *, struct file_lock *);
+};
+
+extern struct dentry_operations nfs4_dentry_operations;
+extern struct inode_operations nfs4_dir_inode_operations;
+
+/* nfs4proc.c */
+extern int nfs4_map_errors(int err);
+extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short);
+extern int nfs4_proc_setclientid_confirm(struct nfs4_client *);
+extern int nfs4_proc_async_renew(struct nfs4_client *);
+extern int nfs4_proc_renew(struct nfs4_client *);
+extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode);
+extern struct inode *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
+extern int nfs4_open_revalidate(struct inode *, struct dentry *, int);
+
+extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
+extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
+
+extern const u32 nfs4_fattr_bitmap[2];
+extern const u32 nfs4_statfs_bitmap[2];
+extern const u32 nfs4_pathconf_bitmap[2];
+extern const u32 nfs4_fsinfo_bitmap[2];
+
+/* nfs4renewd.c */
+extern void nfs4_schedule_state_renewal(struct nfs4_client *);
+extern void nfs4_renewd_prepare_shutdown(struct nfs_server *);
+extern void nfs4_kill_renewd(struct nfs4_client *);
+extern void nfs4_renew_state(void *);
+
+/* nfs4state.c */
+extern void init_nfsv4_state(struct nfs_server *);
+extern void destroy_nfsv4_state(struct nfs_server *);
+extern struct nfs4_client *nfs4_get_client(struct in_addr *);
+extern void nfs4_put_client(struct nfs4_client *clp);
+extern int nfs4_init_client(struct nfs4_client *clp);
+extern struct nfs4_client *nfs4_find_client(struct in_addr *);
+extern u32 nfs4_alloc_lockowner_id(struct nfs4_client *);
+
+extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
+extern void nfs4_put_state_owner(struct nfs4_state_owner *);
+extern void nfs4_drop_state_owner(struct nfs4_state_owner *);
+extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
+extern void nfs4_put_open_state(struct nfs4_state *);
+extern void nfs4_close_state(struct nfs4_state *, mode_t);
+extern struct nfs4_state *nfs4_find_state(struct inode *, struct rpc_cred *, mode_t mode);
+extern void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp);
+extern void nfs4_schedule_state_recovery(struct nfs4_client *);
+extern struct nfs4_lock_state *nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t);
+extern struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t);
+extern void nfs4_put_lock_state(struct nfs4_lock_state *state);
+extern void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *ls);
+extern void nfs4_notify_setlk(struct nfs4_state *, struct file_lock *, struct nfs4_lock_state *);
+extern void nfs4_notify_unlck(struct nfs4_state *, struct file_lock *, struct nfs4_lock_state *);
+extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
+
+extern const nfs4_stateid zero_stateid;
+
+/* nfs4xdr.c */
+extern uint32_t *nfs4_decode_dirent(uint32_t *p, struct nfs_entry *entry, int plus);
+extern struct rpc_procinfo nfs4_procedures[];
+
+struct nfs4_mount_data;
+
+/* callback_xdr.c */
+extern struct svc_version nfs4_callback_version1;
+
+#else
+
+#define init_nfsv4_state(server)  do { } while (0)
+#define destroy_nfsv4_state(server)       do { } while (0)
+#define nfs4_put_state_owner(inode, owner) do { } while (0)
+#define nfs4_put_open_state(state) do { } while (0)
+#define nfs4_close_state(a, b) do { } while (0)
+
+#endif /* CONFIG_NFS_V4 */
+#endif /* __LINUX_FS_NFS_NFS4_FS_H */
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1d5cb3e80c3e..a69c02b206c1 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -48,6 +48,7 @@
 #include <linux/smp_lock.h>
 #include <linux/namei.h>
 
+#include "nfs4_fs.h"
 #include "delegation.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
@@ -62,8 +63,6 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
 extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus);
 extern struct rpc_procinfo nfs4_procedures[];
 
-extern nfs4_stateid zero_stateid;
-
 /* Prevent leaks of NFSv4 errors into userland */
 int nfs4_map_errors(int err)
 {
@@ -104,7 +103,7 @@ const u32 nfs4_statfs_bitmap[2] = {
 	| FATTR4_WORD1_SPACE_TOTAL
 };
 
-u32 nfs4_pathconf_bitmap[2] = {
+const u32 nfs4_pathconf_bitmap[2] = {
 	FATTR4_WORD0_MAXLINK
 	| FATTR4_WORD0_MAXNAME,
 	0
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 667e06f1c647..a3001628ad32 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -53,6 +53,7 @@
 #include <linux/nfs.h>
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
+#include "nfs4_fs.h"
 
 #define NFSDBG_FACILITY	NFSDBG_PROC
 
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 231cebce3c87..17b187f2d776 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -46,24 +46,18 @@
 #include <linux/workqueue.h>
 #include <linux/bitops.h>
 
+#include "nfs4_fs.h"
 #include "callback.h"
 #include "delegation.h"
 
 #define OPENOWNER_POOL_SIZE	8
 
-static DEFINE_SPINLOCK(state_spinlock);
-
-nfs4_stateid zero_stateid;
-
-#if 0
-nfs4_stateid one_stateid =
-	{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
-#endif
+const nfs4_stateid zero_stateid;
 
+static DEFINE_SPINLOCK(state_spinlock);
 static LIST_HEAD(nfs4_clientid_list);
 
 static void nfs4_recover_state(void *);
-extern void nfs4_renew_state(void *);
 
 void
 init_nfsv4_state(struct nfs_server *server)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 5f4de05763c9..e86406eff0eb 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -51,6 +51,7 @@
 #include <linux/nfs4.h>
 #include <linux/nfs_fs.h>
 #include <linux/nfs_idmap.h>
+#include "nfs4_fs.h"
 
 #define NFSDBG_FACILITY		NFSDBG_XDR
 
@@ -660,8 +661,6 @@ static int encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1
 
 static int encode_getfattr(struct xdr_stream *xdr, const u32* bitmask)
 {
-	extern u32 nfs4_fattr_bitmap[];
-
 	return encode_getattr_two(xdr,
 			bitmask[0] & nfs4_fattr_bitmap[0],
 			bitmask[1] & nfs4_fattr_bitmap[1]);
@@ -669,8 +668,6 @@ static int encode_getfattr(struct xdr_stream *xdr, const u32* bitmask)
 
 static int encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask)
 {
-	extern u32 nfs4_fsinfo_bitmap[];
-
 	return encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0],
 			bitmask[1] & nfs4_fsinfo_bitmap[1]);
 }
@@ -969,7 +966,6 @@ static int encode_putrootfh(struct xdr_stream *xdr)
 
 static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
 {
-	extern nfs4_stateid zero_stateid;
 	nfs4_stateid stateid;
 	uint32_t *p;
 
@@ -1697,7 +1693,6 @@ static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, uint32_t *p, struct nfs4_fs
  */
 static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, uint32_t *p, const struct nfs4_pathconf_arg *args)
 {
-	extern u32 nfs4_pathconf_bitmap[2];
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops = 2,
@@ -1718,7 +1713,6 @@ static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, uint32_t *p, const struct
  */
 static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, uint32_t *p, const struct nfs4_statfs_arg *args)
 {
-	extern u32 nfs4_statfs_bitmap[];
 	struct xdr_stream xdr;
 	struct compound_hdr hdr = {
 		.nops = 2,
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index dbac7f363e5d..fb33e7655cfa 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -15,7 +15,6 @@
 #include <linux/pagemap.h>
 #include <linux/rwsem.h>
 #include <linux/wait.h>
-#include <linux/uio.h>
 
 #include <linux/nfs_fs_sb.h>
 
@@ -29,7 +28,6 @@
 #include <linux/nfs4.h>
 #include <linux/nfs_xdr.h>
 #include <linux/rwsem.h>
-#include <linux/workqueue.h>
 #include <linux/mempool.h>
 
 /*
@@ -43,13 +41,6 @@
 #define NFS_MAX_FILE_IO_BUFFER_SIZE	32768
 #define NFS_DEF_FILE_IO_BUFFER_SIZE	4096
 
-/*
- * The upper limit on timeouts for the exponential backoff algorithm.
- */
-#define NFS_WRITEBACK_DELAY		(5*HZ)
-#define NFS_WRITEBACK_LOCKDELAY		(60*HZ)
-#define NFS_COMMIT_DELAY		(5*HZ)
-
 /*
  * superblock magic number for NFS
  */
@@ -60,9 +51,6 @@
  */
 #define NFS_RPC_SWAPFLAGS		(RPC_TASK_SWAPPER|RPC_TASK_ROOTCREDS)
 
-#define NFS_RW_SYNC		0x0001	/* O_SYNC handling */
-#define NFS_RW_SWAP		0x0002	/* This is a swap request */
-
 /*
  * When flushing a cluster of dirty pages, there can be different
  * strategies:
@@ -434,11 +422,6 @@ static inline void nfs_writedata_free(struct nfs_write_data *p)
 	mempool_free(p, nfs_wdata_mempool);
 }
 
-/* Hack for future NFS swap support */
-#ifndef IS_SWAPFILE
-# define IS_SWAPFILE(inode)	(0)
-#endif
-
 /*
  * linux/fs/nfs/read.c
  */
@@ -515,230 +498,6 @@ extern void * nfs_root_data(void);
 
 #define NFS_JUKEBOX_RETRY_TIME (5 * HZ)
 
-#ifdef CONFIG_NFS_V4
-
-struct idmap;
-
-/*
- * In a seqid-mutating op, this macro controls which error return
- * values trigger incrementation of the seqid.
- *
- * from rfc 3010:
- * The client MUST monotonically increment the sequence number for the
- * CLOSE, LOCK, LOCKU, OPEN, OPEN_CONFIRM, and OPEN_DOWNGRADE
- * operations.  This is true even in the event that the previous
- * operation that used the sequence number received an error.  The only
- * exception to this rule is if the previous operation received one of
- * the following errors: NFSERR_STALE_CLIENTID, NFSERR_STALE_STATEID,
- * NFSERR_BAD_STATEID, NFSERR_BAD_SEQID, NFSERR_BADXDR,
- * NFSERR_RESOURCE, NFSERR_NOFILEHANDLE.
- *
- */
-#define seqid_mutating_err(err)       \
-(((err) != NFSERR_STALE_CLIENTID) &&  \
- ((err) != NFSERR_STALE_STATEID)  &&  \
- ((err) != NFSERR_BAD_STATEID)    &&  \
- ((err) != NFSERR_BAD_SEQID)      &&  \
- ((err) != NFSERR_BAD_XDR)        &&  \
- ((err) != NFSERR_RESOURCE)       &&  \
- ((err) != NFSERR_NOFILEHANDLE))
-
-enum nfs4_client_state {
-	NFS4CLNT_OK  = 0,
-};
-
-/*
- * The nfs4_client identifies our client state to the server.
- */
-struct nfs4_client {
-	struct list_head	cl_servers;	/* Global list of servers */
-	struct in_addr		cl_addr;	/* Server identifier */
-	u64			cl_clientid;	/* constant */
-	nfs4_verifier		cl_confirm;
-	unsigned long		cl_state;
-
-	u32			cl_lockowner_id;
-
-	/*
-	 * The following rwsem ensures exclusive access to the server
-	 * while we recover the state following a lease expiration.
-	 */
-	struct rw_semaphore	cl_sem;
-
-	struct list_head	cl_delegations;
-	struct list_head	cl_state_owners;
-	struct list_head	cl_unused;
-	int			cl_nunused;
-	spinlock_t		cl_lock;
-	atomic_t		cl_count;
-
-	struct rpc_clnt *	cl_rpcclient;
-	struct rpc_cred *	cl_cred;
-
-	struct list_head	cl_superblocks;	/* List of nfs_server structs */
-
-	unsigned long		cl_lease_time;
-	unsigned long		cl_last_renewal;
-	struct work_struct	cl_renewd;
-	struct work_struct	cl_recoverd;
-
-	wait_queue_head_t	cl_waitq;
-	struct rpc_wait_queue	cl_rpcwaitq;
-
-	/* used for the setclientid verifier */
-	struct timespec		cl_boot_time;
-
-	/* idmapper */
-	struct idmap *		cl_idmap;
-
-	/* Our own IP address, as a null-terminated string.
-	 * This is used to generate the clientid, and the callback address.
-	 */
-	char			cl_ipaddr[16];
-	unsigned char		cl_id_uniquifier;
-};
-
-/*
- * NFS4 state_owners and lock_owners are simply labels for ordered
- * sequences of RPC calls. Their sole purpose is to provide once-only
- * semantics by allowing the server to identify replayed requests.
- *
- * The ->so_sema is held during all state_owner seqid-mutating operations:
- * OPEN, OPEN_DOWNGRADE, and CLOSE. Its purpose is to properly serialize
- * so_seqid.
- */
-struct nfs4_state_owner {
-	struct list_head     so_list;	 /* per-clientid list of state_owners */
-	struct nfs4_client   *so_client;
-	u32                  so_id;      /* 32-bit identifier, unique */
-	struct semaphore     so_sema;
-	u32                  so_seqid;   /* protected by so_sema */
-	atomic_t	     so_count;
-
-	struct rpc_cred	     *so_cred;	 /* Associated cred */
-	struct list_head     so_states;
-	struct list_head     so_delegations;
-};
-
-/*
- * struct nfs4_state maintains the client-side state for a given
- * (state_owner,inode) tuple (OPEN) or state_owner (LOCK).
- *
- * OPEN:
- * In order to know when to OPEN_DOWNGRADE or CLOSE the state on the server,
- * we need to know how many files are open for reading or writing on a
- * given inode. This information too is stored here.
- *
- * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN)
- */
-
-struct nfs4_lock_state {
-	struct list_head	ls_locks;	/* Other lock stateids */
-	fl_owner_t		ls_owner;	/* POSIX lock owner */
-#define NFS_LOCK_INITIALIZED 1
-	int			ls_flags;
-	u32			ls_seqid;
-	u32			ls_id;
-	nfs4_stateid		ls_stateid;
-	atomic_t		ls_count;
-};
-
-/* bits for nfs4_state->flags */
-enum {
-	LK_STATE_IN_USE,
-	NFS_DELEGATED_STATE,
-};
-
-struct nfs4_state {
-	struct list_head open_states;	/* List of states for the same state_owner */
-	struct list_head inode_states;	/* List of states for the same inode */
-	struct list_head lock_states;	/* List of subservient lock stateids */
-
-	struct nfs4_state_owner *owner;	/* Pointer to the open owner */
-	struct inode *inode;		/* Pointer to the inode */
-
-	unsigned long flags;		/* Do we hold any locks? */
-	struct semaphore lock_sema;	/* Serializes file locking operations */
-	rwlock_t state_lock;		/* Protects the lock_states list */
-
-	nfs4_stateid stateid;
-
-	unsigned int nreaders;
-	unsigned int nwriters;
-	int state;			/* State on the server (R,W, or RW) */
-	atomic_t count;
-};
-
-
-struct nfs4_exception {
-	long timeout;
-	int retry;
-};
-
-struct nfs4_state_recovery_ops {
-	int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
-	int (*recover_lock)(struct nfs4_state *, struct file_lock *);
-};
-
-extern struct dentry_operations nfs4_dentry_operations;
-extern struct inode_operations nfs4_dir_inode_operations;
-
-/* nfs4proc.c */
-extern int nfs4_map_errors(int err);
-extern int nfs4_proc_setclientid(struct nfs4_client *, u32, unsigned short);
-extern int nfs4_proc_setclientid_confirm(struct nfs4_client *);
-extern int nfs4_proc_async_renew(struct nfs4_client *);
-extern int nfs4_proc_renew(struct nfs4_client *);
-extern int nfs4_do_close(struct inode *inode, struct nfs4_state *state, mode_t mode);
-extern struct inode *nfs4_atomic_open(struct inode *, struct dentry *, struct nameidata *);
-extern int nfs4_open_revalidate(struct inode *, struct dentry *, int);
-
-extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
-extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
-
-/* nfs4renewd.c */
-extern void nfs4_schedule_state_renewal(struct nfs4_client *);
-extern void nfs4_renewd_prepare_shutdown(struct nfs_server *);
-extern void nfs4_kill_renewd(struct nfs4_client *);
-
-/* nfs4state.c */
-extern void init_nfsv4_state(struct nfs_server *);
-extern void destroy_nfsv4_state(struct nfs_server *);
-extern struct nfs4_client *nfs4_get_client(struct in_addr *);
-extern void nfs4_put_client(struct nfs4_client *clp);
-extern int nfs4_init_client(struct nfs4_client *clp);
-extern struct nfs4_client *nfs4_find_client(struct in_addr *);
-extern u32 nfs4_alloc_lockowner_id(struct nfs4_client *);
-
-extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
-extern void nfs4_put_state_owner(struct nfs4_state_owner *);
-extern void nfs4_drop_state_owner(struct nfs4_state_owner *);
-extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
-extern void nfs4_put_open_state(struct nfs4_state *);
-extern void nfs4_close_state(struct nfs4_state *, mode_t);
-extern struct nfs4_state *nfs4_find_state(struct inode *, struct rpc_cred *, mode_t mode);
-extern void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp);
-extern void nfs4_schedule_state_recovery(struct nfs4_client *);
-extern struct nfs4_lock_state *nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t);
-extern struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t);
-extern void nfs4_put_lock_state(struct nfs4_lock_state *state);
-extern void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *ls);
-extern void nfs4_notify_setlk(struct nfs4_state *, struct file_lock *, struct nfs4_lock_state *);
-extern void nfs4_notify_unlck(struct nfs4_state *, struct file_lock *, struct nfs4_lock_state *);
-extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
-
-
-
-struct nfs4_mount_data;
-#else
-#define init_nfsv4_state(server)  do { } while (0)
-#define destroy_nfsv4_state(server)       do { } while (0)
-#define nfs4_put_state_owner(inode, owner) do { } while (0)
-#define nfs4_put_open_state(state) do { } while (0)
-#define nfs4_close_state(a, b) do { } while (0)
-#define nfs4_renewd_prepare_shutdown(server) do { } while (0)
-#endif
-
 #endif /* __KERNEL__ */
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From a656db998785324a818005bcf71bae6dcbbb3cf5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:21 +0000
Subject: [PATCH] NFS: Remove unused NFS inode field readdir_timestamp.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c           | 8 +++-----
 include/linux/nfs_fs.h | 1 -
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 9ccb15e86967..dffa21abd3ea 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -165,12 +165,10 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 	NFS_FLAGS(inode) |= NFS_INO_INVALID_ATIME;
 	/* Ensure consistent page alignment of the data.
 	 * Note: assumes we have exclusive access to this mapping either
-	 *	 throught inode->i_sem or some other mechanism.
+	 *	 through inode->i_sem or some other mechanism.
 	 */
-	if (page->index == 0) {
-		invalidate_inode_pages(inode->i_mapping);
-		NFS_I(inode)->readdir_timestamp = timestamp;
-	}
+	if (page->index == 0)
+		invalidate_inode_pages2_range(inode->i_mapping, PAGE_CACHE_SIZE, -1);
 	unlock_page(page);
 	return 0;
  error:
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index fb33e7655cfa..68d5aae89972 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -128,7 +128,6 @@ struct nfs_inode {
 	 *
 	 *	mtime != read_cache_mtime
 	 */
-	unsigned long		readdir_timestamp;
 	unsigned long		read_cache_jiffies;
 	unsigned long		attrtimeo;
 	unsigned long		attrtimeo_timestamp;
-- 
cgit v1.2.3-59-g8ed1b


From 96651ab341cde0fee940ec837f323d711cbfa7d5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:21 +0000
Subject: [PATCH] RPC: Shrink struct rpc_task by switching to wait_on_bit()

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/sched.h |  1 -
 net/sunrpc/sched.c           | 31 ++++++++++++++++++-------------
 2 files changed, 18 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 99d17ed7cebb..4d77e90d0b30 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -31,7 +31,6 @@ struct rpc_wait_queue;
 struct rpc_wait {
 	struct list_head	list;		/* wait queue links */
 	struct list_head	links;		/* Links to related tasks */
-	wait_queue_head_t	waitq;		/* sync: sleep on this q */
 	struct rpc_wait_queue *	rpc_waitq;	/* RPC wait queue we're on */
 };
 
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index cc298fa4b81d..2d9eb7fbd521 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -290,7 +290,7 @@ static void rpc_make_runnable(struct rpc_task *task)
 			return;
 		}
 	} else
-		wake_up(&task->u.tk_wait.waitq);
+		wake_up_bit(&task->tk_runstate, RPC_TASK_QUEUED);
 }
 
 /*
@@ -578,6 +578,14 @@ static inline int __rpc_do_exit(struct rpc_task *task)
 	return 1;
 }
 
+static int rpc_wait_bit_interruptible(void *word)
+{
+	if (signal_pending(current))
+		return -ERESTARTSYS;
+	schedule();
+	return 0;
+}
+
 /*
  * This is the RPC `scheduler' (or rather, the finite state machine).
  */
@@ -648,22 +656,21 @@ static int __rpc_execute(struct rpc_task *task)
 
 		/* sync task: sleep here */
 		dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid);
-		if (RPC_TASK_UNINTERRUPTIBLE(task)) {
-			__wait_event(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task));
-		} else {
-			__wait_event_interruptible(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task), status);
+		/* Note: Caller should be using rpc_clnt_sigmask() */
+		status = out_of_line_wait_on_bit(&task->tk_runstate,
+				RPC_TASK_QUEUED, rpc_wait_bit_interruptible,
+				TASK_INTERRUPTIBLE);
+		if (status == -ERESTARTSYS) {
 			/*
 			 * When a sync task receives a signal, it exits with
 			 * -ERESTARTSYS. In order to catch any callbacks that
 			 * clean up after sleeping on some queue, we don't
 			 * break the loop here, but go around once more.
 			 */
-			if (status == -ERESTARTSYS) {
-				dprintk("RPC: %4d got signal\n", task->tk_pid);
-				task->tk_flags |= RPC_TASK_KILLED;
-				rpc_exit(task, -ERESTARTSYS);
-				rpc_wake_up_task(task);
-			}
+			dprintk("RPC: %4d got signal\n", task->tk_pid);
+			task->tk_flags |= RPC_TASK_KILLED;
+			rpc_exit(task, -ERESTARTSYS);
+			rpc_wake_up_task(task);
 		}
 		rpc_set_running(task);
 		dprintk("RPC: %4d sync task resuming\n", task->tk_pid);
@@ -766,8 +773,6 @@ void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, rpc_action call
 
 	/* Initialize workqueue for async tasks */
 	task->tk_workqueue = rpciod_workqueue;
-	if (!RPC_IS_ASYNC(task))
-		init_waitqueue_head(&task->u.tk_wait.waitq);
 
 	if (clnt) {
 		atomic_inc(&clnt->cl_users);
-- 
cgit v1.2.3-59-g8ed1b


From 464a98bd70bae8c559cfc82af799faf44824ce64 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:21 +0000
Subject: [PATCH] NFS: cleanup: shrink struct nfs_open_context

 Remove the wait queue, and replace the functions that depended on it
 with wait_on_bit().

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         |  1 -
 fs/nfs/pagelist.c      | 35 ++++++++++++++++++++++++++++-------
 include/linux/nfs_fs.h |  1 -
 3 files changed, 28 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c80a81ff59c6..a38d4b22d1f8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -848,7 +848,6 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rp
 		ctx->state = NULL;
 		ctx->lockowner = current->files;
 		ctx->error = 0;
-		init_waitqueue_head(&ctx->waitq);
 	}
 	return ctx;
 }
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 4f1ba723848d..80777f99a58a 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -107,7 +107,7 @@ void nfs_unlock_request(struct nfs_page *req)
 	smp_mb__before_clear_bit();
 	clear_bit(PG_BUSY, &req->wb_flags);
 	smp_mb__after_clear_bit();
-	wake_up_all(&req->wb_context->waitq);
+	wake_up_bit(&req->wb_flags, PG_BUSY);
 	nfs_release_request(req);
 }
 
@@ -180,6 +180,17 @@ nfs_list_add_request(struct nfs_page *req, struct list_head *head)
 	req->wb_list_head = head;
 }
 
+static int nfs_wait_bit_interruptible(void *word)
+{
+	int ret = 0;
+
+	if (signal_pending(current))
+		ret = -ERESTARTSYS;
+	else
+		schedule();
+	return ret;
+}
+
 /**
  * nfs_wait_on_request - Wait for a request to complete.
  * @req: request to wait upon.
@@ -190,12 +201,22 @@ nfs_list_add_request(struct nfs_page *req, struct list_head *head)
 int
 nfs_wait_on_request(struct nfs_page *req)
 {
-	struct inode	*inode = req->wb_context->dentry->d_inode;
-        struct rpc_clnt	*clnt = NFS_CLIENT(inode);
-
-	if (!NFS_WBACK_BUSY(req))
-		return 0;
-	return nfs_wait_event(clnt, req->wb_context->waitq, !NFS_WBACK_BUSY(req));
+        struct rpc_clnt	*clnt = NFS_CLIENT(req->wb_context->dentry->d_inode);
+	sigset_t oldmask;
+	int ret = 0;
+
+	if (!test_bit(PG_BUSY, &req->wb_flags))
+		goto out;
+	/*
+	 * Note: the call to rpc_clnt_sigmask() suffices to ensure that we
+	 *	 are not interrupted if intr flag is not set
+	 */
+	rpc_clnt_sigmask(clnt, &oldmask);
+	ret = out_of_line_wait_on_bit(&req->wb_flags, PG_BUSY,
+			nfs_wait_bit_interruptible, TASK_INTERRUPTIBLE);
+	rpc_clnt_sigunmask(clnt, &oldmask);
+out:
+	return ret;
 }
 
 /**
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 68d5aae89972..0b01b96337f8 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -84,7 +84,6 @@ struct nfs_open_context {
 	int error;
 
 	struct list_head list;
-	wait_queue_head_t waitq;
 };
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 92cfc62cb8412c9563860b1bf70cd4701f03092e Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 22 Jun 2005 17:16:22 +0000
Subject: [PATCH] NFS: Allow NFS versions to support different sets of inode
 operations.

 ACL support will require supporting additional inode operations in v4
 (getxattr, setxattr, listxattr).  This patch allows different protocol versions
 to support different inode operations by adding a file_inode_ops to the
 nfs_rpc_ops (to match the existing dir_inode_ops).

 Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c          | 2 +-
 fs/nfs/nfs3proc.c       | 1 +
 fs/nfs/nfs4proc.c       | 1 +
 fs/nfs/proc.c           | 1 +
 include/linux/nfs_xdr.h | 1 +
 5 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index a38d4b22d1f8..a82f0340744f 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -686,7 +686,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 		/* Why so? Because we want revalidate for devices/FIFOs, and
 		 * that's precisely what we have in nfs_file_inode_operations.
 		 */
-		inode->i_op = &nfs_file_inode_operations;
+		inode->i_op = NFS_SB(sb)->rpc_ops->file_inode_ops;
 		if (S_ISREG(inode->i_mode)) {
 			inode->i_fop = &nfs_file_operations;
 			inode->i_data.a_ops = &nfs_file_aops;
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 3878494dfc2c..53953a775714 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -826,6 +826,7 @@ struct nfs_rpc_ops	nfs_v3_clientops = {
 	.version	= 3,			/* protocol version */
 	.dentry_ops	= &nfs_dentry_operations,
 	.dir_inode_ops	= &nfs_dir_inode_operations,
+	.file_inode_ops	= &nfs_file_inode_operations,
 	.getroot	= nfs3_proc_get_root,
 	.getattr	= nfs3_proc_getattr,
 	.setattr	= nfs3_proc_setattr,
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a69c02b206c1..a5a8cb3159a0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2746,6 +2746,7 @@ struct nfs_rpc_ops	nfs_v4_clientops = {
 	.version	= 4,			/* protocol version */
 	.dentry_ops	= &nfs4_dentry_operations,
 	.dir_inode_ops	= &nfs4_dir_inode_operations,
+	.file_inode_ops	= &nfs_file_inode_operations,
 	.getroot	= nfs4_proc_get_root,
 	.getattr	= nfs4_proc_getattr,
 	.setattr	= nfs4_proc_setattr,
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index d31b4d6e5a5e..cedf636bcf3c 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -622,6 +622,7 @@ struct nfs_rpc_ops	nfs_v2_clientops = {
 	.version	= 2,		       /* protocol version */
 	.dentry_ops	= &nfs_dentry_operations,
 	.dir_inode_ops	= &nfs_dir_inode_operations,
+	.file_inode_ops	= &nfs_file_inode_operations,
 	.getroot	= nfs_proc_get_root,
 	.getattr	= nfs_proc_getattr,
 	.setattr	= nfs_proc_setattr,
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 47037d9521cb..5b45bafd9db5 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -667,6 +667,7 @@ struct nfs_rpc_ops {
 	int	version;		/* Protocol version */
 	struct dentry_operations *dentry_ops;
 	struct inode_operations *dir_inode_ops;
+	struct inode_operations *file_inode_ops;
 
 	int	(*getroot) (struct nfs_server *, struct nfs_fh *,
 			    struct nfs_fsinfo *);
-- 
cgit v1.2.3-59-g8ed1b


From ada70d9425bcc5e376fef8591e4e76e204c0834c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:22 +0000
Subject: [PATCH] NFS: Add hooks to allow common NFS attribute code to clear
 cached acls

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c          | 33 ++++++++++++++++++++++++++-------
 include/linux/nfs_fs.h  |  1 +
 include/linux/nfs_xdr.h |  1 +
 3 files changed, 28 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index a82f0340744f..c45bd52cc1d7 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -64,6 +64,7 @@ static void nfs_clear_inode(struct inode *);
 static void nfs_umount_begin(struct super_block *);
 static int  nfs_statfs(struct super_block *, struct kstatfs *);
 static int  nfs_show_options(struct seq_file *, struct vfsmount *);
+static void nfs_zap_acl_cache(struct inode *);
 
 static struct rpc_program	nfs_program;
 
@@ -153,6 +154,7 @@ nfs_clear_inode(struct inode *inode)
 
 	nfs_wb_all(inode);
 	BUG_ON (!list_empty(&nfsi->open_files));
+	nfs_zap_acl_cache(inode);
 	cred = nfsi->cache_access.cred;
 	if (cred)
 		put_rpccred(cred);
@@ -587,9 +589,19 @@ nfs_zap_caches(struct inode *inode)
 
 	memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
 	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
-		nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS;
+		nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 	else
-		nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS;
+		nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+}
+
+static void nfs_zap_acl_cache(struct inode *inode)
+{
+	void (*clear_acl_cache)(struct inode *);
+
+	clear_acl_cache = NFS_PROTO(inode)->clear_acl_cache;
+	if (clear_acl_cache != NULL)
+		clear_acl_cache(inode);
+	NFS_I(inode)->flags &= ~NFS_INO_INVALID_ACL;
 }
 
 /*
@@ -789,7 +801,7 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 		}
 	}
 	if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
-		NFS_FLAGS(inode) |= NFS_INO_INVALID_ACCESS;
+		NFS_FLAGS(inode) |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 	nfs_end_data_update(inode);
 	unlock_kernel();
 	return error;
@@ -1033,6 +1045,8 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 		/* This ensures we revalidate dentries */
 		nfsi->cache_change_attribute++;
 	}
+	if (flags & NFS_INO_INVALID_ACL)
+		nfs_zap_acl_cache(inode);
 	dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n",
 		inode->i_sb->s_id,
 		(long long)NFS_FILEID(inode));
@@ -1183,7 +1197,7 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 	if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)
 			|| inode->i_uid != fattr->uid
 			|| inode->i_gid != fattr->gid)
-		nfsi->flags |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS;
+		nfsi->flags |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
 
 	/* Has the link count changed? */
 	if (inode->i_nlink != fattr->nlink)
@@ -1292,16 +1306,21 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
 #endif
 		nfsi->change_attr = fattr->change_attr;
 		if (!data_unstable)
-			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS;
+			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 	}
 
-	memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+	/* If ctime has changed we should definitely clear access+acl caches */
+	if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
+		if (!data_unstable)
+			invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+		memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+	}
 	memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
 
 	if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) ||
 	    inode->i_uid != fattr->uid ||
 	    inode->i_gid != fattr->gid)
-		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS;
+		invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 
 	inode->i_mode = fattr->mode;
 	inode->i_nlink = fattr->nlink;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 0b01b96337f8..140bdf489f71 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -189,6 +189,7 @@ struct nfs_inode {
 #define NFS_INO_INVALID_DATA	0x0010		/* cached data is invalid */
 #define NFS_INO_INVALID_ATIME	0x0020		/* cached atime is invalid */
 #define NFS_INO_INVALID_ACCESS	0x0040		/* cached access cred invalid */
+#define NFS_INO_INVALID_ACL	0x0080		/* cached acls are invalid */
 
 static inline struct nfs_inode *NFS_I(struct inode *inode)
 {
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 5b45bafd9db5..cf38db59f347 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -714,6 +714,7 @@ struct nfs_rpc_ops {
 	int	(*file_open)   (struct inode *, struct file *);
 	int	(*file_release) (struct inode *, struct file *);
 	int	(*lock)(struct file *, int, struct file_lock *);
+	void	(*clear_acl_cache)(struct inode *);
 };
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 029d105e66e5a90850d5a09dad76815d0bcfcaa3 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 22 Jun 2005 17:16:22 +0000
Subject: [PATCH] NFSv4: Client-side xdr for reading NFSv4 acls

 Client-side support for NFSv4 acls: xdr encoding and decoding routines for
 reading acls

 Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c        | 100 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/nfs4.h    |   1 +
 include/linux/nfs_xdr.h |   7 ++++
 3 files changed, 108 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 8204926bb467..6f1c003ee33a 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -365,6 +365,13 @@ static int nfs_stat_to_errno(int);
 				encode_delegreturn_maxsz)
 #define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \
 				decode_delegreturn_maxsz)
+#define NFS4_enc_getacl_sz	(compound_encode_hdr_maxsz + \
+				encode_putfh_maxsz + \
+				encode_getattr_maxsz)
+#define NFS4_dec_getacl_sz	(compound_decode_hdr_maxsz + \
+				decode_putfh_maxsz + \
+				op_decode_hdr_maxsz + \
+				nfs4_fattr_bitmap_maxsz + 1)
 
 static struct {
 	unsigned int	mode;
@@ -1631,6 +1638,34 @@ out:
         return status;
 }
 
+/*
+ * Encode a GETACL request
+ */
+static int
+nfs4_xdr_enc_getacl(struct rpc_rqst *req, uint32_t *p,
+		struct nfs_getaclargs *args)
+{
+	struct xdr_stream xdr;
+	struct rpc_auth *auth = req->rq_task->tk_auth;
+	struct compound_hdr hdr = {
+		.nops   = 2,
+	};
+	int replen, status;
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, &hdr);
+	status = encode_putfh(&xdr, args->fh);
+	if (status)
+		goto out;
+	status = encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0);
+	/* set up reply buffer: */
+	replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2;
+	xdr_inline_pages(&req->rq_rcv_buf, replen,
+		args->acl_pages, args->acl_pgbase, args->acl_len);
+out:
+	return status;
+}
+
 /*
  * Encode a WRITE request
  */
@@ -3125,6 +3160,47 @@ static int decode_renew(struct xdr_stream *xdr)
 	return decode_op_hdr(xdr, OP_RENEW);
 }
 
+static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
+		size_t *acl_len)
+{
+	uint32_t *savep;
+	uint32_t attrlen,
+		 bitmap[2] = {0};
+	struct kvec *iov = req->rq_rcv_buf.head;
+	int status;
+
+	*acl_len = 0;
+	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+		goto out;
+	if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+		goto out;
+	if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+		goto out;
+
+	if (unlikely(bitmap[0] & (FATTR4_WORD0_ACL - 1U)))
+		return -EIO;
+	if (likely(bitmap[0] & FATTR4_WORD0_ACL)) {
+		int hdrlen, recvd;
+
+		/* We ignore &savep and don't do consistency checks on
+		 * the attr length.  Let userspace figure it out.... */
+		hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base;
+		recvd = req->rq_rcv_buf.len - hdrlen;
+		if (attrlen > recvd) {
+			printk(KERN_WARNING "NFS: server cheating in getattr"
+					" acl reply: attrlen %u > recvd %u\n",
+					attrlen, recvd);
+			return -EINVAL;
+		}
+		if (attrlen <= *acl_len)
+			xdr_read_pages(xdr, attrlen);
+		*acl_len = attrlen;
+	}
+
+out:
+	return status;
+}
+
 static int
 decode_savefh(struct xdr_stream *xdr)
 {
@@ -3417,6 +3493,29 @@ out:
 }
 
 
+/*
+ * Decode GETACL response
+ */
+static int
+nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, uint32_t *p, size_t *acl_len)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_putfh(&xdr);
+	if (status)
+		goto out;
+	status = decode_getacl(&xdr, rqstp, acl_len);
+
+out:
+	return status;
+}
+
 /*
  * Decode CLOSE response
  */
@@ -4017,6 +4116,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
   PROC(READDIR,		enc_readdir,	dec_readdir),
   PROC(SERVER_CAPS,	enc_server_caps, dec_server_caps),
   PROC(DELEGRETURN,	enc_delegreturn, dec_delegreturn),
+  PROC(GETACL,		enc_getacl,	dec_getacl),
 };
 
 struct rpc_version		nfs_version4 = {
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 5ca8a8d8ccdf..6ee7e2585af5 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -382,6 +382,7 @@ enum {
 	NFSPROC4_CLNT_READDIR,
 	NFSPROC4_CLNT_SERVER_CAPS,
 	NFSPROC4_CLNT_DELEGRETURN,
+	NFSPROC4_CLNT_GETACL,
 };
 
 #endif
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index cf38db59f347..9f5e1d407c7b 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -326,6 +326,13 @@ struct nfs_setattrargs {
 	const u32 *			bitmask;
 };
 
+struct nfs_getaclargs {
+	struct nfs_fh *			fh;
+	size_t				acl_len;
+	unsigned int			acl_pgbase;
+	struct page **			acl_pages;
+};
+
 struct nfs_setattrres {
 	struct nfs_fattr *              fattr;
 	const struct nfs_server *	server;
-- 
cgit v1.2.3-59-g8ed1b


From 23ec6965c20db96bc8ea7af0ec178f074dd31c40 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 22 Jun 2005 17:16:22 +0000
Subject: [PATCH] NFSv4: Client-side xdr for writing NFSv4 acls

 Client-side support for NFSv4 acls: xdr encoding and decoding routines for
 writing acls

 Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c        | 71 ++++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/nfs4.h    |  1 +
 include/linux/nfs_xdr.h |  7 +++++
 3 files changed, 78 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 6f1c003ee33a..325cd6d4f23a 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -372,6 +372,13 @@ static int nfs_stat_to_errno(int);
 				decode_putfh_maxsz + \
 				op_decode_hdr_maxsz + \
 				nfs4_fattr_bitmap_maxsz + 1)
+#define NFS4_enc_setacl_sz	(compound_encode_hdr_maxsz + \
+				encode_putfh_maxsz + \
+				op_encode_hdr_maxsz + 4 + \
+				nfs4_fattr_bitmap_maxsz + 1)
+#define NFS4_dec_setacl_sz	(compound_decode_hdr_maxsz + \
+				decode_putfh_maxsz + \
+				op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz)
 
 static struct {
 	unsigned int	mode;
@@ -471,7 +478,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
 	 * In the worst-case, this would be
 	 *   12(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime)
 	 *          = 36 bytes, plus any contribution from variable-length fields
-	 *            such as owner/group/acl's.
+	 *            such as owner/group.
 	 */
 	len = 16;
 
@@ -1095,6 +1102,25 @@ static int encode_renew(struct xdr_stream *xdr, const struct nfs4_client *client
 	return 0;
 }
 
+static int
+encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
+{
+	uint32_t *p;
+
+	RESERVE_SPACE(4+sizeof(zero_stateid.data));
+	WRITE32(OP_SETATTR);
+	WRITEMEM(zero_stateid.data, sizeof(zero_stateid.data));
+	RESERVE_SPACE(2*4);
+	WRITE32(1);
+	WRITE32(FATTR4_WORD0_ACL);
+	if (arg->acl_len % 4)
+		return -EINVAL;
+	RESERVE_SPACE(4);
+	WRITE32(arg->acl_len);
+	xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
+	return 0;
+}
+
 static int
 encode_savefh(struct xdr_stream *xdr)
 {
@@ -3492,6 +3518,48 @@ out:
 
 }
 
+/*
+ * Encode a SETACL request
+ */
+static int
+nfs4_xdr_enc_setacl(struct rpc_rqst *req, uint32_t *p, struct nfs_setaclargs *args)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr = {
+		.nops   = 2,	/* PUTFH + SETATTR */
+	};
+	int status;
+
+	xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+	encode_compound_hdr(&xdr, &hdr);
+	status = encode_putfh(&xdr, args->fh);
+	if (status)
+		goto out;
+	status = encode_setacl(&xdr, args);
+out:
+	return status;
+}
+/*
+ * Decode SETACL response
+ */
+static int
+nfs4_xdr_dec_setacl(struct rpc_rqst *rqstp, uint32_t *p, void *res)
+{
+	struct xdr_stream xdr;
+	struct compound_hdr hdr;
+	int status;
+
+	xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+	status = decode_compound_hdr(&xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_putfh(&xdr);
+	if (status)
+		goto out;
+	status = decode_setattr(&xdr, res);
+out:
+	return status;
+}
 
 /*
  * Decode GETACL response
@@ -4117,6 +4185,7 @@ struct rpc_procinfo	nfs4_procedures[] = {
   PROC(SERVER_CAPS,	enc_server_caps, dec_server_caps),
   PROC(DELEGRETURN,	enc_delegreturn, dec_delegreturn),
   PROC(GETACL,		enc_getacl,	dec_getacl),
+  PROC(SETACL,		enc_setacl,	dec_setacl),
 };
 
 struct rpc_version		nfs_version4 = {
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 6ee7e2585af5..5bb5b2fd7ba2 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -383,6 +383,7 @@ enum {
 	NFSPROC4_CLNT_SERVER_CAPS,
 	NFSPROC4_CLNT_DELEGRETURN,
 	NFSPROC4_CLNT_GETACL,
+	NFSPROC4_CLNT_SETACL,
 };
 
 #endif
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 9f5e1d407c7b..46b206b460c0 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -326,6 +326,13 @@ struct nfs_setattrargs {
 	const u32 *			bitmask;
 };
 
+struct nfs_setaclargs {
+	struct nfs_fh *			fh;
+	size_t				acl_len;
+	unsigned int			acl_pgbase;
+	struct page **			acl_pages;
+};
+
 struct nfs_getaclargs {
 	struct nfs_fh *			fh;
 	size_t				acl_len;
-- 
cgit v1.2.3-59-g8ed1b


From e50a1c2e1f816c81eed6a589019052cb44189267 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 22 Jun 2005 17:16:23 +0000
Subject: [PATCH] NFSv4: client-side caching NFSv4 ACLs

 Add nfs4_acl field to the nfs_inode, and use it to cache acls.  Only cache
 acls of size up to a page.  Also prepare for up to a page of acl data even
 when the user doesn't pass in a buffer, as when they want to get the acl
 length to decide what size buffer to allocate.

 Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         |   7 ++-
 fs/nfs/nfs4proc.c      | 129 +++++++++++++++++++++++++++++++++++++++++++++----
 include/linux/nfs_fs.h |   2 +-
 3 files changed, 124 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index c45bd52cc1d7..350c48c12639 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -142,10 +142,6 @@ nfs_delete_inode(struct inode * inode)
 	clear_inode(inode);
 }
 
-/*
- * For the moment, the only task for the NFS clear_inode method is to
- * release the mmap credential
- */
 static void
 nfs_clear_inode(struct inode *inode)
 {
@@ -1923,6 +1919,9 @@ static struct inode *nfs_alloc_inode(struct super_block *sb)
 	if (!nfsi)
 		return NULL;
 	nfsi->flags = 0;
+#ifdef CONFIG_NFS_V4
+	nfsi->nfs4_acl = NULL;
+#endif /* CONFIG_NFS_V4 */
 	return &nfsi->vfs_inode;
 }
 
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d969dd13e7db..128d01cfea19 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2188,9 +2188,75 @@ static void buf_to_pages(const void *buf, size_t buflen,
 	}
 }
 
-static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
+struct nfs4_cached_acl {
+	int cached;
+	size_t len;
+	char data[];
+};
+
+static void nfs4_set_cached_acl(struct inode *inode, struct nfs4_cached_acl *acl)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	spin_lock(&inode->i_lock);
+	kfree(nfsi->nfs4_acl);
+	nfsi->nfs4_acl = acl;
+	spin_unlock(&inode->i_lock);
+}
+
+static void nfs4_zap_acl_attr(struct inode *inode)
+{
+	nfs4_set_cached_acl(inode, NULL);
+}
+
+static inline ssize_t nfs4_read_cached_acl(struct inode *inode, char *buf, size_t buflen)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs4_cached_acl *acl;
+	ssize_t ret = -ENOENT;	/* nothing usable cached: caller must ask the server */
+
+	spin_lock(&inode->i_lock);
+	acl = nfsi->nfs4_acl;
+	if (acl == NULL)
+		goto out;
+	if (buf == NULL) /* user is just asking for length */
+		goto out_len;
+	if (acl->cached == 0) /* only the length was cached, not the data */
+		goto out;
+	ret = -ERANGE; /* see getxattr(2) man page */
+	if (acl->len > buflen)
+		goto out;
+	memcpy(buf, acl->data, acl->len);
+out_len:
+	ret = acl->len;
+out:
+	spin_unlock(&inode->i_lock);
+	return ret;
+}
+
+static void nfs4_write_cached_acl(struct inode *inode, const char *buf, size_t acl_len)
+{
+	struct nfs4_cached_acl *acl;
+
+	if (buf && acl_len <= PAGE_SIZE) {
+		acl = kmalloc(sizeof(*acl) + acl_len, GFP_KERNEL);
+		if (acl == NULL)
+			goto out;
+		acl->cached = 1;
+		memcpy(acl->data, buf, acl_len);
+	} else {
+		acl = kmalloc(sizeof(*acl), GFP_KERNEL);
+		if (acl == NULL)
+			goto out;
+		acl->cached = 0;
+	}
+	acl->len = acl_len;
+out:
+	nfs4_set_cached_acl(inode, acl);
+}
+
+static inline ssize_t nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
 {
-	struct nfs_server *server = NFS_SERVER(inode);
 	struct page *pages[NFS4ACL_MAXPAGES];
 	struct nfs_getaclargs args = {
 		.fh = NFS_FH(inode),
@@ -2198,24 +2264,66 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
 		.acl_len = buflen,
 	};
 	size_t resp_len = buflen;
+	void *resp_buf;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETACL],
 		.rpc_argp = &args,
 		.rpc_resp = &resp_len,
 	};
+	struct page *localpage = NULL;
 	int ret;
 
-	if (!nfs4_server_supports_acls(server))
-		return -EOPNOTSUPP;
-	buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
+	if (buflen < PAGE_SIZE) {
+		/* As long as we're doing a round trip to the server anyway,
+		 * let's be prepared for a page of acl data. */
+		localpage = alloc_page(GFP_KERNEL);
+		if (localpage == NULL)	/* check before touching the page */
+			return -ENOMEM;
+		resp_buf = page_address(localpage);
+		args.acl_pages[0] = localpage;
+		args.acl_pgbase = 0;
+		args.acl_len = PAGE_SIZE;
+	} else {
+		resp_buf = buf;
+		buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
+	}
 	ret = rpc_call_sync(NFS_CLIENT(inode), &msg, 0);
-	if (buflen && resp_len > buflen)
-		return -ERANGE;
-	if (ret == 0)
-		ret = resp_len;
+	if (ret)
+		goto out_free;
+	if (resp_len > args.acl_len)
+		nfs4_write_cached_acl(inode, NULL, resp_len);
+	else
+		nfs4_write_cached_acl(inode, resp_buf, resp_len);
+	if (buf) {
+		ret = -ERANGE;
+		if (resp_len > buflen)
+			goto out_free;
+		if (localpage)
+			memcpy(buf, resp_buf, resp_len);
+	}
+	ret = resp_len;
+out_free:
+	if (localpage)
+		__free_page(localpage);
 	return ret;
 }
 
+static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	int ret;
+
+	if (!nfs4_server_supports_acls(server))
+		return -EOPNOTSUPP;
+	ret = nfs_revalidate_inode(server, inode);
+	if (ret < 0)
+		return ret;
+	ret = nfs4_read_cached_acl(inode, buf, buflen);
+	if (ret != -ENOENT)
+		return ret;
+	return nfs4_get_acl_uncached(inode, buf, buflen);
+}
+
 static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
@@ -2236,6 +2344,8 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
 		return -EOPNOTSUPP;
 	buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
 	ret = rpc_call_sync(NFS_SERVER(inode)->client, &msg, 0);
+	if (ret == 0)
+		nfs4_write_cached_acl(inode, buf, buflen);
 	return ret;
 }
 
@@ -2907,6 +3017,7 @@ struct nfs_rpc_ops	nfs_v4_clientops = {
 	.file_open      = nfs4_proc_file_open,
 	.file_release   = nfs4_proc_file_release,
 	.lock		= nfs4_proc_lock,
+	.clear_acl_cache = nfs4_zap_acl_attr,
 };
 
 /*
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 140bdf489f71..d2b5d7e0e85a 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -169,13 +169,13 @@ struct nfs_inode {
 	wait_queue_head_t	nfs_i_wait;
 
 #ifdef CONFIG_NFS_V4
+	struct nfs4_cached_acl	*nfs4_acl;
         /* NFSv4 state */
 	struct list_head	open_states;
 	struct nfs_delegation	*delegation;
 	int			 delegation_state;
 	struct rw_semaphore	rwsem;
 #endif /* CONFIG_NFS_V4*/
-
 	struct inode		vfs_inode;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 007e251f2b2760f738c92adc8c80cbae0bed3ce5 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Wed, 22 Jun 2005 17:16:23 +0000
Subject: [PATCH] RPC: Allow multiple RPC client programs to share the same
 transport

 Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
 Acked-by: Olaf Kirch <okir@suse.de>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/clnt.h |  2 ++
 net/sunrpc/clnt.c           | 40 ++++++++++++++++++++++++++++++++++++++++
 net/sunrpc/pmap_clnt.c      |  3 +++
 net/sunrpc/sunrpc_syms.c    |  1 +
 4 files changed, 46 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index d25e80f77ff5..ab151bbb66df 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -114,6 +114,8 @@ struct rpc_clnt *rpc_create_client(struct rpc_xprt *xprt, char *servname,
 struct rpc_clnt *rpc_new_client(struct rpc_xprt *xprt, char *servname,
 				struct rpc_program *info,
 				u32 version, rpc_authflavor_t authflavor);
+struct rpc_clnt	*rpc_bind_new_program(struct rpc_clnt *,
+				struct rpc_program *, int);
 struct rpc_clnt *rpc_clone_client(struct rpc_clnt *);
 int		rpc_shutdown_client(struct rpc_clnt *);
 int		rpc_destroy_client(struct rpc_clnt *);
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 33f12b84e265..c979fcf88798 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -241,6 +241,8 @@ rpc_clone_client(struct rpc_clnt *clnt)
 	rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval);
 	if (new->cl_auth)
 		atomic_inc(&new->cl_auth->au_count);
+	new->cl_pmap		= &new->cl_pmap_default;
+	rpc_init_wait_queue(&new->cl_pmap_default.pm_bindwait, "bindwait");
 	return new;
 out_no_clnt:
 	printk(KERN_INFO "RPC: out of memory in %s\n", __FUNCTION__);
@@ -329,6 +331,44 @@ rpc_release_client(struct rpc_clnt *clnt)
 		rpc_destroy_client(clnt);
 }
 
+/**
+ * rpc_bind_new_program - bind a new RPC program to an existing client
+ * @old: rpc client to clone
+ * @program: rpc program to set
+ * @vers: index into @program->version[]; BUG()s if out of range or unset
+ *
+ * Clones @old, switches the clone to @program/@vers and rpc_ping()s with it,
+ * so that different RPC programs can share the same transport (the Sun
+ * NFSv2/v3 ACL protocol can do this). Returns the clone or an ERR_PTR().
+ */
+struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old,
+				      struct rpc_program *program,
+				      int vers)
+{
+	struct rpc_clnt *clnt;
+	struct rpc_version *version;
+	int err;
+
+	BUG_ON(vers >= program->nrvers || !program->version[vers]);
+	version = program->version[vers];
+	clnt = rpc_clone_client(old);
+	if (IS_ERR(clnt))
+		goto out;
+	clnt->cl_procinfo = version->procs;
+	clnt->cl_maxproc  = version->nrprocs;
+	clnt->cl_protname = program->name;
+	clnt->cl_prog     = program->number;
+	clnt->cl_vers     = version->number;
+	clnt->cl_stats    = program->stats;
+	err = rpc_ping(clnt, RPC_TASK_SOFT|RPC_TASK_NOINTR);
+	if (err != 0) {
+		rpc_shutdown_client(clnt);
+		clnt = ERR_PTR(err);
+	}
+out:	
+	return clnt;
+}
+
 /*
  * Default callback for async RPC calls
  */
diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c
index df4d84c9020d..4e81f2766923 100644
--- a/net/sunrpc/pmap_clnt.c
+++ b/net/sunrpc/pmap_clnt.c
@@ -53,6 +53,9 @@ rpc_getport(struct rpc_task *task, struct rpc_clnt *clnt)
 			task->tk_pid, clnt->cl_server,
 			map->pm_prog, map->pm_vers, map->pm_prot);
 
+	/* Autobind on cloned rpc clients is discouraged */
+	BUG_ON(clnt->cl_parent != clnt);
+
 	spin_lock(&pmap_lock);
 	if (map->pm_binding) {
 		rpc_sleep_on(&map->pm_bindwait, task, NULL, NULL);
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 1b0ff7e0e869..d8673f66acc3 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -42,6 +42,7 @@ EXPORT_SYMBOL(rpc_release_task);
 /* RPC client functions */
 EXPORT_SYMBOL(rpc_create_client);
 EXPORT_SYMBOL(rpc_clone_client);
+EXPORT_SYMBOL(rpc_bind_new_program);
 EXPORT_SYMBOL(rpc_destroy_client);
 EXPORT_SYMBOL(rpc_shutdown_client);
 EXPORT_SYMBOL(rpc_release_client);
-- 
cgit v1.2.3-59-g8ed1b


From e053d1ab62c8ef0eff3dd4c95448cad3c6d2fbf4 Mon Sep 17 00:00:00 2001
From: Olaf Kirch <okir@suse.de>
Date: Wed, 22 Jun 2005 17:16:24 +0000
Subject: [PATCH] RPC: Lazy RPC receive buffer allocation

 Signed-off-by: Olaf Kirch <okir@suse.de>
 Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xdr.h |  2 +-
 net/sunrpc/xdr.c           | 16 +++++++++++++---
 net/sunrpc/xprt.c          | 26 ++++++++++++++++++++++----
 3 files changed, 36 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 541dcf838abf..0f5b7a5a7432 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -160,7 +160,7 @@ typedef struct {
 
 typedef size_t (*skb_read_actor_t)(skb_reader_t *desc, void *to, size_t len);
 
-extern void xdr_partial_copy_from_skb(struct xdr_buf *, unsigned int,
+extern int xdr_partial_copy_from_skb(struct xdr_buf *, unsigned int,
 		skb_reader_t *, skb_read_actor_t);
 
 struct socket;
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index f86d1baa6302..65b268d39782 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -176,7 +176,7 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
 	xdr->buflen += len;
 }
 
-void
+int
 xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 			  skb_reader_t *desc,
 			  skb_read_actor_t copy_actor)
@@ -190,7 +190,7 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 		len -= base;
 		ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len);
 		if (ret != len || !desc->count)
-			return;
+			return 0;
 		base = 0;
 	} else
 		base -= len;
@@ -210,6 +210,14 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 	do {
 		char *kaddr;
 
+		/* ACL likes to be lazy in allocating pages - ACLs
+		 * are small by default but can get huge. */
+		if (unlikely(*ppage == NULL)) {
+			*ppage = alloc_page(GFP_ATOMIC);
+			if (unlikely(*ppage == NULL))
+				return -ENOMEM;
+		}
+
 		len = PAGE_CACHE_SIZE;
 		kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA);
 		if (base) {
@@ -226,13 +234,15 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 		flush_dcache_page(*ppage);
 		kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA);
 		if (ret != len || !desc->count)
-			return;
+			return 0;
 		ppage++;
 	} while ((pglen -= len) != 0);
 copy_tail:
 	len = xdr->tail[0].iov_len;
 	if (base < len)
 		copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
+
+	return 0;
 }
 
 
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index c74a6bb94074..a180ed4952d6 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -725,7 +725,8 @@ csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
 		goto no_checksum;
 
 	desc.csum = csum_partial(skb->data, desc.offset, skb->csum);
-	xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits);
+	if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits) < 0)
+		return -1;
 	if (desc.offset != skb->len) {
 		unsigned int csum2;
 		csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0);
@@ -737,7 +738,8 @@ csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
 		return -1;
 	return 0;
 no_checksum:
-	xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits);
+	if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0)
+		return -1;
 	if (desc.count)
 		return -1;
 	return 0;
@@ -907,6 +909,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 	struct rpc_rqst *req;
 	struct xdr_buf *rcvbuf;
 	size_t len;
+	int r;
 
 	/* Find and lock the request corresponding to this xid */
 	spin_lock(&xprt->sock_lock);
@@ -927,16 +930,30 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 		len = xprt->tcp_reclen - xprt->tcp_offset;
 		memcpy(&my_desc, desc, sizeof(my_desc));
 		my_desc.count = len;
-		xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
+		r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
 					  &my_desc, tcp_copy_data);
 		desc->count -= len;
 		desc->offset += len;
 	} else
-		xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
+		r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
 					  desc, tcp_copy_data);
 	xprt->tcp_copied += len;
 	xprt->tcp_offset += len;
 
+	if (r < 0) {
+		/* Error when copying to the receive buffer,
+		 * usually because we weren't able to allocate
+		 * additional buffer pages. All we can do now
+		 * is turn off XPRT_COPY_DATA, so the request
+		 * will not receive any additional updates,
+		 * and time out.
+		 * Any remaining data from this record will
+		 * be discarded.
+		 */
+		xprt->tcp_flags &= ~XPRT_COPY_DATA;
+		goto out;
+	}
+
 	if (xprt->tcp_copied == req->rq_private_buf.buflen)
 		xprt->tcp_flags &= ~XPRT_COPY_DATA;
 	else if (xprt->tcp_offset == xprt->tcp_reclen) {
@@ -949,6 +966,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 				req->rq_task->tk_pid);
 		xprt_complete_rqst(xprt, req, xprt->tcp_copied);
 	}
+out:
 	spin_unlock(&xprt->sock_lock);
 	tcp_check_recm(xprt);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 7e06b53d796a3740307b54aa2799077f8a0c84e7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:24 +0000
Subject: [PATCH] RPC: fix accounting bug in the case of a truncated RPC
 message

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xdr.h |  2 +-
 net/sunrpc/xdr.c           | 22 ++++++++++++++--------
 net/sunrpc/xprt.c          | 35 +++++++++++++++++++++++++++--------
 3 files changed, 42 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 0f5b7a5a7432..5d1eed2b58a1 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -160,7 +160,7 @@ typedef struct {
 
 typedef size_t (*skb_read_actor_t)(skb_reader_t *desc, void *to, size_t len);
 
-extern int xdr_partial_copy_from_skb(struct xdr_buf *, unsigned int,
+extern ssize_t xdr_partial_copy_from_skb(struct xdr_buf *, unsigned int,
 		skb_reader_t *, skb_read_actor_t);
 
 struct socket;
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 65b268d39782..b3ac3f72bf9c 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -176,21 +176,23 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
 	xdr->buflen += len;
 }
 
-int
+ssize_t
 xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 			  skb_reader_t *desc,
 			  skb_read_actor_t copy_actor)
 {
 	struct page	**ppage = xdr->pages;
 	unsigned int	len, pglen = xdr->page_len;
+	ssize_t		copied = 0;
 	int		ret;
 
 	len = xdr->head[0].iov_len;
 	if (base < len) {
 		len -= base;
 		ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len);
+		copied += ret;
 		if (ret != len || !desc->count)
-			return 0;
+			goto out;
 		base = 0;
 	} else
 		base -= len;
@@ -214,8 +216,11 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 		 * are small by default but can get huge. */
 		if (unlikely(*ppage == NULL)) {
 			*ppage = alloc_page(GFP_ATOMIC);
-			if (unlikely(*ppage == NULL))
-				return -ENOMEM;
+			if (unlikely(*ppage == NULL)) {
+				if (copied == 0)
+					copied = -ENOMEM;
+				goto out;
+			}
 		}
 
 		len = PAGE_CACHE_SIZE;
@@ -233,16 +238,17 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base,
 		}
 		flush_dcache_page(*ppage);
 		kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA);
+		copied += ret;
 		if (ret != len || !desc->count)
-			return 0;
+			goto out;
 		ppage++;
 	} while ((pglen -= len) != 0);
 copy_tail:
 	len = xdr->tail[0].iov_len;
 	if (base < len)
-		copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
-
-	return 0;
+		copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base);
+out:
+	return copied;
 }
 
 
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index a180ed4952d6..ef941e7de8bf 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -823,10 +823,15 @@ tcp_copy_data(skb_reader_t *desc, void *p, size_t len)
 {
 	if (len > desc->count)
 		len = desc->count;
-	if (skb_copy_bits(desc->skb, desc->offset, p, len))
+	if (skb_copy_bits(desc->skb, desc->offset, p, len)) {
+		dprintk("RPC:      failed to copy %zu bytes from skb. %zu bytes remain\n",
+				len, desc->count);
 		return 0;
+	}
 	desc->offset += len;
 	desc->count -= len;
+	dprintk("RPC:      copied %zu bytes from skb. %zu bytes remain\n",
+			len, desc->count);
 	return len;
 }
 
@@ -865,6 +870,8 @@ tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc)
 static void
 tcp_check_recm(struct rpc_xprt *xprt)
 {
+	dprintk("RPC:      xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n",
+			xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags);
 	if (xprt->tcp_offset == xprt->tcp_reclen) {
 		xprt->tcp_flags |= XPRT_COPY_RECM;
 		xprt->tcp_offset = 0;
@@ -909,7 +916,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 	struct rpc_rqst *req;
 	struct xdr_buf *rcvbuf;
 	size_t len;
-	int r;
+	ssize_t r;
 
 	/* Find and lock the request corresponding to this xid */
 	spin_lock(&xprt->sock_lock);
@@ -932,15 +939,17 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 		my_desc.count = len;
 		r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
 					  &my_desc, tcp_copy_data);
-		desc->count -= len;
-		desc->offset += len;
+		desc->count -= r;
+		desc->offset += r;
 	} else
 		r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied,
 					  desc, tcp_copy_data);
-	xprt->tcp_copied += len;
-	xprt->tcp_offset += len;
 
-	if (r < 0) {
+	if (r > 0) {
+		xprt->tcp_copied += r;
+		xprt->tcp_offset += r;
+	}
+	if (r != len) {
 		/* Error when copying to the receive buffer,
 		 * usually because we weren't able to allocate
 		 * additional buffer pages. All we can do now
@@ -951,9 +960,18 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 		 * be discarded.
 		 */
 		xprt->tcp_flags &= ~XPRT_COPY_DATA;
+		dprintk("RPC:      XID %08x truncated request\n",
+				ntohl(xprt->tcp_xid));
+		dprintk("RPC:      xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
+				xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
 		goto out;
 	}
 
+	dprintk("RPC:      XID %08x read %zd bytes\n",
+			ntohl(xprt->tcp_xid), r);	/* r is ssize_t, hence %zd */
+	dprintk("RPC:      xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n",
+			xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen);
+
 	if (xprt->tcp_copied == req->rq_private_buf.buflen)
 		xprt->tcp_flags &= ~XPRT_COPY_DATA;
 	else if (xprt->tcp_offset == xprt->tcp_reclen) {
@@ -961,12 +979,12 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc)
 			xprt->tcp_flags &= ~XPRT_COPY_DATA;
 	}
 
+out:
 	if (!(xprt->tcp_flags & XPRT_COPY_DATA)) {
 		dprintk("RPC: %4d received reply complete\n",
 				req->rq_task->tk_pid);
 		xprt_complete_rqst(xprt, req, xprt->tcp_copied);
 	}
-out:
 	spin_unlock(&xprt->sock_lock);
 	tcp_check_recm(xprt);
 }
@@ -985,6 +1003,7 @@ tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc)
 	desc->count -= len;
 	desc->offset += len;
 	xprt->tcp_offset += len;
+	dprintk("RPC:      discarded %u bytes\n", len);
 	tcp_check_recm(xprt);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From bd8100e7eda87507649c6ba4cb32173b34e49986 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Wed, 22 Jun 2005 17:16:24 +0000
Subject: [PATCH] RPC: Encode and decode arbitrary XDR arrays

 Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
 Acked-by: Olaf Kirch <okir@suse.de>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/xdr.h |  19 +++-
 net/sunrpc/sunrpc_syms.c   |   4 +
 net/sunrpc/xdr.c           | 256 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 275 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 5d1eed2b58a1..34ec3e8d99b3 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -146,7 +146,8 @@ extern void xdr_shift_buf(struct xdr_buf *, size_t);
 extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *);
 extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, int, int);
 extern int xdr_buf_read_netobj(struct xdr_buf *, struct xdr_netobj *, int);
-extern int read_bytes_from_xdr_buf(struct xdr_buf *buf, int base, void *obj, int len);
+extern int read_bytes_from_xdr_buf(struct xdr_buf *, int, void *, int);
+extern int write_bytes_to_xdr_buf(struct xdr_buf *, int, void *, int);
 
 /*
  * Helper structure for copying from an sk_buff.
@@ -168,6 +169,22 @@ struct sockaddr;
 extern int xdr_sendpages(struct socket *, struct sockaddr *, int,
 		struct xdr_buf *, unsigned int, int);
 
+extern int xdr_encode_word(struct xdr_buf *, int, u32);
+extern int xdr_decode_word(struct xdr_buf *, int, u32 *);
+
+struct xdr_array2_desc;
+typedef int (*xdr_xcode_elem_t)(struct xdr_array2_desc *desc, void *elem);
+struct xdr_array2_desc {
+	unsigned int elem_size;
+	unsigned int array_len;
+	xdr_xcode_elem_t xcode;
+};
+
+extern int xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
+                             struct xdr_array2_desc *desc);
+extern int xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
+			     struct xdr_array2_desc *desc);
+
 /*
  * Provide some simple tools for XDR buffer overflow-checking etc.
  */
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index d8673f66acc3..32e8acbc60fe 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -129,6 +129,10 @@ EXPORT_SYMBOL(xdr_encode_netobj);
 EXPORT_SYMBOL(xdr_encode_pages);
 EXPORT_SYMBOL(xdr_inline_pages);
 EXPORT_SYMBOL(xdr_shift_buf);
+EXPORT_SYMBOL(xdr_encode_word);
+EXPORT_SYMBOL(xdr_decode_word);
+EXPORT_SYMBOL(xdr_encode_array2);
+EXPORT_SYMBOL(xdr_decode_array2);
 EXPORT_SYMBOL(xdr_buf_from_iov);
 EXPORT_SYMBOL(xdr_buf_subsegment);
 EXPORT_SYMBOL(xdr_buf_read_netobj);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index b3ac3f72bf9c..8a4d9c106af1 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -887,8 +887,34 @@ out:
 	return status;
 }
 
-static int
-read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj)
+/* obj is assumed to point to allocated memory of size at least len: */
+int
+write_bytes_to_xdr_buf(struct xdr_buf *buf, int base, void *obj, int len)
+{
+	struct xdr_buf subbuf;
+	int this_len;
+	int status;
+
+	status = xdr_buf_subsegment(buf, &subbuf, base, len);
+	if (status)
+		goto out;
+	this_len = min(len, (int)subbuf.head[0].iov_len);
+	memcpy(subbuf.head[0].iov_base, obj, this_len);
+	len -= this_len;
+	obj += this_len;
+	this_len = min(len, (int)subbuf.page_len);
+	if (this_len)
+		_copy_to_pages(subbuf.pages, subbuf.page_base, obj, this_len);
+	len -= this_len;
+	obj += this_len;
+	this_len = min(len, (int)subbuf.tail[0].iov_len);
+	memcpy(subbuf.tail[0].iov_base, obj, this_len);
+out:
+	return status;
+}
+
+int
+xdr_decode_word(struct xdr_buf *buf, int base, u32 *obj)
 {
 	u32	raw;
 	int	status;
@@ -900,6 +926,14 @@ read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj)
 	return 0;
 }
 
+int
+xdr_encode_word(struct xdr_buf *buf, int base, u32 obj)
+{
+	u32	raw = htonl(obj);
+
+	return write_bytes_to_xdr_buf(buf, base, &raw, sizeof(obj));
+}
+
 /* If the netobj starting offset bytes from the start of xdr_buf is contained
  * entirely in the head or the tail, set object to point to it; otherwise
  * try to find space for it at the end of the tail, copy it there, and
@@ -910,7 +944,7 @@ xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, int offset)
 	u32	tail_offset = buf->head[0].iov_len + buf->page_len;
 	u32	obj_end_offset;
 
-	if (read_u32_from_xdr_buf(buf, offset, &obj->len))
+	if (xdr_decode_word(buf, offset, &obj->len))
 		goto out;
 	obj_end_offset = offset + 4 + obj->len;
 
@@ -943,3 +977,219 @@ xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, int offset)
 out:
 	return -1;
 }
+
+/* Returns 0 on success, or else a negative error code. */
+static int
+xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
+		 struct xdr_array2_desc *desc, int encode)
+{
+	char *elem = NULL, *c;
+	unsigned int copied = 0, todo, avail_here;
+	struct page **ppages = NULL;
+	int err;
+
+	if (encode) {
+		if (xdr_encode_word(buf, base, desc->array_len) != 0)
+			return -EINVAL;
+	} else {
+		if (xdr_decode_word(buf, base, &desc->array_len) != 0 ||
+		    (unsigned long) base + 4 + desc->array_len *
+				    desc->elem_size > buf->len)
+			return -EINVAL;
+	}
+	base += 4;
+
+	if (!desc->xcode)
+		return 0;
+
+	todo = desc->array_len * desc->elem_size;
+
+	/* process head */
+	if (todo && base < buf->head->iov_len) {
+		c = buf->head->iov_base + base;
+		avail_here = min_t(unsigned int, todo,
+				   buf->head->iov_len - base);
+		todo -= avail_here;
+
+		while (avail_here >= desc->elem_size) {
+			err = desc->xcode(desc, c);
+			if (err)
+				goto out;
+			c += desc->elem_size;
+			avail_here -= desc->elem_size;
+		}
+		if (avail_here) {
+			if (!elem) {
+				elem = kmalloc(desc->elem_size, GFP_KERNEL);
+				err = -ENOMEM;
+				if (!elem)
+					goto out;
+			}
+			if (encode) {
+				err = desc->xcode(desc, elem);
+				if (err)
+					goto out;
+				memcpy(c, elem, avail_here);
+			} else
+				memcpy(elem, c, avail_here);
+			copied = avail_here;
+		}
+		base = buf->head->iov_len;  /* align to start of pages */
+	}
+
+	/* process pages array */
+	base -= buf->head->iov_len;
+	if (todo && base < buf->page_len) {
+		unsigned int avail_page;
+
+		avail_here = min(todo, buf->page_len - base);
+		todo -= avail_here;
+
+		base += buf->page_base;
+		ppages = buf->pages + (base >> PAGE_CACHE_SHIFT);
+		base &= ~PAGE_CACHE_MASK;
+		avail_page = min_t(unsigned int, PAGE_CACHE_SIZE - base,
+					avail_here);
+		c = kmap(*ppages) + base;
+
+		while (avail_here) {
+			avail_here -= avail_page;
+			if (copied || avail_page < desc->elem_size) {
+				unsigned int l = min(avail_page,
+					desc->elem_size - copied);
+				if (!elem) {
+					elem = kmalloc(desc->elem_size,
+						       GFP_KERNEL);
+					err = -ENOMEM;
+					if (!elem)
+						goto out;
+				}
+				if (encode) {
+					if (!copied) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+					}
+					memcpy(c, elem + copied, l);
+					copied += l;
+					if (copied == desc->elem_size)
+						copied = 0;
+				} else {
+					memcpy(elem + copied, c, l);
+					copied += l;
+					if (copied == desc->elem_size) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+						copied = 0;
+					}
+				}
+				avail_page -= l;
+				c += l;
+			}
+			while (avail_page >= desc->elem_size) {
+				err = desc->xcode(desc, c);
+				if (err)
+					goto out;
+				c += desc->elem_size;
+				avail_page -= desc->elem_size;
+			}
+			if (avail_page) {
+				unsigned int l = min(avail_page,
+					    desc->elem_size - copied);
+				if (!elem) {
+					elem = kmalloc(desc->elem_size,
+						       GFP_KERNEL);
+					err = -ENOMEM;
+					if (!elem)
+						goto out;
+				}
+				if (encode) {
+					if (!copied) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+					}
+					memcpy(c, elem + copied, l);
+					copied += l;
+					if (copied == desc->elem_size)
+						copied = 0;
+				} else {
+					memcpy(elem + copied, c, l);
+					copied += l;
+					if (copied == desc->elem_size) {
+						err = desc->xcode(desc, elem);
+						if (err)
+							goto out;
+						copied = 0;
+					}
+				}
+			}
+			if (avail_here) {
+				kunmap(*ppages);
+				ppages++;
+				c = kmap(*ppages);
+			}
+
+			avail_page = min(avail_here,
+				 (unsigned int) PAGE_CACHE_SIZE);
+		}
+		base = buf->page_len;  /* align to start of tail */
+	}
+
+	/* process tail */
+	base -= buf->page_len;
+	if (todo) {
+		c = buf->tail->iov_base + base;
+		if (copied) {
+			unsigned int l = desc->elem_size - copied;
+
+			if (encode)
+				memcpy(c, elem + copied, l);
+			else {
+				memcpy(elem + copied, c, l);
+				err = desc->xcode(desc, elem);
+				if (err)
+					goto out;
+			}
+			todo -= l;
+			c += l;
+		}
+		while (todo) {
+			err = desc->xcode(desc, c);
+			if (err)
+				goto out;
+			c += desc->elem_size;
+			todo -= desc->elem_size;
+		}
+	}
+	err = 0;
+
+out:
+	if (elem)
+		kfree(elem);
+	if (ppages)
+		kunmap(*ppages);
+	return err;
+}
+
+int
+xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
+		  struct xdr_array2_desc *desc)
+{
+	if (base >= buf->len)
+		return -EINVAL;
+
+	return xdr_xcode_array2(buf, base, desc, 0);
+}
+
+int
+xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
+		  struct xdr_array2_desc *desc)
+{
+	if ((unsigned long) base + 4 + desc->array_len * desc->elem_size >
+	    buf->head->iov_len + buf->page_len + buf->tail->iov_len)
+		return -EINVAL;
+
+	return xdr_xcode_array2(buf, base, desc, 1);
+}
-- 
cgit v1.2.3-59-g8ed1b


From 9ba02638e4be28dd4ff724202a640264427c62d1 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Wed, 22 Jun 2005 17:16:24 +0000
Subject: [PATCH] RPC: Allow the sunrpc server to multiplex several programs
 on a single port

 The NFS and NFSACL programs run on the same RPC transport.  This patch adds
 support for this by converting svc_program into a chained list of programs
 (server-side).

 Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
 Signed-off-by: Olaf Kirch <okir@suse.de>
 Signed-off-by: Andrew Morton <akpm@osdl.org>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 include/linux/sunrpc/svc.h |  3 ++-
 net/sunrpc/svc.c           | 35 ++++++++++++++++++-----------------
 2 files changed, 20 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 37003970cf2e..facb94488bb1 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -240,9 +240,10 @@ struct svc_deferred_req {
 };
 
 /*
- * RPC program
+ * List of RPC programs on the same transport endpoint
  */
 struct svc_program {
+	struct svc_program *	pg_next;	/* other programs (same xprt) */
 	u32			pg_prog;	/* program number */
 	unsigned int		pg_lovers;	/* lowest version */
 	unsigned int		pg_hivers;	/* lowest version */
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index a02d424a7409..e9bd91265f70 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -35,20 +35,24 @@ svc_create(struct svc_program *prog, unsigned int bufsize)
 	if (!(serv = (struct svc_serv *) kmalloc(sizeof(*serv), GFP_KERNEL)))
 		return NULL;
 	memset(serv, 0, sizeof(*serv));
+	serv->sv_name      = prog->pg_name;
 	serv->sv_program   = prog;
 	serv->sv_nrthreads = 1;
 	serv->sv_stats     = prog->pg_stats;
 	serv->sv_bufsz	   = bufsize? bufsize : 4096;
-	prog->pg_lovers = prog->pg_nvers-1;
 	xdrsize = 0;
-	for (vers=0; vers<prog->pg_nvers ; vers++)
-		if (prog->pg_vers[vers]) {
-			prog->pg_hivers = vers;
-			if (prog->pg_lovers > vers)
-				prog->pg_lovers = vers;
-			if (prog->pg_vers[vers]->vs_xdrsize > xdrsize)
-				xdrsize = prog->pg_vers[vers]->vs_xdrsize;
-		}
+	while (prog) {
+		prog->pg_lovers = prog->pg_nvers-1;
+		for (vers=0; vers<prog->pg_nvers ; vers++)
+			if (prog->pg_vers[vers]) {
+				prog->pg_hivers = vers;
+				if (prog->pg_lovers > vers)
+					prog->pg_lovers = vers;
+				if (prog->pg_vers[vers]->vs_xdrsize > xdrsize)
+					xdrsize = prog->pg_vers[vers]->vs_xdrsize;
+			}
+		prog = prog->pg_next;
+	}
 	serv->sv_xdrsize   = xdrsize;
 	INIT_LIST_HEAD(&serv->sv_threads);
 	INIT_LIST_HEAD(&serv->sv_sockets);
@@ -56,8 +60,6 @@ svc_create(struct svc_program *prog, unsigned int bufsize)
 	INIT_LIST_HEAD(&serv->sv_permsocks);
 	spin_lock_init(&serv->sv_lock);
 
-	serv->sv_name      = prog->pg_name;
-
 	/* Remove any stale portmap registrations */
 	svc_register(serv, 0, 0);
 
@@ -339,7 +341,10 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp)
 		goto sendit;
 	}
 		
-	if (prog != progp->pg_prog)
+	for (progp = serv->sv_program; progp; progp = progp->pg_next)
+		if (prog == progp->pg_prog)
+			break;
+	if (progp == NULL)
 		goto err_bad_prog;
 
 	if (vers >= progp->pg_nvers ||
@@ -452,11 +457,7 @@ err_bad_auth:
 	goto sendit;
 
 err_bad_prog:
-#ifdef RPC_PARANOIA
-	if (prog != 100227 || progp->pg_prog != 100003)
-		printk("svc: unknown program %d (me %d)\n", prog, progp->pg_prog);
-	/* else it is just a Solaris client seeing if ACLs are supported */
-#endif
+	dprintk("svc: unknown program %d\n", prog);
 	serv->sv_stats->rpcbadfmt++;
 	svc_putu32(resv, rpc_prog_unavail);
 	goto sendit;
-- 
cgit v1.2.3-59-g8ed1b


From a257cdd0e2179630d3201c32ba14d7fcb3c3a055 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Wed, 22 Jun 2005 17:16:26 +0000
Subject: [PATCH] NFSD: Add server support for NFSv3 ACLs.

 This adds functions for encoding and decoding POSIX ACLs for the NFSACL
 protocol extension, and the GETACL and SETACL RPCs.  The implementation is
 compatible with NFSACL in Solaris.

 Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
 Acked-by: Olaf Kirch <okir@suse.de>
 Signed-off-by: Andrew Morton <akpm@osdl.org>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/Kconfig                 |  24 ++++
 fs/Makefile                |   1 +
 fs/nfs_common/Makefile     |   7 +
 fs/nfs_common/nfsacl.c     | 257 ++++++++++++++++++++++++++++++++++
 fs/nfsd/Makefile           |   2 +
 fs/nfsd/nfs2acl.c          | 336 +++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs3acl.c          | 267 +++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs3xdr.c          |  13 ++
 fs/nfsd/nfssvc.c           |  27 ++++
 fs/nfsd/nfsxdr.c           |  11 ++
 fs/nfsd/vfs.c              | 107 ++++++++++++++-
 include/linux/nfsacl.h     |  58 ++++++++
 include/linux/nfsd/nfsd.h  |  16 +++
 include/linux/nfsd/xdr.h   |   4 +
 include/linux/nfsd/xdr3.h  |  26 ++++
 include/linux/sunrpc/svc.h |  11 ++
 16 files changed, 1166 insertions(+), 1 deletion(-)
 create mode 100644 fs/nfs_common/Makefile
 create mode 100644 fs/nfs_common/nfsacl.c
 create mode 100644 fs/nfsd/nfs2acl.c
 create mode 100644 fs/nfsd/nfs3acl.c
 create mode 100644 include/linux/nfsacl.h

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index 178e27494b74..d44b04d9b0a9 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1353,6 +1353,7 @@ config NFSD
 	select LOCKD
 	select SUNRPC
 	select EXPORTFS
+	select NFS_ACL_SUPPORT if NFSD_V3_ACL || NFSD_V2_ACL
 	help
 	  If you want your Linux box to act as an NFS *server*, so that other
 	  computers on your local network which support NFS can access certain
@@ -1376,6 +1377,10 @@ config NFSD
 	  To compile the NFS server support as a module, choose M here: the
 	  module will be called nfsd.  If unsure, say N.
 
+config NFSD_V2_ACL
+	bool
+	depends on NFSD
+
 config NFSD_V3
 	bool "Provide NFSv3 server support"
 	depends on NFSD
@@ -1383,6 +1388,16 @@ config NFSD_V3
 	  If you would like to include the NFSv3 server as well as the NFSv2
 	  server, say Y here.  If unsure, say Y.
 
+config NFSD_V3_ACL
+	bool "Provide server support for the NFSv3 ACL protocol extension"
+	depends on NFSD_V3
+	select NFSD_V2_ACL
+	help
+	  Implement the NFSv3 ACL protocol extension for manipulating POSIX
+	  Access Control Lists on exported file systems. NFS clients should
+	  be compiled with the NFSv3 ACL protocol extension; see the
+	  CONFIG_NFS_V3_ACL option.  If unsure, say N.
+
 config NFSD_V4
 	bool "Provide NFSv4 server support (EXPERIMENTAL)"
 	depends on NFSD_V3 && EXPERIMENTAL
@@ -1427,6 +1442,15 @@ config LOCKD_V4
 config EXPORTFS
 	tristate
 
+config NFS_ACL_SUPPORT
+	tristate
+	select FS_POSIX_ACL
+
+config NFS_COMMON
+	bool
+	depends on NFSD || NFS_FS
+	default y
+
 config SUNRPC
 	tristate
 
diff --git a/fs/Makefile b/fs/Makefile
index 443f2bc56ccf..fc92e59e9faf 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_BINFMT_FLAT)	+= binfmt_flat.o
 
 obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o
 obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o xattr_acl.o
+obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
 
 obj-$(CONFIG_QUOTA)		+= dquot.o
 obj-$(CONFIG_QFMT_V1)		+= quota_v1.o
diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile
new file mode 100644
index 000000000000..f689ed82af3a
--- /dev/null
+++ b/fs/nfs_common/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for Linux filesystem routines that are shared by client and server.
+#
+
+obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o
+
+nfs_acl-objs := nfsacl.o
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
new file mode 100644
index 000000000000..18c58c32e326
--- /dev/null
+++ b/fs/nfs_common/nfsacl.c
@@ -0,0 +1,257 @@
+/*
+ * fs/nfs_common/nfsacl.c
+ *
+ *  Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de>
+ */
+
+/*
+ * The Solaris nfsacl protocol represents some ACLs slightly differently
+ * than POSIX 1003.1e draft 17 does (and we do):
+ *
+ *  - Minimal ACLs always have an ACL_MASK entry, so they have
+ *    four instead of three entries.
+ *  - The ACL_MASK entry in such minimal ACLs always has the same
+ *    permissions as the ACL_GROUP_OBJ entry. (In extended ACLs
+ *    the ACL_MASK and ACL_GROUP_OBJ entries may differ.)
+ *  - The identifier fields of the ACL_USER_OBJ and ACL_GROUP_OBJ
+ *    entries contain the identifiers of the owner and owning group.
+ *    (In POSIX ACLs we always set them to ACL_UNDEFINED_ID).
+ *  - ACL entries in the kernel are kept sorted in ascending order
+ *    of (e_tag, e_id). Solaris ACLs are unsorted.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/sunrpc/xdr.h>
+#include <linux/nfsacl.h>
+#include <linux/nfs3.h>
+#include <linux/sort.h>
+
+MODULE_LICENSE("GPL");
+
+EXPORT_SYMBOL(nfsacl_encode);
+EXPORT_SYMBOL(nfsacl_decode);
+
+struct nfsacl_encode_desc {
+	struct xdr_array2_desc desc;
+	unsigned int count;
+	struct posix_acl *acl;
+	int typeflag;
+	uid_t uid;
+	gid_t gid;
+};
+
+static int
+xdr_nfsace_encode(struct xdr_array2_desc *desc, void *elem)
+{
+	struct nfsacl_encode_desc *nfsacl_desc =
+		(struct nfsacl_encode_desc *) desc;
+	u32 *p = (u32 *) elem;
+
+	if (nfsacl_desc->count < nfsacl_desc->acl->a_count) {
+		struct posix_acl_entry *entry =
+			&nfsacl_desc->acl->a_entries[nfsacl_desc->count++];
+
+		*p++ = htonl(entry->e_tag | nfsacl_desc->typeflag);
+		switch(entry->e_tag) {
+			case ACL_USER_OBJ:
+				*p++ = htonl(nfsacl_desc->uid);
+				break;
+			case ACL_GROUP_OBJ:
+				*p++ = htonl(nfsacl_desc->gid);
+				break;
+			case ACL_USER:
+			case ACL_GROUP:
+				*p++ = htonl(entry->e_id);
+				break;
+			default:  /* Solaris depends on that! */
+				*p++ = 0;
+				break;
+		}
+		*p++ = htonl(entry->e_perm & S_IRWXO);
+	} else {
+		const struct posix_acl_entry *pa, *pe;
+		int group_obj_perm = ACL_READ|ACL_WRITE|ACL_EXECUTE;
+
+		FOREACH_ACL_ENTRY(pa, nfsacl_desc->acl, pe) {
+			if (pa->e_tag == ACL_GROUP_OBJ) {
+				group_obj_perm = pa->e_perm & S_IRWXO;
+				break;
+			}
+		}
+		/* fake up ACL_MASK entry */
+		*p++ = htonl(ACL_MASK | nfsacl_desc->typeflag);
+		*p++ = htonl(0);
+		*p++ = htonl(group_obj_perm);
+	}
+
+	return 0;
+}
+
+unsigned int
+nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
+	      struct posix_acl *acl, int encode_entries, int typeflag)
+{
+	int entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0;
+	struct nfsacl_encode_desc nfsacl_desc = {
+		.desc = {
+			.elem_size = 12,
+			.array_len = encode_entries ? entries : 0,
+			.xcode = xdr_nfsace_encode,
+		},
+		.acl = acl,
+		.typeflag = typeflag,
+		.uid = inode->i_uid,
+		.gid = inode->i_gid,
+	};
+	int err;
+
+	if (entries > NFS_ACL_MAX_ENTRIES ||
+	    xdr_encode_word(buf, base, entries))
+		return -EINVAL;
+	err = xdr_encode_array2(buf, base + 4, &nfsacl_desc.desc);
+	if (!err)
+		err = 8 + nfsacl_desc.desc.elem_size *
+			  nfsacl_desc.desc.array_len;
+	return err;
+}
+
+struct nfsacl_decode_desc {
+	struct xdr_array2_desc desc;
+	unsigned int count;
+	struct posix_acl *acl;
+};
+
+static int
+xdr_nfsace_decode(struct xdr_array2_desc *desc, void *elem)
+{
+	struct nfsacl_decode_desc *nfsacl_desc =
+		(struct nfsacl_decode_desc *) desc;
+	u32 *p = (u32 *) elem;
+	struct posix_acl_entry *entry;
+
+	if (!nfsacl_desc->acl) {
+		if (desc->array_len > NFS_ACL_MAX_ENTRIES)
+			return -EINVAL;
+		nfsacl_desc->acl = posix_acl_alloc(desc->array_len, GFP_KERNEL);
+		if (!nfsacl_desc->acl)
+			return -ENOMEM;
+		nfsacl_desc->count = 0;
+	}
+
+	entry = &nfsacl_desc->acl->a_entries[nfsacl_desc->count++];
+	entry->e_tag = ntohl(*p++) & ~NFS_ACL_DEFAULT;
+	entry->e_id = ntohl(*p++);
+	entry->e_perm = ntohl(*p++);
+
+	switch(entry->e_tag) {
+		case ACL_USER_OBJ:
+		case ACL_USER:
+		case ACL_GROUP_OBJ:
+		case ACL_GROUP:
+		case ACL_OTHER:
+			if (entry->e_perm & ~S_IRWXO)
+				return -EINVAL;
+			break;
+		case ACL_MASK:
+			/* Solaris sometimes sets additional bits in the mask */
+			entry->e_perm &= S_IRWXO;
+			break;
+		default:
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+cmp_acl_entry(const void *x, const void *y)
+{
+	const struct posix_acl_entry *lhs = x;
+	const struct posix_acl_entry *rhs = y;
+
+	/* Order by tag first; entries with equal tags order by id. */
+	if (lhs->e_tag != rhs->e_tag)
+		return lhs->e_tag - rhs->e_tag;
+	if (lhs->e_id == rhs->e_id)
+		return 0;
+	/* ids are compared, not subtracted */
+	return (lhs->e_id > rhs->e_id) ? 1 : -1;
+}
+
+/*
+ * Convert from a Solaris ACL to a POSIX 1003.1e draft 17 ACL.
+ */
+static int
+posix_acl_from_nfsacl(struct posix_acl *acl)
+{
+	struct posix_acl_entry *pa, *pe,
+	       *group_obj = NULL, *mask = NULL;
+
+	if (!acl)
+		return 0;
+
+	sort(acl->a_entries, acl->a_count, sizeof(struct posix_acl_entry),
+	     cmp_acl_entry, NULL);
+
+	/* Clear undefined identifier fields and find the ACL_GROUP_OBJ
+	   and ACL_MASK entries. */
+	FOREACH_ACL_ENTRY(pa, acl, pe) {
+		switch(pa->e_tag) {
+			case ACL_USER_OBJ:
+				pa->e_id = ACL_UNDEFINED_ID;
+				break;
+			case ACL_GROUP_OBJ:
+				pa->e_id = ACL_UNDEFINED_ID;
+				group_obj = pa;
+				break;
+			case ACL_MASK:
+				mask = pa;
+				/* fall through */
+			case ACL_OTHER:
+				pa->e_id = ACL_UNDEFINED_ID;
+				break;
+		}
+	}
+	if (acl->a_count == 4 && group_obj && mask &&
+	    mask->e_perm == group_obj->e_perm) {
+		/* remove bogus ACL_MASK entry */
+		memmove(mask, mask+1, (3 - (mask - acl->a_entries)) *
+				      sizeof(struct posix_acl_entry));
+		acl->a_count = 3;
+	}
+	return 0;
+}
+
+unsigned int
+nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
+	      struct posix_acl **pacl)
+{
+	/* Returns the encoded length (8 + 12 per entry) or a negative errno. */
+	struct nfsacl_decode_desc nfsacl_desc = {
+		.desc = { .elem_size = 12,
+			  .xcode = pacl ? xdr_nfsace_decode : NULL, },
+	};
+	u32 entries;
+	int err;
+
+	if (xdr_decode_word(buf, base, &entries) ||
+	    entries > NFS_ACL_MAX_ENTRIES)
+		return -EINVAL;
+	err = xdr_decode_array2(buf, base + 4, &nfsacl_desc.desc);
+	if (!err && pacl && (entries != nfsacl_desc.desc.array_len ||
+			     posix_acl_from_nfsacl(nfsacl_desc.acl) != 0))
+		err = -EINVAL;
+	if (err) {
+		/* The element callback may already have allocated the ACL. */
+		posix_acl_release(nfsacl_desc.acl);
+		return err;
+	}
+	if (pacl)
+		*pacl = nfsacl_desc.acl;
+	if (aclcnt)
+		*aclcnt = entries;
+	/* entry count word + array length word + the entries themselves */
+	return 8 + nfsacl_desc.desc.elem_size *
+		   nfsacl_desc.desc.array_len;
+}
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index b8680a247f8b..9f043f44c92f 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -6,7 +6,9 @@ obj-$(CONFIG_NFSD)	+= nfsd.o
 
 nfsd-y 			:= nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
 			   export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
+nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
 nfsd-$(CONFIG_NFSD_V3)	+= nfs3proc.o nfs3xdr.o
+nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
 nfsd-$(CONFIG_NFSD_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
 			   nfs4acl.o nfs4callback.o
 nfsd-objs		:= $(nfsd-y)
diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
new file mode 100644
index 000000000000..7cbf0682b2f0
--- /dev/null
+++ b/fs/nfsd/nfs2acl.c
@@ -0,0 +1,336 @@
+/*
+ * linux/fs/nfsd/nfs2acl.c
+ *
+ * Process version 2 NFSACL requests.
+ *
+ * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de>
+ */
+
+#include <linux/sunrpc/svc.h>
+#include <linux/nfs.h>
+#include <linux/nfsd/nfsd.h>
+#include <linux/nfsd/cache.h>
+#include <linux/nfsd/xdr.h>
+#include <linux/nfsd/xdr3.h>
+#include <linux/posix_acl.h>
+#include <linux/nfsacl.h>
+
+#define NFSDDBG_FACILITY		NFSDDBG_PROC
+#define RETURN_STATUS(st)	{ resp->status = (st); return (st); }
+
+/*
+ * NULL call.
+ */
+static int
+nfsacld_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
+{
+	return nfs_ok;
+}
+
+/*
+ * Get the Access and/or Default ACL of a file.
+ */
+static int nfsacld_proc_getacl(struct svc_rqst * rqstp,
+		struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
+{
+	svc_fh *fh;
+	struct posix_acl *acl;
+	int nfserr = 0;
+
+	dprintk("nfsd: GETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));
+
+	/* Pass the fh_verify status through instead of masking it. */
+	fh = fh_copy(&resp->fh, &argp->fh);
+	if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP)))
+		RETURN_STATUS(nfserr);
+
+	if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
+		RETURN_STATUS(nfserr_inval);
+	resp->mask = argp->mask;
+
+	if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
+		acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS);
+		/* -ENODATA and -EOPNOTSUPP are treated as "no ACL". */
+		if (IS_ERR(acl) && (PTR_ERR(acl) == -ENODATA ||
+				    PTR_ERR(acl) == -EOPNOTSUPP))
+			acl = NULL;
+		if (acl == NULL) {
+			/* Solaris returns the inode's minimum ACL. */
+			struct inode *inode = fh->fh_dentry->d_inode;
+			acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+		}
+		/* Covers both the lookup and the allocation above. */
+		if (IS_ERR(acl)) {
+			nfserr = nfserrno(PTR_ERR(acl));
+			goto fail;
+		}
+		resp->acl_access = acl;
+	}
+	if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
+		/* Check how Solaris handles requests for the Default ACL
+		   of a non-directory! */
+		acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl) && (PTR_ERR(acl) == -ENODATA ||
+				    PTR_ERR(acl) == -EOPNOTSUPP))
+			acl = NULL;
+		if (IS_ERR(acl)) {
+			nfserr = nfserrno(PTR_ERR(acl));
+			goto fail;
+		}
+		resp->acl_default = acl;
+	}
+
+	/* resp->acl_{access,default} are released in
+	   nfsaclsvc_release_getacl. */
+	RETURN_STATUS(0);
+
+fail:
+	/* nfsaclsvc_release_getacl puts both pointers again if it runs
+	   after a failure, so clear them once they have been dropped. */
+	posix_acl_release(resp->acl_access);
+	resp->acl_access = NULL;
+	posix_acl_release(resp->acl_default);
+	resp->acl_default = NULL;
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Set the Access and/or Default ACL of a file.
+ */
+static int nfsacld_proc_setacl(struct svc_rqst * rqstp,
+		struct nfsd3_setaclargs *argp,
+		struct nfsd_attrstat *resp)
+{
+	svc_fh *fh;
+	int nfserr = 0;
+
+	dprintk("nfsd: SETACL(2acl)   %s\n", SVCFH_fmt(&argp->fh));
+
+	fh = fh_copy(&resp->fh, &argp->fh);
+	nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+
+	if (!nfserr) {
+		nfserr = nfserrno( nfsd_set_posix_acl(
+			fh, ACL_TYPE_ACCESS, argp->acl_access) );
+	}
+	if (!nfserr) {
+		nfserr = nfserrno( nfsd_set_posix_acl(
+			fh, ACL_TYPE_DEFAULT, argp->acl_default) );
+	}
+
+	/* argp->acl_{access,default} may have been allocated in
+	   nfsaclsvc_decode_setaclargs. */
+	posix_acl_release(argp->acl_access);
+	posix_acl_release(argp->acl_default);
+	return nfserr;
+}
+
+/*
+ * Check file attributes
+ */
+static int nfsacld_proc_getattr(struct svc_rqst * rqstp,
+		struct nfsd_fhandle *argp, struct nfsd_attrstat *resp)
+{
+	dprintk("nfsd: GETATTR  %s\n", SVCFH_fmt(&argp->fh));
+
+	fh_copy(&resp->fh, &argp->fh);
+	return fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+}
+
+/*
+ * Check file access
+ */
+static int nfsacld_proc_access(struct svc_rqst *rqstp, struct nfsd3_accessargs *argp,
+		struct nfsd3_accessres *resp)
+{
+	int nfserr;
+
+	dprintk("nfsd: ACCESS(2acl)   %s 0x%x\n",
+			SVCFH_fmt(&argp->fh),
+			argp->access);
+
+	fh_copy(&resp->fh, &argp->fh);
+	resp->access = argp->access;
+	nfserr = nfsd_access(rqstp, &resp->fh, &resp->access, NULL);
+	return nfserr;
+}
+
+/*
+ * XDR decode functions
+ */
+static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
+		struct nfsd3_getaclargs *argp)
+{
+	if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
+		return 0;
+	argp->mask = ntohl(*p); p++;
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+
+static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
+		struct nfsd3_setaclargs *argp)
+{
+	struct kvec *head = rqstp->rq_arg.head;
+	unsigned int base;
+	int n;
+
+	if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
+		return 0;
+	argp->mask = ntohl(*p++);
+	if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) ||
+	    !xdr_argsize_check(rqstp, p))
+		return 0;
+
+	base = (char *)p - (char *)head->iov_base;
+	n = nfsacl_decode(&rqstp->rq_arg, base, NULL,
+			  (argp->mask & NFS_ACL) ?
+			  &argp->acl_access : NULL);
+	if (n > 0)
+		n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL,
+				  (argp->mask & NFS_DFACL) ?
+				  &argp->acl_default : NULL);
+	return (n > 0);
+}
+
+static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, u32 *p,
+		struct nfsd_fhandle *argp)
+{
+	if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
+		return 0;
+	return xdr_argsize_check(rqstp, p);
+}
+
+static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, u32 *p,
+		struct nfsd3_accessargs *argp)
+{
+	if (!(p = nfs2svc_decode_fh(p, &argp->fh)))
+		return 0;
+	argp->access = ntohl(*p++);
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+/*
+ * XDR encode functions
+ */
+
+/* GETACL */
+static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
+		struct nfsd3_getaclres *resp)
+{
+	struct dentry *dentry = resp->fh.fh_dentry;
+	struct inode *inode;	/* assigned only after dentry is checked */
+	int w = nfsacl_size(
+		(resp->mask & NFS_ACL)   ? resp->acl_access  : NULL,
+		(resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
+	struct kvec *head = rqstp->rq_res.head;
+	unsigned int base;
+	int n;
+
+	if (dentry == NULL || dentry->d_inode == NULL)
+		return 0;
+	inode = dentry->d_inode;
+
+	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh);
+	*p++ = htonl(resp->mask);
+	if (!xdr_ressize_check(rqstp, p))
+		return 0;
+	base = (char *)p - (char *)head->iov_base;
+	/* Take reply pages until the size from nfsacl_size is covered. */
+	rqstp->rq_res.page_len = w;
+	while (w > 0) {
+		if (!svc_take_res_page(rqstp))
+			return 0;
+		w -= PAGE_SIZE;
+	}
+	/* Access ACL first; the default ACL follows n bytes later. */
+	n = nfsacl_encode(&rqstp->rq_res, base, inode,
+			  resp->acl_access,
+			  resp->mask & NFS_ACL, 0);
+	if (n > 0)
+		n = nfsacl_encode(&rqstp->rq_res, base + n, inode,
+				  resp->acl_default,
+				  resp->mask & NFS_DFACL,
+				  NFS_ACL_DEFAULT);
+	if (n <= 0)
+		return 0;
+	return 1;
+}
+
+static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, u32 *p,
+		struct nfsd_attrstat *resp)
+{
+	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh);
+	return xdr_ressize_check(rqstp, p);
+}
+
+/* ACCESS */
+static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, u32 *p,
+		struct nfsd3_accessres *resp)
+{
+	p = nfs2svc_encode_fattr(rqstp, p, &resp->fh);
+	*p++ = htonl(resp->access);
+	return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * XDR release functions
+ */
+static int nfsaclsvc_release_getacl(struct svc_rqst *rqstp, u32 *p,
+		struct nfsd3_getaclres *resp)
+{
+	fh_put(&resp->fh);
+	posix_acl_release(resp->acl_access);
+	posix_acl_release(resp->acl_default);
+	return 1;
+}
+
+static int nfsaclsvc_release_fhandle(struct svc_rqst *rqstp, u32 *p,
+		struct nfsd_fhandle *resp)
+{
+	fh_put(&resp->fh);
+	return 1;
+}
+
+#define nfsaclsvc_decode_voidargs	NULL
+#define nfsaclsvc_encode_voidres	NULL
+#define nfsaclsvc_release_void		NULL
+#define nfsd3_fhandleargs	nfsd_fhandle
+#define nfsd3_attrstatres	nfsd_attrstat
+#define nfsd3_voidres		nfsd3_voidargs
+struct nfsd3_voidargs { int dummy; };
+
+#define PROC(name, argt, rest, relt, cache, respsize)	\
+ { (svc_procfunc) nfsacld_proc_##name,		\
+   (kxdrproc_t) nfsaclsvc_decode_##argt##args,	\
+   (kxdrproc_t) nfsaclsvc_encode_##rest##res,	\
+   (kxdrproc_t) nfsaclsvc_release_##relt,		\
+   sizeof(struct nfsd3_##argt##args),		\
+   sizeof(struct nfsd3_##rest##res),		\
+   0,						\
+   cache,					\
+   respsize,					\
+ }
+
+#define ST 1		/* status*/
+#define AT 21		/* attributes */
+#define pAT (1+AT)	/* post attributes - conditional */
+#define ACL (1+NFS_ACL_MAX_ENTRIES*3)  /* Access Control List */
+
+static struct svc_procedure		nfsd_acl_procedures2[] = {
+  PROC(null,	void,		void,		void,	  RC_NOCACHE, ST),
+  PROC(getacl,	getacl,		getacl,		getacl,	  RC_NOCACHE, ST+1+2*(1+ACL)),
+  PROC(setacl,	setacl,		attrstat,	fhandle,  RC_NOCACHE, ST+AT),
+  PROC(getattr, fhandle,	attrstat,	fhandle,  RC_NOCACHE, ST+AT),
+  PROC(access,	access,		access,		fhandle,  RC_NOCACHE, ST+AT+1),
+};
+
+struct svc_version	nfsd_acl_version2 = {
+		.vs_vers	= 2,
+		.vs_nproc	= 5,
+		.vs_proc	= nfsd_acl_procedures2,
+		.vs_dispatch	= nfsd_dispatch,
+		.vs_xdrsize	= NFS3_SVC_XDRSIZE,
+};
diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
new file mode 100644
index 000000000000..64ba40572fea
--- /dev/null
+++ b/fs/nfsd/nfs3acl.c
@@ -0,0 +1,267 @@
+/*
+ * linux/fs/nfsd/nfs3acl.c
+ *
+ * Process version 3 NFSACL requests.
+ *
+ * Copyright (C) 2002-2003 Andreas Gruenbacher <agruen@suse.de>
+ */
+
+#include <linux/sunrpc/svc.h>
+#include <linux/nfs3.h>
+#include <linux/nfsd/nfsd.h>
+#include <linux/nfsd/cache.h>
+#include <linux/nfsd/xdr3.h>
+#include <linux/posix_acl.h>
+#include <linux/nfsacl.h>
+
+#define RETURN_STATUS(st)	{ resp->status = (st); return (st); }
+
+/*
+ * NULL call.
+ */
+static int
+nfsd3_proc_null(struct svc_rqst *rqstp, void *argp, void *resp)
+{
+	return nfs_ok;
+}
+
+/*
+ * Get the Access and/or Default ACL of a file.
+ */
+static int nfsd3_proc_getacl(struct svc_rqst * rqstp,
+		struct nfsd3_getaclargs *argp, struct nfsd3_getaclres *resp)
+{
+	svc_fh *fh;
+	struct posix_acl *acl;
+	int nfserr = 0;
+
+	/* Pass the fh_verify status through instead of masking it. */
+	fh = fh_copy(&resp->fh, &argp->fh);
+	if ((nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP)))
+		RETURN_STATUS(nfserr);
+
+	if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
+		RETURN_STATUS(nfserr_inval);
+	resp->mask = argp->mask;
+
+	if (resp->mask & (NFS_ACL|NFS_ACLCNT)) {
+		acl = nfsd_get_posix_acl(fh, ACL_TYPE_ACCESS);
+		/* -ENODATA and -EOPNOTSUPP are treated as "no ACL". */
+		if (IS_ERR(acl) && (PTR_ERR(acl) == -ENODATA ||
+				    PTR_ERR(acl) == -EOPNOTSUPP))
+			acl = NULL;
+		if (acl == NULL) {
+			/* Solaris returns the inode's minimum ACL. */
+			struct inode *inode = fh->fh_dentry->d_inode;
+			acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+		}
+		/* Covers both the lookup and the allocation above. */
+		if (IS_ERR(acl)) {
+			nfserr = nfserrno(PTR_ERR(acl));
+			goto fail;
+		}
+		resp->acl_access = acl;
+	}
+	if (resp->mask & (NFS_DFACL|NFS_DFACLCNT)) {
+		/* Check how Solaris handles requests for the Default ACL
+		   of a non-directory! */
+		acl = nfsd_get_posix_acl(fh, ACL_TYPE_DEFAULT);
+		/* Same tolerance as for the access ACL. */
+		if (IS_ERR(acl) && (PTR_ERR(acl) == -ENODATA ||
+				    PTR_ERR(acl) == -EOPNOTSUPP))
+			acl = NULL;
+		if (IS_ERR(acl)) {
+			nfserr = nfserrno(PTR_ERR(acl));
+			goto fail;
+		}
+		resp->acl_default = acl;
+	}
+
+	/* resp->acl_{access,default} are released in nfs3svc_release_getacl. */
+	RETURN_STATUS(0);
+
+fail:
+	/* nfs3svc_release_getacl puts both pointers again if it runs
+	   after a failure, so clear them once they have been dropped. */
+	posix_acl_release(resp->acl_access);
+	resp->acl_access = NULL;
+	posix_acl_release(resp->acl_default);
+	resp->acl_default = NULL;
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * Set the Access and/or Default ACL of a file.
+ */
+static int nfsd3_proc_setacl(struct svc_rqst * rqstp,
+		struct nfsd3_setaclargs *argp,
+		struct nfsd3_attrstat *resp)
+{
+	svc_fh *fh;
+	int nfserr = 0;
+
+	fh = fh_copy(&resp->fh, &argp->fh);
+	nfserr = fh_verify(rqstp, &resp->fh, 0, MAY_NOP);
+
+	if (!nfserr) {
+		nfserr = nfserrno( nfsd_set_posix_acl(
+			fh, ACL_TYPE_ACCESS, argp->acl_access) );
+	}
+	if (!nfserr) {
+		nfserr = nfserrno( nfsd_set_posix_acl(
+			fh, ACL_TYPE_DEFAULT, argp->acl_default) );
+	}
+
+	/* argp->acl_{access,default} may have been allocated in
+	   nfs3svc_decode_setaclargs. */
+	posix_acl_release(argp->acl_access);
+	posix_acl_release(argp->acl_default);
+	RETURN_STATUS(nfserr);
+}
+
+/*
+ * XDR decode functions
+ */
+static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, u32 *p,
+		struct nfsd3_getaclargs *args)
+{
+	if (!(p = nfs3svc_decode_fh(p, &args->fh)))
+		return 0;
+	args->mask = ntohl(*p); p++;
+
+	return xdr_argsize_check(rqstp, p);
+}
+
+
+static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, u32 *p,
+		struct nfsd3_setaclargs *args)
+{
+	struct kvec *head = rqstp->rq_arg.head;
+	unsigned int base;
+	int n;
+
+	if (!(p = nfs3svc_decode_fh(p, &args->fh)))
+		return 0;
+	args->mask = ntohl(*p++);
+	if (args->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT) ||
+	    !xdr_argsize_check(rqstp, p))
+		return 0;
+
+	base = (char *)p - (char *)head->iov_base;
+	n = nfsacl_decode(&rqstp->rq_arg, base, NULL,
+			  (args->mask & NFS_ACL) ?
+			  &args->acl_access : NULL);
+	if (n > 0)
+		n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL,
+				  (args->mask & NFS_DFACL) ?
+				  &args->acl_default : NULL);
+	return (n > 0);
+}
+
+/*
+ * XDR encode functions
+ */
+
+/* GETACL */
+static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, u32 *p,
+		struct nfsd3_getaclres *resp)
+{
+	struct dentry *dentry = resp->fh.fh_dentry;
+
+	p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh);
+	if (resp->status == 0 && dentry && dentry->d_inode) {
+		struct inode *inode = dentry->d_inode;
+		int w = nfsacl_size(
+			(resp->mask & NFS_ACL)   ? resp->acl_access  : NULL,
+			(resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
+		struct kvec *head = rqstp->rq_res.head;
+		unsigned int base;
+		int n;
+
+		*p++ = htonl(resp->mask);
+		if (!xdr_ressize_check(rqstp, p))
+			return 0;
+		base = (char *)p - (char *)head->iov_base;
+
+		rqstp->rq_res.page_len = w;
+		while (w > 0) {
+			if (!svc_take_res_page(rqstp))
+				return 0;
+			w -= PAGE_SIZE;
+		}
+
+		n = nfsacl_encode(&rqstp->rq_res, base, inode,
+				  resp->acl_access,
+				  resp->mask & NFS_ACL, 0);
+		if (n > 0)
+			n = nfsacl_encode(&rqstp->rq_res, base + n, inode,
+					  resp->acl_default,
+					  resp->mask & NFS_DFACL,
+					  NFS_ACL_DEFAULT);
+		if (n <= 0)
+			return 0;
+	} else
+		if (!xdr_ressize_check(rqstp, p))
+			return 0;
+
+	return 1;
+}
+
+/* SETACL */
+static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, u32 *p,
+		struct nfsd3_attrstat *resp)
+{
+	p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh);
+
+	return xdr_ressize_check(rqstp, p);
+}
+
+/*
+ * XDR release functions
+ */
+static int nfs3svc_release_getacl(struct svc_rqst *rqstp, u32 *p,
+		struct nfsd3_getaclres *resp)
+{
+	fh_put(&resp->fh);
+	posix_acl_release(resp->acl_access);
+	posix_acl_release(resp->acl_default);
+	return 1;
+}
+
+#define nfs3svc_decode_voidargs		NULL
+#define nfs3svc_release_void		NULL
+#define nfsd3_setaclres			nfsd3_attrstat
+#define nfsd3_voidres			nfsd3_voidargs
+struct nfsd3_voidargs { int dummy; };
+
+#define PROC(name, argt, rest, relt, cache, respsize)	\
+ { (svc_procfunc) nfsd3_proc_##name,		\
+   (kxdrproc_t) nfs3svc_decode_##argt##args,	\
+   (kxdrproc_t) nfs3svc_encode_##rest##res,	\
+   (kxdrproc_t) nfs3svc_release_##relt,		\
+   sizeof(struct nfsd3_##argt##args),		\
+   sizeof(struct nfsd3_##rest##res),		\
+   0,						\
+   cache,					\
+   respsize,					\
+ }
+
+#define ST 1		/* status*/
+#define AT 21		/* attributes */
+#define pAT (1+AT)	/* post attributes - conditional */
+#define ACL (1+NFS_ACL_MAX_ENTRIES*3)  /* Access Control List */
+
+static struct svc_procedure		nfsd_acl_procedures3[] = {
+  PROC(null,	void,		void,		void,	  RC_NOCACHE, ST),
+  PROC(getacl,	getacl,		getacl,		getacl,	  RC_NOCACHE, ST+1+2*(1+ACL)),
+  PROC(setacl,	setacl,		setacl,		fhandle,  RC_NOCACHE, ST+pAT),
+};
+
+struct svc_version	nfsd_acl_version3 = {
+		.vs_vers	= 3,
+		.vs_nproc	= 3,
+		.vs_proc	= nfsd_acl_procedures3,
+		.vs_dispatch	= nfsd_dispatch,
+		.vs_xdrsize	= NFS3_SVC_XDRSIZE,
+};
+
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 11f806835c5a..e0e134d6baba 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -71,6 +71,12 @@ decode_fh(u32 *p, struct svc_fh *fhp)
 	return p + XDR_QUADLEN(size);
 }
 
+/* Helper function for NFSv3 ACL code */
+u32 *nfs3svc_decode_fh(u32 *p, struct svc_fh *fhp)
+{
+	return decode_fh(p, fhp);
+}
+
 static inline u32 *
 encode_fh(u32 *p, struct svc_fh *fhp)
 {
@@ -233,6 +239,13 @@ encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 	return p;
 }
 
+/* Helper for NFSv3 ACLs */
+u32 *
+nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+{
+	return encode_post_op_attr(rqstp, p, fhp);
+}
+
 /*
  * Enocde weak cache consistency data
  */
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 02ded7cfbdcf..79b25b19fec8 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -31,6 +31,7 @@
 #include <linux/nfsd/stats.h>
 #include <linux/nfsd/cache.h>
 #include <linux/lockd/bind.h>
+#include <linux/nfsacl.h>
 
 #define NFSDDBG_FACILITY	NFSDDBG_SVC
 
@@ -362,6 +363,31 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp)
 	return 1;
 }
 
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+static struct svc_stat	nfsd_acl_svcstats;
+static struct svc_version *	nfsd_acl_version[] = {
+	[2] = &nfsd_acl_version2,
+	[3] = &nfsd_acl_version3,
+};
+
+#define NFSD_ACL_NRVERS		(sizeof(nfsd_acl_version)/sizeof(nfsd_acl_version[0]))
+static struct svc_program	nfsd_acl_program = {
+	.pg_prog		= NFS_ACL_PROGRAM,
+	.pg_nvers		= NFSD_ACL_NRVERS,
+	.pg_vers		= nfsd_acl_version,
+	.pg_name		= "nfsd",
+	.pg_stats		= &nfsd_acl_svcstats,
+};
+
+static struct svc_stat	nfsd_acl_svcstats = {
+	.program	= &nfsd_acl_program,
+};
+
+#define nfsd_acl_program_p	&nfsd_acl_program
+#else
+#define nfsd_acl_program_p	NULL
+#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
+
 extern struct svc_version nfsd_version2, nfsd_version3, nfsd_version4;
 
 static struct svc_version *	nfsd_version[] = {
@@ -376,6 +402,7 @@ static struct svc_version *	nfsd_version[] = {
 
 #define NFSD_NRVERS		(sizeof(nfsd_version)/sizeof(nfsd_version[0]))
 struct svc_program		nfsd_program = {
+	.pg_next		= nfsd_acl_program_p,
 	.pg_prog		= NFS_PROGRAM,		/* program number */
 	.pg_nvers		= NFSD_NRVERS,		/* nr of entries in nfsd_version */
 	.pg_vers		= nfsd_version,		/* version table */
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 948b08287c99..b45999ff33e6 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -49,6 +49,12 @@ decode_fh(u32 *p, struct svc_fh *fhp)
 	return p + (NFS_FHSIZE >> 2);
 }
 
+/* Helper function for NFSv2 ACL code */
+u32 *nfs2svc_decode_fh(u32 *p, struct svc_fh *fhp)
+{
+	return decode_fh(p, fhp);
+}
+
 static inline u32 *
 encode_fh(u32 *p, struct svc_fh *fhp)
 {
@@ -190,6 +196,11 @@ encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
 	return p;
 }
 
+/* Helper function for NFSv2 ACL code */
+u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp)
+{
+	return encode_fattr(rqstp, p, fhp);
+}
 
 /*
  * XDR decode functions
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index e3e9d217236e..ae3940dc85cc 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -46,8 +46,9 @@
 #include <linux/nfsd/nfsfh.h>
 #include <linux/quotaops.h>
 #include <linux/dnotify.h>
-#ifdef CONFIG_NFSD_V4
+#include <linux/xattr_acl.h>
 #include <linux/posix_acl.h>
+#ifdef CONFIG_NFSD_V4
 #include <linux/posix_acl_xattr.h>
 #include <linux/xattr_acl.h>
 #include <linux/xattr.h>
@@ -1857,3 +1858,107 @@ nfsd_racache_init(int cache_size)
 	nfsdstats.ra_size = cache_size;
 	return 0;
 }
+
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+struct posix_acl *
+nfsd_get_posix_acl(struct svc_fh *fhp, int type)
+{
+	struct inode *inode = fhp->fh_dentry->d_inode;
+	char *name;
+	void *value = NULL;
+	ssize_t size;
+	struct posix_acl *acl;
+
+	if (!IS_POSIXACL(inode) || !inode->i_op || !inode->i_op->getxattr)
+		return ERR_PTR(-EOPNOTSUPP);
+	switch(type) {
+		case ACL_TYPE_ACCESS:
+			name = XATTR_NAME_ACL_ACCESS;
+			break;
+		case ACL_TYPE_DEFAULT:
+			name = XATTR_NAME_ACL_DEFAULT;
+			break;
+		default:
+			return ERR_PTR(-EOPNOTSUPP);
+	}
+
+	size = inode->i_op->getxattr(fhp->fh_dentry, name, NULL, 0);
+
+	if (size < 0) {
+		acl = ERR_PTR(size);
+		goto getout;
+	} else if (size > 0) {
+		value = kmalloc(size, GFP_KERNEL);
+		if (!value) {
+			acl = ERR_PTR(-ENOMEM);
+			goto getout;
+		}
+		size = inode->i_op->getxattr(fhp->fh_dentry, name, value, size);
+		if (size < 0) {
+			acl = ERR_PTR(size);
+			goto getout;
+		}
+	}
+	acl = posix_acl_from_xattr(value, size);
+
+getout:
+	kfree(value);
+	return acl;
+}
+
+int
+nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
+{
+	struct inode *inode = fhp->fh_dentry->d_inode;
+	char *name;
+	void *value = NULL;
+	size_t size;
+	int error;
+
+	if (!IS_POSIXACL(inode) || !inode->i_op ||
+	    !inode->i_op->setxattr || !inode->i_op->removexattr)
+		return -EOPNOTSUPP;
+	switch(type) {
+		case ACL_TYPE_ACCESS:
+			name = XATTR_NAME_ACL_ACCESS;
+			break;
+		case ACL_TYPE_DEFAULT:
+			name = XATTR_NAME_ACL_DEFAULT;
+			break;
+		default:
+			return -EOPNOTSUPP;
+	}
+	/* Returns 0 or a negative errno.  An empty ACL removes the xattr. */
+	if (acl && acl->a_count) {
+		size = xattr_acl_size(acl->a_count);
+		value = kmalloc(size, GFP_KERNEL);
+		if (!value)
+			return -ENOMEM;
+		/* size is unsigned; test the result as an int first. */
+		error = posix_acl_to_xattr(acl, value, size);
+		if (error < 0)
+			goto getout;
+		size = error;
+	} else
+		size = 0;
+
+	if (!fhp->fh_locked)
+		fh_lock(fhp);  /* unlocking is done automatically */
+	if (size)
+		error = inode->i_op->setxattr(fhp->fh_dentry, name,
+					      value, size, 0);
+	else {
+		if (!S_ISDIR(inode->i_mode) && type == ACL_TYPE_DEFAULT)
+			error = 0;
+		else {
+			error = inode->i_op->removexattr(fhp->fh_dentry, name);
+			if (error == -ENODATA)
+				error = 0;
+		}
+	}
+
+getout:
+	kfree(value);
+	return error;
+}
+#endif  /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
diff --git a/include/linux/nfsacl.h b/include/linux/nfsacl.h
new file mode 100644
index 000000000000..54487a99beb8
--- /dev/null
+++ b/include/linux/nfsacl.h
@@ -0,0 +1,58 @@
+/*
+ * File: linux/nfsacl.h
+ *
+ * (C) 2003 Andreas Gruenbacher <agruen@suse.de>
+ */
+#ifndef __LINUX_NFSACL_H
+#define __LINUX_NFSACL_H
+
+#define NFS_ACL_PROGRAM	100227
+
+#define ACLPROC2_GETACL		1
+#define ACLPROC2_SETACL		2
+#define ACLPROC2_GETATTR	3
+#define ACLPROC2_ACCESS		4
+
+#define ACLPROC3_GETACL		1
+#define ACLPROC3_SETACL		2
+
+
+/* Flags for the getacl/setacl mode */
+#define NFS_ACL			0x0001
+#define NFS_ACLCNT		0x0002
+#define NFS_DFACL		0x0004
+#define NFS_DFACLCNT		0x0008
+
+/* Flag for Default ACL entries */
+#define NFS_ACL_DEFAULT		0x1000
+
+#ifdef __KERNEL__
+
+#include <linux/posix_acl.h>
+
+/* Maximum number of ACL entries over NFS */
+#define NFS_ACL_MAX_ENTRIES	1024
+
+#define NFSACL_MAXWORDS		(2*(2+3*NFS_ACL_MAX_ENTRIES))
+#define NFSACL_MAXPAGES		((2*(8+12*NFS_ACL_MAX_ENTRIES) + PAGE_SIZE-1) \
+				 >> PAGE_SHIFT)
+
+static inline unsigned int
+nfsacl_size(struct posix_acl *acl_access, struct posix_acl *acl_default)
+{
+	unsigned int w = 16;
+	w += max(acl_access ? (int)acl_access->a_count : 3, 4) * 12;
+	if (acl_default)
+		w += max((int)acl_default->a_count, 4) * 12;
+	return w;
+}
+
+extern unsigned int
+nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
+	      struct posix_acl *acl, int encode_entries, int typeflag);
+extern unsigned int
+nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
+	      struct posix_acl **pacl);
+
+#endif /* __KERNEL__ */
+#endif  /* __LINUX_NFSACL_H */
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 8f85d9a59607..4bf931d5ff56 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -15,6 +15,7 @@
 #include <linux/unistd.h>
 #include <linux/dirent.h>
 #include <linux/fs.h>
+#include <linux/posix_acl.h>
 #include <linux/mount.h>
 
 #include <linux/nfsd/debug.h>
@@ -124,6 +125,21 @@ int		nfsd_statfs(struct svc_rqst *, struct svc_fh *,
 int		nfsd_notify_change(struct inode *, struct iattr *);
 int		nfsd_permission(struct svc_export *, struct dentry *, int);
 
+#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+#ifdef CONFIG_NFSD_V2_ACL
+extern struct svc_version nfsd_acl_version2;
+#else
+#define nfsd_acl_version2 NULL
+#endif
+#ifdef CONFIG_NFSD_V3_ACL
+extern struct svc_version nfsd_acl_version3;
+#else
+#define nfsd_acl_version3 NULL
+#endif
+struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int);
+int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *);
+#endif
+
 
 /* 
  * NFSv4 State
diff --git a/include/linux/nfsd/xdr.h b/include/linux/nfsd/xdr.h
index ecccef777dae..130d4f588a37 100644
--- a/include/linux/nfsd/xdr.h
+++ b/include/linux/nfsd/xdr.h
@@ -169,4 +169,8 @@ int nfssvc_encode_entry(struct readdir_cd *, const char *name,
 
 int nfssvc_release_fhandle(struct svc_rqst *, u32 *, struct nfsd_fhandle *);
 
+/* Helper functions for NFSv2 ACL code */
+u32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp);
+u32 *nfs2svc_decode_fh(u32 *p, struct svc_fh *fhp);
+
 #endif /* LINUX_NFSD_H */
diff --git a/include/linux/nfsd/xdr3.h b/include/linux/nfsd/xdr3.h
index 0ae9e0ef5f68..21e18ce7ca63 100644
--- a/include/linux/nfsd/xdr3.h
+++ b/include/linux/nfsd/xdr3.h
@@ -110,6 +110,19 @@ struct nfsd3_commitargs {
 	__u32			count;
 };
 
+struct nfsd3_getaclargs {
+	struct svc_fh		fh;
+	int			mask;
+};
+
+struct posix_acl;
+struct nfsd3_setaclargs {
+	struct svc_fh		fh;
+	int			mask;
+	struct posix_acl	*acl_access;
+	struct posix_acl	*acl_default;
+};
+
 struct nfsd3_attrstat {
 	__u32			status;
 	struct svc_fh		fh;
@@ -209,6 +222,14 @@ struct nfsd3_commitres {
 	struct svc_fh		fh;
 };
 
+struct nfsd3_getaclres {
+	__u32			status;
+	struct svc_fh		fh;
+	int			mask;
+	struct posix_acl	*acl_access;
+	struct posix_acl	*acl_default;
+};
+
 /* dummy type for release */
 struct nfsd3_fhandle_pair {
 	__u32			dummy;
@@ -241,6 +262,7 @@ union nfsd3_xdrstore {
 	struct nfsd3_fsinfores		fsinfores;
 	struct nfsd3_pathconfres	pathconfres;
 	struct nfsd3_commitres		commitres;
+	struct nfsd3_getaclres		getaclres;
 };
 
 #define NFS3_SVC_XDRSIZE		sizeof(union nfsd3_xdrstore)
@@ -316,6 +338,10 @@ int nfs3svc_encode_entry(struct readdir_cd *, const char *name,
 int nfs3svc_encode_entry_plus(struct readdir_cd *, const char *name,
 				int namlen, loff_t offset, ino_t ino,
 				unsigned int);
+/* Helper functions for NFSv3 ACL code */
+u32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, u32 *p,
+				struct svc_fh *fhp);
+u32 *nfs3svc_decode_fh(u32 *p, struct svc_fh *fhp);
 
 
 #endif /* _LINUX_NFSD_XDR3_H */
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index facb94488bb1..5af8800e0ce3 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -185,6 +185,17 @@ xdr_ressize_check(struct svc_rqst *rqstp, u32 *p)
 	return vec->iov_len <= PAGE_SIZE;
 }
 
+static inline struct page *
+svc_take_res_page(struct svc_rqst *rqstp)
+{
+	if (rqstp->rq_arghi <= rqstp->rq_argused)
+		return NULL;
+	rqstp->rq_arghi--;
+	rqstp->rq_respages[rqstp->rq_resused] =
+		rqstp->rq_argpages[rqstp->rq_arghi];
+	return rqstp->rq_respages[rqstp->rq_resused++];
+}
+
 static inline int svc_take_page(struct svc_rqst *rqstp)
 {
 	if (rqstp->rq_arghi <= rqstp->rq_argused)
-- 
cgit v1.2.3-59-g8ed1b


From b7fa0554cf1ba6d6895cd0a5b02989a26e0bc704 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Wed, 22 Jun 2005 17:16:27 +0000
Subject: [PATCH] NFS: Add support for NFSv3 ACLs

 This adds ACL support for NFS clients via the NFSACL protocol extension, by
 implementing the getxattr, listxattr, setxattr, and removexattr iops for the
 system.posix_acl_access and system.posix_acl_default attributes.  This patch
 implements a dumb version that uses no caching (and thus adds some overhead).
 (Another patch in this patchset adds caching as well.)

 Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
 Acked-by: Olaf Kirch <okir@suse.de>
 Signed-off-by: Andrew Morton <akpm@osdl.org>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/Kconfig                |  11 ++
 fs/nfs/Makefile           |   1 +
 fs/nfs/dir.c              |  21 ++++
 fs/nfs/file.c             |  12 ++
 fs/nfs/inode.c            |  36 +++++-
 fs/nfs/nfs3acl.c          | 303 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfs/nfs3proc.c         |   7 +-
 fs/nfs/nfs3xdr.c          | 147 ++++++++++++++++++++++
 fs/nfs/nfsroot.c          |   9 ++
 include/linux/nfs_fs.h    |  31 +++++
 include/linux/nfs_fs_sb.h |   1 +
 include/linux/nfs_mount.h |   1 +
 include/linux/nfs_xdr.h   |  27 +++++
 13 files changed, 601 insertions(+), 6 deletions(-)
 create mode 100644 fs/nfs/nfs3acl.c

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index d44b04d9b0a9..a7c0cc3203cb 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1268,6 +1268,7 @@ config NFS_FS
 	depends on INET
 	select LOCKD
 	select SUNRPC
+	select NFS_ACL_SUPPORT if NFS_V3_ACL
 	help
 	  If you are connected to some other (usually local) Unix computer
 	  (using SLIP, PLIP, PPP or Ethernet) and want to mount files residing
@@ -1310,6 +1311,16 @@ config NFS_V3
 
 	  If unsure, say Y.
 
+config NFS_V3_ACL
+	bool "Provide client support for the NFSv3 ACL protocol extension"
+	depends on NFS_V3
+	help
+	  Implement the NFSv3 ACL protocol extension for manipulating POSIX
+	  Access Control Lists.  The server should also be compiled with
+	  the NFSv3 ACL protocol extension; see the CONFIG_NFSD_V3_ACL option.
+
+	  If unsure, say N.
+
 config NFS_V4
 	bool "Provide NFSv4 client support (EXPERIMENTAL)"
 	depends on NFS_FS && EXPERIMENTAL
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index b4baa031edf4..8b3bb715d177 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -8,6 +8,7 @@ nfs-y 			:= dir.o file.o inode.o nfs2xdr.o pagelist.o \
 			   proc.o read.o symlink.o unlink.o write.o
 nfs-$(CONFIG_ROOT_NFS)	+= nfsroot.o mount_clnt.o      
 nfs-$(CONFIG_NFS_V3)	+= nfs3proc.o nfs3xdr.o
+nfs-$(CONFIG_NFS_V3_ACL)	+= nfs3acl.o
 nfs-$(CONFIG_NFS_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \
 			   delegation.o idmap.o \
 			   callback.o callback_xdr.o callback_proc.o
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5720537bffdd..2c6a95945684 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -75,6 +75,27 @@ struct inode_operations nfs_dir_inode_operations = {
 	.setattr	= nfs_setattr,
 };
 
+#ifdef CONFIG_NFS_V3
+struct inode_operations nfs3_dir_inode_operations = {
+	.create		= nfs_create,
+	.lookup		= nfs_lookup,
+	.link		= nfs_link,
+	.unlink		= nfs_unlink,
+	.symlink	= nfs_symlink,
+	.mkdir		= nfs_mkdir,
+	.rmdir		= nfs_rmdir,
+	.mknod		= nfs_mknod,
+	.rename		= nfs_rename,
+	.permission	= nfs_permission,
+	.getattr	= nfs_getattr,
+	.setattr	= nfs_setattr,
+	.listxattr	= nfs3_listxattr,
+	.getxattr	= nfs3_getxattr,
+	.setxattr	= nfs3_setxattr,
+	.removexattr	= nfs3_removexattr,
+};
+#endif  /* CONFIG_NFS_V3 */
+
 #ifdef CONFIG_NFS_V4
 
 static struct dentry *nfs_atomic_lookup(struct inode *, struct dentry *, struct nameidata *);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 55c907592490..a606708264ed 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -71,6 +71,18 @@ struct inode_operations nfs_file_inode_operations = {
 	.setattr	= nfs_setattr,
 };
 
+#ifdef CONFIG_NFS_V3
+struct inode_operations nfs3_file_inode_operations = {
+	.permission	= nfs_permission,
+	.getattr	= nfs_getattr,
+	.setattr	= nfs_setattr,
+	.listxattr	= nfs3_listxattr,
+	.getxattr	= nfs3_getxattr,
+	.setxattr	= nfs3_setxattr,
+	.removexattr	= nfs3_removexattr,
+};
+#endif  /* CONFIG_NFS_v3 */
+
 /* Hack for future NFS swap support */
 #ifndef IS_SWAPFILE
 # define IS_SWAPFILE(inode)	(0)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 97b3fe7ece63..440b9cbb6f81 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -108,6 +108,21 @@ static struct rpc_program	nfs_program = {
 	.pipe_dir_name		= "/nfs",
 };
 
+#ifdef CONFIG_NFS_V3_ACL
+static struct rpc_stat		nfsacl_rpcstat = { &nfsacl_program };
+static struct rpc_version *	nfsacl_version[] = {
+	[3]			= &nfsacl_version3,
+};
+
+struct rpc_program		nfsacl_program = {
+	.name =			"nfsacl",
+	.number =		NFS_ACL_PROGRAM,
+	.nrvers =		sizeof(nfsacl_version) / sizeof(nfsacl_version[0]),
+	.version =		nfsacl_version,
+	.stats =		&nfsacl_rpcstat,
+};
+#endif  /* CONFIG_NFS_V3_ACL */
+
 static inline unsigned long
 nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
 {
@@ -165,6 +180,9 @@ nfs_umount_begin(struct super_block *sb)
 	/* -EIO all pending I/O */
 	if (!IS_ERR(rpc))
 		rpc_killall_tasks(rpc);
+	rpc = NFS_SB(sb)->client_acl;
+	if (!IS_ERR(rpc))
+		rpc_killall_tasks(rpc);
 }
 
 
@@ -461,8 +479,17 @@ nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
 		atomic_inc(&server->client->cl_count);
 		server->client_sys = server->client;
 	}
-
 	if (server->flags & NFS_MOUNT_VER3) {
+#ifdef CONFIG_NFS_V3_ACL
+		if (!(server->flags & NFS_MOUNT_NOACL)) {
+			server->client_acl = rpc_bind_new_program(server->client, &nfsacl_program, 3);
+			/* No errors! Assume that Sun nfsacls are supported */
+			if (!IS_ERR(server->client_acl))
+				server->caps |= NFS_CAP_ACLS;
+		}
+#else
+		server->flags &= ~NFS_MOUNT_NOACL;
+#endif /* CONFIG_NFS_V3_ACL */
 		if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
 			server->namelen = NFS3_MAXNAMLEN;
 		sb->s_time_gran = 1;
@@ -546,6 +573,7 @@ static int nfs_show_options(struct seq_file *m, struct vfsmount *mnt)
 		{ NFS_MOUNT_NOCTO, ",nocto", "" },
 		{ NFS_MOUNT_NOAC, ",noac", "" },
 		{ NFS_MOUNT_NONLM, ",nolock", ",lock" },
+		{ NFS_MOUNT_NOACL, ",noacl", "" },
 		{ 0, NULL, NULL }
 	};
 	struct proc_nfs_info *nfs_infop;
@@ -1452,7 +1480,7 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type,
 	memset(server, 0, sizeof(struct nfs_server));
 	/* Zero out the NFS state stuff */
 	init_nfsv4_state(server);
-	server->client = server->client_sys = ERR_PTR(-EINVAL);
+	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
 
 	root = &server->fh;
 	if (data->flags & NFS_MOUNT_VER3)
@@ -1513,6 +1541,8 @@ static void nfs_kill_super(struct super_block *s)
 		rpc_shutdown_client(server->client);
 	if (!IS_ERR(server->client_sys))
 		rpc_shutdown_client(server->client_sys);
+	if (!IS_ERR(server->client_acl))
+		rpc_shutdown_client(server->client_acl);
 
 	if (!(server->flags & NFS_MOUNT_NONLM))
 		lockd_down();	/* release rpc.lockd */
@@ -1794,7 +1824,7 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type,
 	memset(server, 0, sizeof(struct nfs_server));
 	/* Zero out the NFS state stuff */
 	init_nfsv4_state(server);
-	server->client = server->client_sys = ERR_PTR(-EINVAL);
+	server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL);
 
 	p = nfs_copy_user_string(NULL, &data->hostname, 256);
 	if (IS_ERR(p))
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
new file mode 100644
index 000000000000..393ba79fc14f
--- /dev/null
+++ b/fs/nfs/nfs3acl.c
@@ -0,0 +1,303 @@
+#include <linux/fs.h>
+#include <linux/nfs.h>
+#include <linux/nfs3.h>
+#include <linux/nfs_fs.h>
+#include <linux/xattr_acl.h>
+#include <linux/nfsacl.h>
+
+#define NFSDBG_FACILITY	NFSDBG_PROC
+
+ssize_t nfs3_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	struct posix_acl *acl;
+	int pos=0, len=0;
+
+#	define output(s) do {						\
+			if (pos + sizeof(s) <= size) {			\
+				memcpy(buffer + pos, s, sizeof(s));	\
+				pos += sizeof(s);			\
+			}						\
+			len += sizeof(s);				\
+		} while(0)
+
+	acl = nfs3_proc_getacl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (acl) {
+		output("system.posix_acl_access");
+		posix_acl_release(acl);
+	}
+
+	if (S_ISDIR(inode->i_mode)) {
+		acl = nfs3_proc_getacl(inode, ACL_TYPE_DEFAULT);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+		if (acl) {
+			output("system.posix_acl_default");
+			posix_acl_release(acl);
+		}
+	}
+
+#	undef output
+
+	if (!buffer || len <= size)
+		return len;
+	return -ERANGE;
+}
+
+ssize_t nfs3_getxattr(struct dentry *dentry, const char *name,
+		void *buffer, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+	struct posix_acl *acl;
+	int type, error = 0;
+
+	if (strcmp(name, XATTR_NAME_ACL_ACCESS) == 0)
+		type = ACL_TYPE_ACCESS;
+	else if (strcmp(name, XATTR_NAME_ACL_DEFAULT) == 0)
+		type = ACL_TYPE_DEFAULT;
+	else
+		return -EOPNOTSUPP;
+
+	acl = nfs3_proc_getacl(inode, type);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	else if (acl) {
+		if (type == ACL_TYPE_ACCESS && acl->a_count == 0)
+			error = -ENODATA;
+		else
+			error = posix_acl_to_xattr(acl, buffer, size);
+		posix_acl_release(acl);
+	} else
+		error = -ENODATA;
+
+	return error;
+}
+
+int nfs3_setxattr(struct dentry *dentry, const char *name,
+	     const void *value, size_t size, int flags)
+{
+	struct inode *inode = dentry->d_inode;
+	struct posix_acl *acl;
+	int type, error;
+
+	if (strcmp(name, XATTR_NAME_ACL_ACCESS) == 0)
+		type = ACL_TYPE_ACCESS;
+	else if (strcmp(name, XATTR_NAME_ACL_DEFAULT) == 0)
+		type = ACL_TYPE_DEFAULT;
+	else
+		return -EOPNOTSUPP;
+
+	acl = posix_acl_from_xattr(value, size);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	error = nfs3_proc_setacl(inode, type, acl);
+	posix_acl_release(acl);
+
+	return error;
+}
+
+int nfs3_removexattr(struct dentry *dentry, const char *name)
+{
+	struct inode *inode = dentry->d_inode;
+	int type;
+
+	if (strcmp(name, XATTR_NAME_ACL_ACCESS) == 0)
+		type = ACL_TYPE_ACCESS;
+	else if (strcmp(name, XATTR_NAME_ACL_DEFAULT) == 0)
+		type = ACL_TYPE_DEFAULT;
+	else
+		return -EOPNOTSUPP;
+
+	return nfs3_proc_setacl(inode, type, NULL);
+}
+
+struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_fattr fattr;
+	struct page *pages[NFSACL_MAXPAGES] = { };
+	struct nfs3_getaclargs args = {
+		.fh = NFS_FH(inode),
+		/* The xdr layer may allocate pages here. */
+		.pages = pages,
+	};
+	struct nfs3_getaclres res = {
+		.fattr =	&fattr,
+	};
+	struct posix_acl *acl = NULL;
+	int status, count;
+
+	if (!nfs_server_capable(inode, NFS_CAP_ACLS))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	switch (type) {
+		case ACL_TYPE_ACCESS:
+			args.mask = NFS_ACLCNT|NFS_ACL;
+			break;
+
+		case ACL_TYPE_DEFAULT:
+			if (!S_ISDIR(inode->i_mode))
+				return NULL;
+			args.mask = NFS_DFACLCNT|NFS_DFACL;
+			break;
+
+		default:
+			return ERR_PTR(-EINVAL);
+	}
+
+	dprintk("NFS call getacl\n");
+	status = rpc_call(server->client_acl, ACLPROC3_GETACL,
+			  &args, &res, 0);
+	dprintk("NFS reply getacl: %d\n", status);
+
+	/* pages may have been allocated at the xdr layer. */
+	for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++)
+		__free_page(args.pages[count]);
+
+	switch (status) {
+		case 0:
+			status = nfs_refresh_inode(inode, &fattr);
+			break;
+		case -EPFNOSUPPORT:
+		case -EPROTONOSUPPORT:
+			dprintk("NFS_V3_ACL extension not supported; disabling\n");
+			server->caps &= ~NFS_CAP_ACLS;
+		case -ENOTSUPP:
+			status = -EOPNOTSUPP;
+		default:
+			goto getout;
+	}
+	if ((args.mask & res.mask) != args.mask) {
+		status = -EIO;
+		goto getout;
+	}
+
+	if (res.acl_access != NULL) {
+		if (posix_acl_equiv_mode(res.acl_access, NULL) == 0) {
+			posix_acl_release(res.acl_access);
+			res.acl_access = NULL;
+		}
+	}
+
+	switch(type) {
+		case ACL_TYPE_ACCESS:
+			acl = res.acl_access;
+			res.acl_access = NULL;
+			break;
+
+		case ACL_TYPE_DEFAULT:
+			acl = res.acl_default;
+			res.acl_default = NULL;
+	}
+
+getout:
+	posix_acl_release(res.acl_access);
+	posix_acl_release(res.acl_default);
+
+	if (status != 0) {
+		posix_acl_release(acl);
+		acl = ERR_PTR(status);
+	}
+	return acl;
+}
+
+static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
+		  struct posix_acl *dfacl)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_fattr fattr;
+	struct page *pages[NFSACL_MAXPAGES] = { };
+	struct nfs3_setaclargs args = {
+		.inode = inode,
+		.mask = NFS_ACL,
+		.acl_access = acl,
+		.pages = pages,
+	};
+	int status, count;
+
+	status = -EOPNOTSUPP;
+	if (!nfs_server_capable(inode, NFS_CAP_ACLS))
+		goto out;
+
+	/* We are doing this here, because XDR marshalling can only
+	   return -ENOMEM. */
+	status = -ENOSPC;
+	if (acl != NULL && acl->a_count > NFS_ACL_MAX_ENTRIES)
+		goto out;
+	if (dfacl != NULL && dfacl->a_count > NFS_ACL_MAX_ENTRIES)
+		goto out;
+	if (S_ISDIR(inode->i_mode)) {
+		args.mask |= NFS_DFACL;
+		args.acl_default = dfacl;
+	}
+
+	dprintk("NFS call setacl\n");
+	nfs_begin_data_update(inode);
+	status = rpc_call(server->client_acl, ACLPROC3_SETACL,
+			  &args, &fattr, 0);
+	NFS_FLAGS(inode) |= NFS_INO_INVALID_ACCESS;
+	nfs_end_data_update(inode);
+	dprintk("NFS reply setacl: %d\n", status);
+
+	/* pages may have been allocated at the xdr layer. */
+	for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++)
+		__free_page(args.pages[count]);
+
+	switch (status) {
+		case 0:
+			status = nfs_refresh_inode(inode, &fattr);
+			break;
+		case -EPFNOSUPPORT:
+		case -EPROTONOSUPPORT:
+			dprintk("NFS_V3_ACL SETACL RPC not supported"
+					"(will not retry)\n");
+			server->caps &= ~NFS_CAP_ACLS;
+		case -ENOTSUPP:
+			status = -EOPNOTSUPP;
+	}
+out:
+	return status;
+}
+
+int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl)
+{
+	struct posix_acl *alloc = NULL, *dfacl = NULL;
+	int status;
+
+	if (S_ISDIR(inode->i_mode)) {
+		switch(type) {
+			case ACL_TYPE_ACCESS:
+				alloc = dfacl = nfs3_proc_getacl(inode,
+						ACL_TYPE_DEFAULT);
+				if (IS_ERR(alloc))
+					goto fail;
+				break;
+
+			case ACL_TYPE_DEFAULT:
+				dfacl = acl;
+				alloc = acl = nfs3_proc_getacl(inode,
+						ACL_TYPE_ACCESS);
+				if (IS_ERR(alloc))
+					goto fail;
+				break;
+
+			default:
+				return -EINVAL;
+		}
+	} else if (type != ACL_TYPE_ACCESS)
+			return -EINVAL;
+
+	if (acl == NULL) {
+		alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+		if (IS_ERR(alloc))
+			goto fail;
+	}
+	status = nfs3_proc_setacls(inode, acl, dfacl);
+	posix_acl_release(alloc);
+	return status;
+
+fail:
+	return PTR_ERR(alloc);
+}
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 53953a775714..d03bac0cc42f 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -17,6 +17,7 @@
 #include <linux/nfs_page.h>
 #include <linux/lockd/bind.h>
 #include <linux/smp_lock.h>
+#include <linux/nfs_mount.h>
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
 
@@ -45,7 +46,7 @@ static inline int
 nfs3_rpc_call_wrapper(struct rpc_clnt *clnt, u32 proc, void *argp, void *resp, int flags)
 {
 	struct rpc_message msg = {
-		.rpc_proc	= &nfs3_procedures[proc],
+		.rpc_proc	= &clnt->cl_procinfo[proc],
 		.rpc_argp	= argp,
 		.rpc_resp	= resp,
 	};
@@ -825,8 +826,8 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
 struct nfs_rpc_ops	nfs_v3_clientops = {
 	.version	= 3,			/* protocol version */
 	.dentry_ops	= &nfs_dentry_operations,
-	.dir_inode_ops	= &nfs_dir_inode_operations,
-	.file_inode_ops	= &nfs_file_inode_operations,
+	.dir_inode_ops	= &nfs3_dir_inode_operations,
+	.file_inode_ops	= &nfs3_file_inode_operations,
 	.getroot	= nfs3_proc_get_root,
 	.getattr	= nfs3_proc_getattr,
 	.setattr	= nfs3_proc_setattr,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index a3593d47e5ab..a4437fb177f0 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -21,6 +21,7 @@
 #include <linux/nfs.h>
 #include <linux/nfs3.h>
 #include <linux/nfs_fs.h>
+#include <linux/nfsacl.h>
 
 #define NFSDBG_FACILITY		NFSDBG_XDR
 
@@ -79,6 +80,11 @@ extern int			nfs_stat_to_errno(int);
 #define NFS3_pathconfres_sz	(1+NFS3_post_op_attr_sz+6)
 #define NFS3_commitres_sz	(1+NFS3_wcc_data_sz+2)
 
+#define ACL3_getaclargs_sz	(NFS3_fh_sz+1)
+#define ACL3_setaclargs_sz	(NFS3_fh_sz+1+2*(2+5*3))
+#define ACL3_getaclres_sz	(1+NFS3_post_op_attr_sz+1+2*(2+5*3))
+#define ACL3_setaclres_sz	(1+NFS3_post_op_attr_sz)
+
 /*
  * Map file type to S_IFMT bits
  */
@@ -627,6 +633,74 @@ nfs3_xdr_commitargs(struct rpc_rqst *req, u32 *p, struct nfs_writeargs *args)
 	return 0;
 }
 
+#ifdef CONFIG_NFS_V3_ACL
+/*
+ * Encode GETACL arguments
+ */
+static int
+nfs3_xdr_getaclargs(struct rpc_rqst *req, u32 *p,
+		    struct nfs3_getaclargs *args)
+{
+	struct rpc_auth *auth = req->rq_task->tk_auth;
+	unsigned int replen;
+
+	p = xdr_encode_fhandle(p, args->fh);
+	*p++ = htonl(args->mask);
+	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p);
+
+	if (args->mask & (NFS_ACL | NFS_DFACL)) {
+		/* Inline the page array */
+		replen = (RPC_REPHDRSIZE + auth->au_rslack +
+			  ACL3_getaclres_sz) << 2;
+		xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages, 0,
+				 NFSACL_MAXPAGES << PAGE_SHIFT);
+	}
+	return 0;
+}
+
+/*
+ * Encode SETACL arguments
+ */
+static int
+nfs3_xdr_setaclargs(struct rpc_rqst *req, u32 *p,
+                   struct nfs3_setaclargs *args)
+{
+	struct xdr_buf *buf = &req->rq_snd_buf;
+	unsigned int base, len_in_head, len = nfsacl_size(
+		(args->mask & NFS_ACL)   ? args->acl_access  : NULL,
+		(args->mask & NFS_DFACL) ? args->acl_default : NULL);
+	int count, err;
+
+	p = xdr_encode_fhandle(p, NFS_FH(args->inode));
+	*p++ = htonl(args->mask);
+	base = (char *)p - (char *)buf->head->iov_base;
+	/* put as much of the acls into head as possible. */
+	len_in_head = min_t(unsigned int, buf->head->iov_len - base, len);
+	len -= len_in_head;
+	req->rq_slen = xdr_adjust_iovec(req->rq_svec, p + len_in_head);
+
+	for (count = 0; (count << PAGE_SHIFT) < len; count++) {
+		args->pages[count] = alloc_page(GFP_KERNEL);
+		if (!args->pages[count]) {
+			while (count)
+				__free_page(args->pages[--count]);
+			return -ENOMEM;
+		}
+	}
+	xdr_encode_pages(buf, args->pages, 0, len);
+
+	err = nfsacl_encode(buf, base, args->inode,
+			    (args->mask & NFS_ACL) ?
+			    args->acl_access : NULL, 1, 0);
+	if (err > 0)
+		err = nfsacl_encode(buf, base + err, args->inode,
+				    (args->mask & NFS_DFACL) ?
+				    args->acl_default : NULL, 1,
+				    NFS_ACL_DEFAULT);
+	return (err > 0) ? 0 : err;
+}
+#endif  /* CONFIG_NFS_V3_ACL */
+
 /*
  * NFS XDR decode functions
  */
@@ -978,6 +1052,54 @@ nfs3_xdr_commitres(struct rpc_rqst *req, u32 *p, struct nfs_writeres *res)
 	return 0;
 }
 
+#ifdef CONFIG_NFS_V3_ACL
+/*
+ * Decode GETACL reply
+ */
+static int
+nfs3_xdr_getaclres(struct rpc_rqst *req, u32 *p,
+		   struct nfs3_getaclres *res)
+{
+	struct xdr_buf *buf = &req->rq_rcv_buf;
+	int status = ntohl(*p++);
+	struct posix_acl **acl;
+	unsigned int *aclcnt;
+	int err, base;
+
+	if (status != 0)
+		return -nfs_stat_to_errno(status);
+	p = xdr_decode_post_op_attr(p, res->fattr);
+	res->mask = ntohl(*p++);
+	if (res->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT))
+		return -EINVAL;
+	base = (char *)p - (char *)req->rq_rcv_buf.head->iov_base;
+
+	acl = (res->mask & NFS_ACL) ? &res->acl_access : NULL;
+	aclcnt = (res->mask & NFS_ACLCNT) ? &res->acl_access_count : NULL;
+	err = nfsacl_decode(buf, base, aclcnt, acl);
+
+	acl = (res->mask & NFS_DFACL) ? &res->acl_default : NULL;
+	aclcnt = (res->mask & NFS_DFACLCNT) ? &res->acl_default_count : NULL;
+	if (err > 0)
+		err = nfsacl_decode(buf, base + err, aclcnt, acl);
+	return (err > 0) ? 0 : err;
+}
+
+/*
+ * Decode setacl reply.
+ */
+static int
+nfs3_xdr_setaclres(struct rpc_rqst *req, u32 *p, struct nfs_fattr *fattr)
+{
+	int status = ntohl(*p++);
+
+	if (status)
+		return -nfs_stat_to_errno(status);
+	xdr_decode_post_op_attr(p, fattr);
+	return 0;
+}
+#endif  /* CONFIG_NFS_V3_ACL */
+
 #ifndef MAX
 # define MAX(a, b)	(((a) > (b))? (a) : (b))
 #endif
@@ -1021,3 +1143,28 @@ struct rpc_version		nfs_version3 = {
 	.procs			= nfs3_procedures
 };
 
+#ifdef CONFIG_NFS_V3_ACL
+static struct rpc_procinfo	nfs3_acl_procedures[] = {
+	[ACLPROC3_GETACL] = {
+		.p_proc = ACLPROC3_GETACL,
+		.p_encode = (kxdrproc_t) nfs3_xdr_getaclargs,
+		.p_decode = (kxdrproc_t) nfs3_xdr_getaclres,
+		.p_bufsiz = MAX(ACL3_getaclargs_sz, ACL3_getaclres_sz) << 2,
+		.p_timer = 1,
+	},
+	[ACLPROC3_SETACL] = {
+		.p_proc = ACLPROC3_SETACL,
+		.p_encode = (kxdrproc_t) nfs3_xdr_setaclargs,
+		.p_decode = (kxdrproc_t) nfs3_xdr_setaclres,
+		.p_bufsiz = MAX(ACL3_setaclargs_sz, ACL3_setaclres_sz) << 2,
+		.p_timer = 0,
+	},
+};
+
+struct rpc_version		nfsacl_version3 = {
+	.number			= 3,
+	.nrprocs		= sizeof(nfs3_acl_procedures)/
+				  sizeof(nfs3_acl_procedures[0]),
+	.procs			= nfs3_acl_procedures,
+};
+#endif  /* CONFIG_NFS_V3_ACL */
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index fd5bc596fe8a..1b272a135a31 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -124,6 +124,7 @@ enum {
 	Opt_soft, Opt_hard, Opt_intr,
 	Opt_nointr, Opt_posix, Opt_noposix, Opt_cto, Opt_nocto, Opt_ac, 
 	Opt_noac, Opt_lock, Opt_nolock, Opt_v2, Opt_v3, Opt_udp, Opt_tcp,
+	Opt_acl, Opt_noacl,
 	/* Error token */
 	Opt_err
 };
@@ -158,6 +159,8 @@ static match_table_t __initdata tokens = {
 	{Opt_udp, "udp"},
 	{Opt_tcp, "proto=tcp"},
 	{Opt_tcp, "tcp"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
 	{Opt_err, NULL}
 	
 };
@@ -266,6 +269,12 @@ static int __init root_nfs_parse(char *name, char *buf)
 			case Opt_tcp:
 				nfs_data.flags |= NFS_MOUNT_TCP;
 				break;
+			case Opt_acl:
+				nfs_data.flags &= ~NFS_MOUNT_NOACL;
+				break;
+			case Opt_noacl:
+				nfs_data.flags |= NFS_MOUNT_NOACL;
+				break;
 			default : 
 				return 0;
 		}
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index d2b5d7e0e85a..3a5e442ac776 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -301,6 +301,9 @@ extern u32 root_nfs_parse_addr(char *name); /*__init*/
  * linux/fs/nfs/file.c
  */
 extern struct inode_operations nfs_file_inode_operations;
+#ifdef CONFIG_NFS_V3
+extern struct inode_operations nfs3_file_inode_operations;
+#endif /* CONFIG_NFS_V3 */
 extern struct file_operations nfs_file_operations;
 extern struct address_space_operations nfs_file_aops;
 
@@ -315,6 +318,22 @@ static inline struct rpc_cred *nfs_file_cred(struct file *file)
 	return NULL;
 }
 
+/*
+ * linux/fs/nfs/xattr.c
+ */
+#ifdef CONFIG_NFS_V3_ACL
+extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t);
+extern ssize_t nfs3_getxattr(struct dentry *, const char *, void *, size_t);
+extern int nfs3_setxattr(struct dentry *, const char *,
+			const void *, size_t, int);
+extern int nfs3_removexattr (struct dentry *, const char *name);
+#else
+# define nfs3_listxattr NULL
+# define nfs3_getxattr NULL
+# define nfs3_setxattr NULL
+# define nfs3_removexattr NULL
+#endif
+
 /*
  * linux/fs/nfs/direct.c
  */
@@ -329,6 +348,9 @@ extern ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf,
  * linux/fs/nfs/dir.c
  */
 extern struct inode_operations nfs_dir_inode_operations;
+#ifdef CONFIG_NFS_V3
+extern struct inode_operations nfs3_dir_inode_operations;
+#endif /* CONFIG_NFS_V3 */
 extern struct file_operations nfs_dir_operations;
 extern struct dentry_operations nfs_dentry_operations;
 
@@ -449,6 +471,15 @@ static inline void nfs_readdata_free(struct nfs_read_data *p)
 
 extern void  nfs_readdata_release(struct rpc_task *task);
 
+/*
+ * linux/fs/nfs3proc.c
+ */
+#ifdef CONFIG_NFS_V3_ACL
+extern struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type);
+extern int nfs3_proc_setacl(struct inode *inode, int type,
+			    struct posix_acl *acl);
+#endif /* CONFIG_NFS_V3_ACL */
+
 /*
  * linux/fs/mount_clnt.c
  * (Used only by nfsroot module)
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index fc51645d61ee..3d3a305488cf 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -10,6 +10,7 @@
 struct nfs_server {
 	struct rpc_clnt *	client;		/* RPC client handle */
 	struct rpc_clnt *	client_sys;	/* 2nd handle for FSINFO */
+	struct rpc_clnt *	client_acl;	/* ACL RPC client handle */
 	struct nfs_rpc_ops *	rpc_ops;	/* NFS protocol vector */
 	struct backing_dev_info	backing_dev_info;
 	int			flags;		/* various flags */
diff --git a/include/linux/nfs_mount.h b/include/linux/nfs_mount.h
index 0071428231f9..659c75438454 100644
--- a/include/linux/nfs_mount.h
+++ b/include/linux/nfs_mount.h
@@ -58,6 +58,7 @@ struct nfs_mount_data {
 #define NFS_MOUNT_KERBEROS	0x0100	/* 3 */
 #define NFS_MOUNT_NONLM		0x0200	/* 3 */
 #define NFS_MOUNT_BROKEN_SUID	0x0400	/* 4 */
+#define NFS_MOUNT_NOACL		0x0800	/* 4 */
 #define NFS_MOUNT_STRICTLOCK	0x1000	/* reserved for NFSv4 */
 #define NFS_MOUNT_SECFLAVOUR	0x2000	/* 5 */
 #define NFS_MOUNT_FLAGMASK	0xFFFF
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 46b206b460c0..a2bf6914ff1b 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -2,6 +2,7 @@
 #define _LINUX_NFS_XDR_H
 
 #include <linux/sunrpc/xprt.h>
+#include <linux/nfsacl.h>
 
 struct nfs4_fsid {
 	__u64 major;
@@ -368,6 +369,20 @@ struct nfs_readdirargs {
 	struct page **		pages;
 };
 
+struct nfs3_getaclargs {
+	struct nfs_fh *		fh;
+	int			mask;
+	struct page **		pages;
+};
+
+struct nfs3_setaclargs {
+	struct inode *		inode;
+	int			mask;
+	struct posix_acl *	acl_access;
+	struct posix_acl *	acl_default;
+	struct page **		pages;
+};
+
 struct nfs_diropok {
 	struct nfs_fh *		fh;
 	struct nfs_fattr *	fattr;
@@ -491,6 +506,15 @@ struct nfs3_readdirres {
 	int			plus;
 };
 
+struct nfs3_getaclres {
+	struct nfs_fattr *	fattr;
+	int			mask;
+	unsigned int		acl_access_count;
+	unsigned int		acl_default_count;
+	struct posix_acl *	acl_access;
+	struct posix_acl *	acl_default;
+};
+
 #ifdef CONFIG_NFS_V4
 
 typedef u64 clientid4;
@@ -748,4 +772,7 @@ extern struct rpc_version	nfs_version2;
 extern struct rpc_version	nfs_version3;
 extern struct rpc_version	nfs_version4;
 
+extern struct rpc_version	nfsacl_version3;
+extern struct rpc_program	nfsacl_program;
+
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 055ffbea0596942579b0dae71d5dab78de8135f6 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Wed, 22 Jun 2005 17:16:27 +0000
Subject: [PATCH] NFS: Fix handling of the umask when an NFSv3 default acl is
 present.

 NFSv3 has no concept of a umask on the server side: The client applies
 the umask locally, and sends the effective permissions to the server.
 This behavior is wrong when files are created in a directory that has a
 default ACL.  In this case, the umask is supposed to be ignored, and
 only the default ACL determines the file's effective permissions.

 Usually it's the server's task to conditionally apply the umask.  But
 since the server knows nothing about the umask, we have to do it on the
 client side.  This patch tries to fetch the parent directory's default
 ACL before creating a new file, computes the appropriate create mode to
 send to the server, and finally sets the new file's access and default
 acl appropriately.

 Many thanks to Buck Huppmann <buchk@pobox.com> for sending the initial
 version of this patch, as well as for arguing why we need this change.

 Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
 Acked-by: Olaf Kirch <okir@suse.de>
 Signed-off-by: Andrew Morton <akpm@osdl.org>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c         |  5 +++++
 fs/nfs/nfs3acl.c       | 29 +++++++++++++++++++++++++++++
 fs/nfs/nfs3proc.c      | 36 ++++++++++++++++++++++++++++++------
 include/linux/nfs_fs.h |  9 +++++++++
 4 files changed, 73 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 440b9cbb6f81..50a03f1504a1 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -490,6 +490,11 @@ nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent)
 #else
 		server->flags &= ~NFS_MOUNT_NOACL;
 #endif /* CONFIG_NFS_V3_ACL */
+		/*
+		 * The VFS shouldn't apply the umask to mode bits. We will
+		 * do so ourselves when necessary.
+		 */
+		sb->s_flags |= MS_POSIXACL;
 		if (server->namelen == 0 || server->namelen > NFS3_MAXNAMLEN)
 			server->namelen = NFS3_MAXNAMLEN;
 		sb->s_time_gran = 1;
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 393ba79fc14f..89b6468700e7 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -301,3 +301,32 @@ int nfs3_proc_setacl(struct inode *inode, int type, struct posix_acl *acl)
 fail:
 	return PTR_ERR(alloc);
 }
+
+int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode,
+		mode_t mode)
+{
+	struct posix_acl *dfacl, *acl;
+	int error = 0;
+
+	dfacl = nfs3_proc_getacl(dir, ACL_TYPE_DEFAULT);
+	if (IS_ERR(dfacl)) {
+		error = PTR_ERR(dfacl);
+		return (error == -EOPNOTSUPP) ? 0 : error;
+	}
+	if (!dfacl)
+		return 0;
+	acl = posix_acl_clone(dfacl, GFP_KERNEL);
+	error = -ENOMEM;
+	if (!acl)
+		goto out_release_dfacl;
+	error = posix_acl_create_masq(acl, &mode);
+	if (error < 0)
+		goto out_release_acl;
+	error = nfs3_proc_setacls(inode, acl, S_ISDIR(inode->i_mode) ?
+						      dfacl : NULL);
+out_release_acl:
+	posix_acl_release(acl);
+out_release_dfacl:
+	posix_acl_release(dfacl);
+	return error;
+}
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index d03bac0cc42f..a9ddc196224d 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -314,7 +314,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		.fh		= &fhandle,
 		.fattr		= &fattr
 	};
-	int			status;
+	mode_t mode = sattr->ia_mode;
+	int status;
 
 	dprintk("NFS call  create %s\n", dentry->d_name.name);
 	arg.createmode = NFS3_CREATE_UNCHECKED;
@@ -324,6 +325,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		arg.verifier[1] = current->pid;
 	}
 
+	sattr->ia_mode &= ~current->fs->umask;
+
 again:
 	dir_attr.valid = 0;
 	fattr.valid = 0;
@@ -370,6 +373,9 @@ again:
 		nfs_refresh_inode(dentry->d_inode, &fattr);
 		dprintk("NFS reply setattr (post-create): %d\n", status);
 	}
+	if (status != 0)
+		goto out;
+	status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
 out:
 	dprintk("NFS reply create: %d\n", status);
 	return status;
@@ -539,15 +545,24 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 		.fh		= &fhandle,
 		.fattr		= &fattr
 	};
-	int			status;
+	int mode = sattr->ia_mode;
+	int status;
 
 	dprintk("NFS call  mkdir %s\n", dentry->d_name.name);
 	dir_attr.valid = 0;
 	fattr.valid = 0;
+
+	sattr->ia_mode &= ~current->fs->umask;
+
 	status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKDIR, &arg, &res, 0);
 	nfs_refresh_inode(dir, &dir_attr);
-	if (status == 0)
-		status = nfs_instantiate(dentry, &fhandle, &fattr);
+	if (status != 0)
+		goto out;
+	status = nfs_instantiate(dentry, &fhandle, &fattr);
+	if (status != 0)
+		goto out;
+	status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
+out:
 	dprintk("NFS reply mkdir: %d\n", status);
 	return status;
 }
@@ -642,6 +657,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		.fh		= &fh,
 		.fattr		= &fattr
 	};
+	mode_t mode = sattr->ia_mode;
 	int status;
 
 	switch (sattr->ia_mode & S_IFMT) {
@@ -654,12 +670,20 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 
 	dprintk("NFS call  mknod %s %u:%u\n", dentry->d_name.name,
 			MAJOR(rdev), MINOR(rdev));
+
+	sattr->ia_mode &= ~current->fs->umask;
+
 	dir_attr.valid = 0;
 	fattr.valid = 0;
 	status = rpc_call(NFS_CLIENT(dir), NFS3PROC_MKNOD, &arg, &res, 0);
 	nfs_refresh_inode(dir, &dir_attr);
-	if (status == 0)
-		status = nfs_instantiate(dentry, &fh, &fattr);
+	if (status != 0)
+		goto out;
+	status = nfs_instantiate(dentry, &fh, &fattr);
+	if (status != 0)
+		goto out;
+	status = nfs3_proc_set_default_acl(dir, dentry->d_inode, mode);
+out:
 	dprintk("NFS reply mknod: %d\n", status);
 	return status;
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 3a5e442ac776..7662c5131b47 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -478,6 +478,15 @@ extern void  nfs_readdata_release(struct rpc_task *task);
 extern struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type);
 extern int nfs3_proc_setacl(struct inode *inode, int type,
 			    struct posix_acl *acl);
+extern int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode,
+		mode_t mode);
+#else
+static inline int nfs3_proc_set_default_acl(struct inode *dir,
+					    struct inode *inode,
+					    mode_t mode)
+{
+	return 0;
+}
 #endif /* CONFIG_NFS_V3_ACL */
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 5c6a9f7d92291c832d47e792ed1fafa44acb066e Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Wed, 22 Jun 2005 17:16:27 +0000
Subject: [PATCH] NFS: Cache the NFSv3 acls.

 Attach acls to inodes in the icache to avoid unnecessary GETACL RPC
 round-trips.  As long as the client doesn't retrieve any acls itself, only the
 default acls of existing directories and the default and access acls of new
 directories will end up in the cache, which preserves some memory compared to
 always caching the access and default acl of all files.

 Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
 Acked-by: Olaf Kirch <okir@suse.de>
 Signed-off-by: Andrew Morton <akpm@osdl.org>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs3acl.c       | 100 +++++++++++++++++++++++++++++++++++++++++--------
 fs/nfs/nfs3proc.c      |   1 +
 include/linux/nfs_fs.h |  11 ++++++
 3 files changed, 97 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 89b6468700e7..451112ff9aa4 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -113,6 +113,69 @@ int nfs3_removexattr(struct dentry *dentry, const char *name)
 	return nfs3_proc_setacl(inode, type, NULL);
 }
 
+static void __nfs3_forget_cached_acls(struct nfs_inode *nfsi)
+{
+	if (nfsi->acl_access != ERR_PTR(-EAGAIN)) {
+		posix_acl_release(nfsi->acl_access);
+		nfsi->acl_access = ERR_PTR(-EAGAIN);
+	}
+	if (nfsi->acl_default != ERR_PTR(-EAGAIN)) {
+		posix_acl_release(nfsi->acl_default);
+		nfsi->acl_default = ERR_PTR(-EAGAIN);
+	}
+}
+
+void nfs3_forget_cached_acls(struct inode *inode)
+{
+	dprintk("NFS: nfs3_forget_cached_acls(%s/%ld)\n", inode->i_sb->s_id,
+		inode->i_ino);
+	spin_lock(&inode->i_lock);
+	__nfs3_forget_cached_acls(NFS_I(inode));
+	spin_unlock(&inode->i_lock);
+}
+
+static struct posix_acl *nfs3_get_cached_acl(struct inode *inode, int type)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct posix_acl *acl = ERR_PTR(-EAGAIN);
+
+	spin_lock(&inode->i_lock);
+	switch(type) {
+		case ACL_TYPE_ACCESS:
+			acl = nfsi->acl_access;
+			break;
+
+		case ACL_TYPE_DEFAULT:
+			acl = nfsi->acl_default;
+			break;
+
+		default:
+			return ERR_PTR(-EINVAL);
+	}
+	if (acl == ERR_PTR(-EAGAIN))
+		acl = ERR_PTR(-EAGAIN);
+	else
+		acl = posix_acl_dup(acl);
+	spin_unlock(&inode->i_lock);
+	dprintk("NFS: nfs3_get_cached_acl(%s/%ld, %d) = %p\n", inode->i_sb->s_id,
+		inode->i_ino, type, acl);
+	return acl;
+}
+
+static void nfs3_cache_acls(struct inode *inode, struct posix_acl *acl,
+		    struct posix_acl *dfacl)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	dprintk("nfs3_cache_acls(%s/%ld, %p, %p)\n", inode->i_sb->s_id,
+		inode->i_ino, acl, dfacl);
+	spin_lock(&inode->i_lock);
+	__nfs3_forget_cached_acls(NFS_I(inode));
+	nfsi->acl_access = posix_acl_dup(acl);
+	nfsi->acl_default = posix_acl_dup(dfacl);
+	spin_unlock(&inode->i_lock);
+}
+
 struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
@@ -126,26 +189,32 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
 	struct nfs3_getaclres res = {
 		.fattr =	&fattr,
 	};
-	struct posix_acl *acl = NULL;
+	struct posix_acl *acl;
 	int status, count;
 
 	if (!nfs_server_capable(inode, NFS_CAP_ACLS))
 		return ERR_PTR(-EOPNOTSUPP);
 
-	switch (type) {
-		case ACL_TYPE_ACCESS:
-			args.mask = NFS_ACLCNT|NFS_ACL;
-			break;
-
-		case ACL_TYPE_DEFAULT:
-			if (!S_ISDIR(inode->i_mode))
-				return NULL;
-			args.mask = NFS_DFACLCNT|NFS_DFACL;
-			break;
-
-		default:
-			return ERR_PTR(-EINVAL);
-	}
+	status = nfs_revalidate_inode(server, inode);
+	if (status < 0)
+		return ERR_PTR(status);
+	acl = nfs3_get_cached_acl(inode, type);
+	if (acl != ERR_PTR(-EAGAIN))
+		return acl;
+	acl = NULL;
+
+	/*
+	 * Only get the access acl when explicitly requested: We don't
+	 * need it for access decisions, and only some applications use
+	 * it. Applications which request the access acl first are not
+	 * penalized from this optimization.
+	 */
+	if (type == ACL_TYPE_ACCESS)
+		args.mask |= NFS_ACLCNT|NFS_ACL;
+	if (S_ISDIR(inode->i_mode))
+		args.mask |= NFS_DFACLCNT|NFS_DFACL;
+	if (args.mask == 0)
+		return NULL;
 
 	dprintk("NFS call getacl\n");
 	status = rpc_call(server->client_acl, ACLPROC3_GETACL,
@@ -180,6 +249,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type)
 			res.acl_access = NULL;
 		}
 	}
+	nfs3_cache_acls(inode, res.acl_access, res.acl_default);
 
 	switch(type) {
 		case ACL_TYPE_ACCESS:
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index a9ddc196224d..7851569b31c6 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -882,4 +882,5 @@ struct nfs_rpc_ops	nfs_v3_clientops = {
 	.file_open	= nfs_open,
 	.file_release	= nfs_release,
 	.lock		= nfs3_proc_lock,
+	.clear_acl_cache = nfs3_forget_cached_acls,
 };
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 7662c5131b47..4ceac9ddac93 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -91,6 +91,8 @@ struct nfs_open_context {
  */
 struct nfs_delegation;
 
+struct posix_acl;
+
 /*
  * nfs fs inode data in memory
  */
@@ -144,6 +146,10 @@ struct nfs_inode {
 	atomic_t		data_updates;
 
 	struct nfs_access_entry	cache_access;
+#ifdef CONFIG_NFS_V3_ACL
+	struct posix_acl	*acl_access;
+	struct posix_acl	*acl_default;
+#endif
 
 	/*
 	 * This is the cookie verifier used for NFSv3 readdir
@@ -480,6 +486,7 @@ extern int nfs3_proc_setacl(struct inode *inode, int type,
 			    struct posix_acl *acl);
 extern int nfs3_proc_set_default_acl(struct inode *dir, struct inode *inode,
 		mode_t mode);
+extern void nfs3_forget_cached_acls(struct inode *inode);
 #else
 static inline int nfs3_proc_set_default_acl(struct inode *dir,
 					    struct inode *inode,
@@ -487,6 +494,10 @@ static inline int nfs3_proc_set_default_acl(struct inode *dir,
 {
 	return 0;
 }
+
+static inline void nfs3_forget_cached_acls(struct inode *inode)
+{
+}
 #endif /* CONFIG_NFS_V3_ACL */
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 00a926422765064cb28e218d4837411c88bf6a3e Mon Sep 17 00:00:00 2001
From: Olivier Galibert <galibert@pobox.com>
Date: Wed, 22 Jun 2005 17:16:29 +0000
Subject: [PATCH] NFS: Hide NFS server-generated readdir cookies from userland

 NFSv3 currently returns the unsigned 64-bit cookie directly to
 userspace. The following patch causes the kernel to generate
 loff_t offsets for the benefit of userland.
 The current server-generated READDIR cookie is cached in the
 nfs_open_context instead of in filp->f_pos, so we still end up working
 correctly under directory insertions/deletions.

 Signed-off-by: Olivier Galibert <galibert@pobox.com>
 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c           | 114 ++++++++++++++++++++++++++++++++++++++-----------
 fs/nfs/inode.c         |   2 +
 include/linux/nfs_fs.h |   3 ++
 3 files changed, 95 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2c6a95945684..fceef29c65a3 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -141,7 +141,9 @@ typedef struct {
 	struct page	*page;
 	unsigned long	page_index;
 	u32		*ptr;
-	u64		target;
+	u64		target_cookie;
+	int		target_index;
+	int		current_index;
 	struct nfs_entry *entry;
 	decode_dirent_t	decode;
 	int		plus;
@@ -225,14 +227,14 @@ void dir_page_release(nfs_readdir_descriptor_t *desc)
 
 /*
  * Given a pointer to a buffer that has already been filled by a call
- * to readdir, find the next entry.
+ * to readdir, find the next entry with cookie 'desc->target_cookie'.
  *
  * If the end of the buffer has been reached, return -EAGAIN, if not,
  * return the offset within the buffer of the next entry to be
  * read.
  */
 static inline
-int find_dirent(nfs_readdir_descriptor_t *desc, struct page *page)
+int find_dirent(nfs_readdir_descriptor_t *desc)
 {
 	struct nfs_entry *entry = desc->entry;
 	int		loop_count = 0,
@@ -240,7 +242,7 @@ int find_dirent(nfs_readdir_descriptor_t *desc, struct page *page)
 
 	while((status = dir_decode(desc)) == 0) {
 		dfprintk(VFS, "NFS: found cookie %Lu\n", (long long)entry->cookie);
-		if (entry->prev_cookie == desc->target)
+		if (entry->prev_cookie == desc->target_cookie)
 			break;
 		if (loop_count++ > 200) {
 			loop_count = 0;
@@ -252,8 +254,44 @@ int find_dirent(nfs_readdir_descriptor_t *desc, struct page *page)
 }
 
 /*
- * Find the given page, and call find_dirent() in order to try to
- * return the next entry.
+ * Given a pointer to a buffer that has already been filled by a call
+ * to readdir, find the entry at offset 'desc->target_index'.
+ *
+ * If the end of the buffer has been reached, return -EAGAIN, if not,
+ * return the offset within the buffer of the next entry to be
+ * read.
+ */
+static inline
+int find_dirent_index(nfs_readdir_descriptor_t *desc)
+{
+	struct nfs_entry *entry = desc->entry;
+	int		loop_count = 0,
+			status;
+
+	for(;;) {
+		status = dir_decode(desc);
+		if (status)
+			break;
+
+		dfprintk(VFS, "NFS: found cookie %Lu at index %d\n", (long long)entry->cookie, desc->current_index);
+
+		if (desc->target_index == desc->current_index) {
+			desc->target_cookie = entry->cookie;
+			break;
+		}
+		desc->current_index++;
+		if (loop_count++ > 200) {
+			loop_count = 0;
+			schedule();
+		}
+	}
+	dfprintk(VFS, "NFS: find_dirent_index() returns %d\n", status);
+	return status;
+}
+
+/*
+ * Find the given page, and call find_dirent() or find_dirent_index in
+ * order to try to return the next entry.
  */
 static inline
 int find_dirent_page(nfs_readdir_descriptor_t *desc)
@@ -276,7 +314,10 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc)
 	/* NOTE: Someone else may have changed the READDIRPLUS flag */
 	desc->page = page;
 	desc->ptr = kmap(page);		/* matching kunmap in nfs_do_filldir */
-	status = find_dirent(desc, page);
+	if (desc->target_cookie)
+		status = find_dirent(desc);
+	else
+		status = find_dirent_index(desc);
 	if (status < 0)
 		dir_page_release(desc);
  out:
@@ -291,7 +332,8 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc)
  * Recurse through the page cache pages, and return a
  * filled nfs_entry structure of the next directory entry if possible.
  *
- * The target for the search is 'desc->target'.
+ * The target for the search is 'desc->target_cookie' if non-0,
+ * 'desc->target_index' otherwise
  */
 static inline
 int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
@@ -299,7 +341,19 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
 	int		loop_count = 0;
 	int		res;
 
-	dfprintk(VFS, "NFS: readdir_search_pagecache() searching for cookie %Lu\n", (long long)desc->target);
+	if (desc->target_cookie)
+		dfprintk(VFS, "NFS: readdir_search_pagecache() searching for cookie %Lu\n", (long long)desc->target_cookie);
+	else
+		dfprintk(VFS, "NFS: readdir_search_pagecache() searching for cookie number %d\n", desc->target_index);
+
+	/* Always search-by-index from the beginning of the cache */
+	if (!(desc->target_cookie)) {
+		desc->page_index = 0;
+		desc->entry->cookie = desc->entry->prev_cookie = 0;
+		desc->entry->eof = 0;
+		desc->current_index = 0;
+	}
+
 	for (;;) {
 		res = find_dirent_page(desc);
 		if (res != -EAGAIN)
@@ -332,11 +386,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 	struct file	*file = desc->file;
 	struct nfs_entry *entry = desc->entry;
 	struct dentry	*dentry = NULL;
+	struct nfs_open_context *ctx = file->private_data;
 	unsigned long	fileid;
 	int		loop_count = 0,
 			res;
 
-	dfprintk(VFS, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n", (long long)desc->target);
+	dfprintk(VFS, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n", (long long)entry->cookie);
 
 	for(;;) {
 		unsigned d_type = DT_UNKNOWN;
@@ -356,10 +411,11 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 		}
 
 		res = filldir(dirent, entry->name, entry->len, 
-			      entry->prev_cookie, fileid, d_type);
+			      file->f_pos, fileid, d_type);
 		if (res < 0)
 			break;
-		file->f_pos = desc->target = entry->cookie;
+		file->f_pos++;
+		desc->target_cookie = entry->cookie;
 		if (dir_decode(desc) != 0) {
 			desc->page_index ++;
 			break;
@@ -369,10 +425,12 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 			schedule();
 		}
 	}
+	ctx->dir_pos        = file->f_pos;
+	ctx->dir_cookie     = desc->target_cookie;
 	dir_page_release(desc);
 	if (dentry != NULL)
 		dput(dentry);
-	dfprintk(VFS, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (long long)desc->target, res);
+	dfprintk(VFS, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (long long)desc->target_cookie, res);
 	return res;
 }
 
@@ -398,14 +456,14 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 	struct page	*page = NULL;
 	int		status;
 
-	dfprintk(VFS, "NFS: uncached_readdir() searching for cookie %Lu\n", (long long)desc->target);
+	dfprintk(VFS, "NFS: uncached_readdir() searching for cookie %Lu\n", (long long)desc->target_cookie);
 
 	page = alloc_page(GFP_HIGHUSER);
 	if (!page) {
 		status = -ENOMEM;
 		goto out;
 	}
-	desc->error = NFS_PROTO(inode)->readdir(file->f_dentry, cred, desc->target,
+	desc->error = NFS_PROTO(inode)->readdir(file->f_dentry, cred, desc->target_cookie,
 						page,
 						NFS_SERVER(inode)->dtsize,
 						desc->plus);
@@ -414,7 +472,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 	desc->ptr = kmap(page);		/* matching kunmap in nfs_do_filldir */
 	if (desc->error >= 0) {
 		if ((status = dir_decode(desc)) == 0)
-			desc->entry->prev_cookie = desc->target;
+			desc->entry->prev_cookie = desc->target_cookie;
 	} else
 		status = -EIO;
 	if (status < 0)
@@ -435,13 +493,15 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 	goto out;
 }
 
-/* The file offset position is now represented as a true offset into the
- * page cache as is the case in most of the other filesystems.
+/* The file offset position represents the dirent entry number.  A
+   last cookie cache takes care of the common case of reading the
+   whole directory.
  */
 static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
 	struct dentry	*dentry = filp->f_dentry;
 	struct inode	*inode = dentry->d_inode;
+	struct nfs_open_context *ctx = filp->private_data;
 	nfs_readdir_descriptor_t my_desc,
 			*desc = &my_desc;
 	struct nfs_entry my_entry;
@@ -458,17 +518,22 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	}
 
 	/*
-	 * filp->f_pos points to the file offset in the page cache.
-	 * but if the cache has meanwhile been zapped, we need to
-	 * read from the last dirent to revalidate f_pos
-	 * itself.
+	 * filp->f_pos points to the dirent entry number.
+	 * ctx->dir_pos has the number of the cached cookie.  We have
+	 * to either find the entry with the appropriate number or
+	 * revalidate the cookie.
 	 */
 	memset(desc, 0, sizeof(*desc));
 
 	desc->file = filp;
-	desc->target = filp->f_pos;
 	desc->decode = NFS_PROTO(inode)->decode_dirent;
 	desc->plus = NFS_USE_READDIRPLUS(inode);
+	desc->target_index = filp->f_pos;
+
+	if (filp->f_pos == ctx->dir_pos)
+		desc->target_cookie = ctx->dir_cookie;
+	else
+		desc->target_cookie = 0;
 
 	my_entry.cookie = my_entry.prev_cookie = 0;
 	my_entry.eof = 0;
@@ -478,9 +543,10 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	while(!desc->entry->eof) {
 		res = readdir_search_pagecache(desc);
+
 		if (res == -EBADCOOKIE) {
 			/* This means either end of directory */
-			if (desc->entry->cookie != desc->target) {
+			if (desc->target_cookie && desc->entry->cookie != desc->target_cookie) {
 				/* Or that the server has 'lost' a cookie */
 				res = uncached_readdir(desc, dirent, filldir);
 				if (res >= 0)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 8a8d57d9d660..9fa02e7984ac 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -891,6 +891,8 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rp
 		ctx->state = NULL;
 		ctx->lockowner = current->files;
 		ctx->error = 0;
+		ctx->dir_pos = 0;
+		ctx->dir_cookie = 0;
 	}
 	return ctx;
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 4ceac9ddac93..f810195ef7ad 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -84,6 +84,9 @@ struct nfs_open_context {
 	int error;
 
 	struct list_head list;
+
+	int dir_pos;		/* Directory cookie cache */
+	__u64 dir_cookie;
 };
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From f0dd2136da6d2070e12bfa6d199b136318e666c7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:29 +0000
Subject: [PATCH] NFS: Clean up readdir changes.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c           | 85 ++++++++++++++++++++++++++++----------------------
 fs/nfs/inode.c         |  1 -
 include/linux/nfs_fs.h |  1 -
 3 files changed, 48 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index fceef29c65a3..b38a57e78a63 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -51,8 +51,10 @@ static int nfs_mknod(struct inode *, struct dentry *, int, dev_t);
 static int nfs_rename(struct inode *, struct dentry *,
 		      struct inode *, struct dentry *);
 static int nfs_fsync_dir(struct file *, struct dentry *, int);
+static loff_t nfs_llseek_dir(struct file *, loff_t, int);
 
 struct file_operations nfs_dir_operations = {
+	.llseek		= nfs_llseek_dir,
 	.read		= generic_read_dir,
 	.readdir	= nfs_readdir,
 	.open		= nfs_opendir,
@@ -141,9 +143,8 @@ typedef struct {
 	struct page	*page;
 	unsigned long	page_index;
 	u32		*ptr;
-	u64		target_cookie;
-	int		target_index;
-	int		current_index;
+	u64		*dir_cookie;
+	loff_t		current_index;
 	struct nfs_entry *entry;
 	decode_dirent_t	decode;
 	int		plus;
@@ -227,7 +228,7 @@ void dir_page_release(nfs_readdir_descriptor_t *desc)
 
 /*
  * Given a pointer to a buffer that has already been filled by a call
- * to readdir, find the next entry with cookie 'desc->target_cookie'.
+ * to readdir, find the next entry with cookie '*desc->dir_cookie'.
  *
  * If the end of the buffer has been reached, return -EAGAIN, if not,
  * return the offset within the buffer of the next entry to be
@@ -241,8 +242,8 @@ int find_dirent(nfs_readdir_descriptor_t *desc)
 			status;
 
 	while((status = dir_decode(desc)) == 0) {
-		dfprintk(VFS, "NFS: found cookie %Lu\n", (long long)entry->cookie);
-		if (entry->prev_cookie == desc->target_cookie)
+		dfprintk(VFS, "NFS: found cookie %Lu\n", (unsigned long long)entry->cookie);
+		if (entry->prev_cookie == *desc->dir_cookie)
 			break;
 		if (loop_count++ > 200) {
 			loop_count = 0;
@@ -255,7 +256,7 @@ int find_dirent(nfs_readdir_descriptor_t *desc)
 
 /*
  * Given a pointer to a buffer that has already been filled by a call
- * to readdir, find the entry at offset 'desc->target_index'.
+ * to readdir, find the entry at offset 'desc->file->f_pos'.
  *
  * If the end of the buffer has been reached, return -EAGAIN, if not,
  * return the offset within the buffer of the next entry to be
@@ -273,10 +274,10 @@ int find_dirent_index(nfs_readdir_descriptor_t *desc)
 		if (status)
 			break;
 
-		dfprintk(VFS, "NFS: found cookie %Lu at index %d\n", (long long)entry->cookie, desc->current_index);
+		dfprintk(VFS, "NFS: found cookie %Lu at index %Ld\n", (unsigned long long)entry->cookie, desc->current_index);
 
-		if (desc->target_index == desc->current_index) {
-			desc->target_cookie = entry->cookie;
+		if (desc->file->f_pos == desc->current_index) {
+			*desc->dir_cookie = entry->cookie;
 			break;
 		}
 		desc->current_index++;
@@ -314,7 +315,7 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc)
 	/* NOTE: Someone else may have changed the READDIRPLUS flag */
 	desc->page = page;
 	desc->ptr = kmap(page);		/* matching kunmap in nfs_do_filldir */
-	if (desc->target_cookie)
+	if (*desc->dir_cookie != 0)
 		status = find_dirent(desc);
 	else
 		status = find_dirent_index(desc);
@@ -332,8 +333,8 @@ int find_dirent_page(nfs_readdir_descriptor_t *desc)
  * Recurse through the page cache pages, and return a
  * filled nfs_entry structure of the next directory entry if possible.
  *
- * The target for the search is 'desc->target_cookie' if non-0,
- * 'desc->target_index' otherwise
+ * The target for the search is '*desc->dir_cookie' if non-0,
+ * 'desc->file->f_pos' otherwise
  */
 static inline
 int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
@@ -341,18 +342,15 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
 	int		loop_count = 0;
 	int		res;
 
-	if (desc->target_cookie)
-		dfprintk(VFS, "NFS: readdir_search_pagecache() searching for cookie %Lu\n", (long long)desc->target_cookie);
-	else
-		dfprintk(VFS, "NFS: readdir_search_pagecache() searching for cookie number %d\n", desc->target_index);
-
 	/* Always search-by-index from the beginning of the cache */
-	if (!(desc->target_cookie)) {
+	if (*desc->dir_cookie == 0) {
+		dfprintk(VFS, "NFS: readdir_search_pagecache() searching for offset %Ld\n", (long long)desc->file->f_pos);
 		desc->page_index = 0;
 		desc->entry->cookie = desc->entry->prev_cookie = 0;
 		desc->entry->eof = 0;
 		desc->current_index = 0;
-	}
+	} else
+		dfprintk(VFS, "NFS: readdir_search_pagecache() searching for cookie %Lu\n", (unsigned long long)*desc->dir_cookie);
 
 	for (;;) {
 		res = find_dirent_page(desc);
@@ -386,7 +384,6 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 	struct file	*file = desc->file;
 	struct nfs_entry *entry = desc->entry;
 	struct dentry	*dentry = NULL;
-	struct nfs_open_context *ctx = file->private_data;
 	unsigned long	fileid;
 	int		loop_count = 0,
 			res;
@@ -415,7 +412,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 		if (res < 0)
 			break;
 		file->f_pos++;
-		desc->target_cookie = entry->cookie;
+		*desc->dir_cookie = entry->cookie;
 		if (dir_decode(desc) != 0) {
 			desc->page_index ++;
 			break;
@@ -425,12 +422,10 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
 			schedule();
 		}
 	}
-	ctx->dir_pos        = file->f_pos;
-	ctx->dir_cookie     = desc->target_cookie;
 	dir_page_release(desc);
 	if (dentry != NULL)
 		dput(dentry);
-	dfprintk(VFS, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (long long)desc->target_cookie, res);
+	dfprintk(VFS, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (unsigned long long)*desc->dir_cookie, res);
 	return res;
 }
 
@@ -456,14 +451,14 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 	struct page	*page = NULL;
 	int		status;
 
-	dfprintk(VFS, "NFS: uncached_readdir() searching for cookie %Lu\n", (long long)desc->target_cookie);
+	dfprintk(VFS, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)*desc->dir_cookie);
 
 	page = alloc_page(GFP_HIGHUSER);
 	if (!page) {
 		status = -ENOMEM;
 		goto out;
 	}
-	desc->error = NFS_PROTO(inode)->readdir(file->f_dentry, cred, desc->target_cookie,
+	desc->error = NFS_PROTO(inode)->readdir(file->f_dentry, cred, *desc->dir_cookie,
 						page,
 						NFS_SERVER(inode)->dtsize,
 						desc->plus);
@@ -472,7 +467,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 	desc->ptr = kmap(page);		/* matching kunmap in nfs_do_filldir */
 	if (desc->error >= 0) {
 		if ((status = dir_decode(desc)) == 0)
-			desc->entry->prev_cookie = desc->target_cookie;
+			desc->entry->prev_cookie = *desc->dir_cookie;
 	} else
 		status = -EIO;
 	if (status < 0)
@@ -501,7 +496,6 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
 	struct dentry	*dentry = filp->f_dentry;
 	struct inode	*inode = dentry->d_inode;
-	struct nfs_open_context *ctx = filp->private_data;
 	nfs_readdir_descriptor_t my_desc,
 			*desc = &my_desc;
 	struct nfs_entry my_entry;
@@ -519,21 +513,16 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	/*
 	 * filp->f_pos points to the dirent entry number.
-	 * ctx->dir_pos has the number of the cached cookie.  We have
+	 * *desc->dir_cookie has the cookie for the next entry. We have
 	 * to either find the entry with the appropriate number or
 	 * revalidate the cookie.
 	 */
 	memset(desc, 0, sizeof(*desc));
 
 	desc->file = filp;
+	desc->dir_cookie = &((struct nfs_open_context *)filp->private_data)->dir_cookie;
 	desc->decode = NFS_PROTO(inode)->decode_dirent;
 	desc->plus = NFS_USE_READDIRPLUS(inode);
-	desc->target_index = filp->f_pos;
-
-	if (filp->f_pos == ctx->dir_pos)
-		desc->target_cookie = ctx->dir_cookie;
-	else
-		desc->target_cookie = 0;
 
 	my_entry.cookie = my_entry.prev_cookie = 0;
 	my_entry.eof = 0;
@@ -546,7 +535,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 		if (res == -EBADCOOKIE) {
 			/* This means either end of directory */
-			if (desc->target_cookie && desc->entry->cookie != desc->target_cookie) {
+			if (*desc->dir_cookie && desc->entry->cookie != *desc->dir_cookie) {
 				/* Or that the server has 'lost' a cookie */
 				res = uncached_readdir(desc, dirent, filldir);
 				if (res >= 0)
@@ -579,6 +568,28 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 	return 0;
 }
 
+loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int origin)
+{
+	down(&filp->f_dentry->d_inode->i_sem);
+	switch (origin) {
+		case 1:
+			offset += filp->f_pos;
+		case 0:
+			if (offset >= 0)
+				break;
+		default:
+			offset = -EINVAL;
+			goto out;
+	}
+	if (offset != filp->f_pos) {
+		filp->f_pos = offset;
+		((struct nfs_open_context *)filp->private_data)->dir_cookie = 0;
+	}
+out:
+	up(&filp->f_dentry->d_inode->i_sem);
+	return offset;
+}
+
 /*
  * All directory operations under NFS are synchronous, so fsync()
  * is a dummy operation.
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9fa02e7984ac..6300e05e9463 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -891,7 +891,6 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rp
 		ctx->state = NULL;
 		ctx->lockowner = current->files;
 		ctx->error = 0;
-		ctx->dir_pos = 0;
 		ctx->dir_cookie = 0;
 	}
 	return ctx;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index f810195ef7ad..c90313bfa435 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -85,7 +85,6 @@ struct nfs_open_context {
 
 	struct list_head list;
 
-	int dir_pos;		/* Directory cookie cache */
 	__u64 dir_cookie;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 951a143b3fcf15cfa9d38250b7462f821db241db Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:30 +0000
Subject: [PATCH] NFS: Fix the file size revalidation

 Instead of looking at whether or not the file is open for writes before
 we agree to update the length using the server value, we should rather
 be looking at whether or not we are currently caching any writes.

 Failure to do so means in particular that we're not updating the file
 length correctly after obtaining a POSIX or BSD lock.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/direct.c        |  2 +-
 fs/nfs/inode.c         | 69 +++++++++++++-------------------------------------
 fs/nfs/write.c         |  4 +--
 include/linux/nfs_fs.h |  1 -
 4 files changed, 21 insertions(+), 55 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 68df803f27ca..d6a30c844de3 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -517,7 +517,7 @@ retry:
 	result = tot_bytes;
 
 out:
-	nfs_end_data_update_defer(inode);
+	nfs_end_data_update(inode);
 	nfs_writedata_free(wdata);
 	return result;
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6300e05e9463..b2d16758ced8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1147,27 +1147,6 @@ void nfs_end_data_update(struct inode *inode)
 	atomic_dec(&nfsi->data_updates);
 }
 
-/**
- * nfs_end_data_update_defer
- * @inode - pointer to inode
- * Declare end of the operations that will update file data
- * This will defer marking the inode as needing revalidation
- * unless there are no other pending updates.
- */
-void nfs_end_data_update_defer(struct inode *inode)
-{
-	struct nfs_inode *nfsi = NFS_I(inode);
-
-	if (atomic_dec_and_test(&nfsi->data_updates)) {
-		/* Mark the attribute cache for revalidation */
-		nfsi->flags |= NFS_INO_INVALID_ATTR;
-		/* Directories and symlinks: invalidate page cache too */
-		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			nfsi->flags |= NFS_INO_INVALID_DATA;
-		nfsi->cache_change_attribute ++;
-	}
-}
-
 /**
  * nfs_refresh_inode - verify consistency of the inode attribute cache
  * @inode - pointer to inode
@@ -1222,8 +1201,8 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 		if (!timespec_equal(&inode->i_mtime, &fattr->mtime)
 				|| cur_size != new_isize)
 			nfsi->flags |= NFS_INO_INVALID_ATTR;
-	} else if (S_ISREG(inode->i_mode) && new_isize > cur_size)
-			nfsi->flags |= NFS_INO_INVALID_ATTR;
+	} else if (new_isize != cur_size && nfsi->npages == 0)
+		nfsi->flags |= NFS_INO_INVALID_ATTR;
 
 	/* Have any file permissions changed? */
 	if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)
@@ -1257,10 +1236,8 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsigned long verifier)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	__u64		new_size;
-	loff_t		new_isize;
+	loff_t cur_isize, new_isize;
 	unsigned int	invalid = 0;
-	loff_t		cur_isize;
 	int data_unstable;
 
 	dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n",
@@ -1293,49 +1270,39 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
 	/* Are we racing with known updates of the metadata on the server? */
 	data_unstable = ! nfs_verify_change_attribute(inode, verifier);
 
-	/* Check if the file size agrees */
-	new_size = fattr->size;
+	/* Check if our cached file size is stale */
  	new_isize = nfs_size_to_loff_t(fattr->size);
 	cur_isize = i_size_read(inode);
-	if (cur_isize != new_size) {
-#ifdef NFS_DEBUG_VERBOSE
-		printk(KERN_DEBUG "NFS: isize change on %s/%ld\n", inode->i_sb->s_id, inode->i_ino);
-#endif
-		/*
-		 * If we have pending writebacks, things can get
-		 * messy.
-		 */
-		if (S_ISREG(inode->i_mode) && data_unstable) {
-			if (new_isize > cur_isize) {
+	if (new_isize != cur_isize) {
+		/* Do we perhaps have any outstanding writes? */
+		if (nfsi->npages == 0) {
+			/* No, but did we race with nfs_end_data_update()? */
+			if (verifier  ==  nfsi->cache_change_attribute) {
 				inode->i_size = new_isize;
-				invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+				invalid |= NFS_INO_INVALID_DATA;
 			}
-		} else {
+			invalid |= NFS_INO_INVALID_ATTR;
+		} else if (new_isize > cur_isize) {
 			inode->i_size = new_isize;
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
 		}
+		dprintk("NFS: isize change on server for file %s/%ld\n",
+				inode->i_sb->s_id, inode->i_ino);
 	}
 
-	/*
-	 * Note: we don't check inode->i_mtime since pipes etc.
-	 *       can change this value in VFS without requiring a
-	 *	 cache revalidation.
-	 */
+	/* Check if the mtime agrees */
 	if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
 		memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
-#ifdef NFS_DEBUG_VERBOSE
-		printk(KERN_DEBUG "NFS: mtime change on %s/%ld\n", inode->i_sb->s_id, inode->i_ino);
-#endif
+		dprintk("NFS: mtime change on server for file %s/%ld\n",
+				inode->i_sb->s_id, inode->i_ino);
 		if (!data_unstable)
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
 	}
 
 	if ((fattr->valid & NFS_ATTR_FATTR_V4)
 	    && nfsi->change_attr != fattr->change_attr) {
-#ifdef NFS_DEBUG_VERBOSE
-		printk(KERN_DEBUG "NFS: change_attr change on %s/%ld\n",
+		dprintk("NFS: change_attr change on server for file %s/%ld\n",
 		       inode->i_sb->s_id, inode->i_ino);
-#endif
 		nfsi->change_attr = fattr->change_attr;
 		if (!data_unstable)
 			invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 6f7a4af3bc46..c574d551f029 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -220,7 +220,7 @@ static int nfs_writepage_sync(struct nfs_open_context *ctx, struct inode *inode,
 		ClearPageError(page);
 
 io_error:
-	nfs_end_data_update_defer(inode);
+	nfs_end_data_update(inode);
 	nfs_writedata_free(wdata);
 	return written ? written : result;
 }
@@ -401,7 +401,7 @@ static void nfs_inode_remove_request(struct nfs_page *req)
 	nfsi->npages--;
 	if (!nfsi->npages) {
 		spin_unlock(&nfsi->req_lock);
-		nfs_end_data_update_defer(inode);
+		nfs_end_data_update(inode);
 		iput(inode);
 	} else
 		spin_unlock(&nfsi->req_lock);
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index c90313bfa435..211266c56ce5 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -294,7 +294,6 @@ extern void nfs_begin_attr_update(struct inode *);
 extern void nfs_end_attr_update(struct inode *);
 extern void nfs_begin_data_update(struct inode *);
 extern void nfs_end_data_update(struct inode *);
-extern void nfs_end_data_update_defer(struct inode *);
 extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, struct rpc_cred *cred);
 extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx);
 extern void put_nfs_open_context(struct nfs_open_context *ctx);
-- 
cgit v1.2.3-59-g8ed1b


From 7d52e86274e09fce8ac8f963e3605a84d0a305a7 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:30 +0000
Subject: [PATCH] NFS: Cleanup of caching code, and slight optimization of
 writes.

 Unless we're doing O_APPEND writes, we really don't care about revalidating
 the file length. Just make sure that we catch any page cache invalidations.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c          | 12 +++++++++---
 fs/nfs/inode.c         | 44 +++++++++++++++++++++++++++++---------------
 include/linux/nfs_fs.h |  1 +
 3 files changed, 39 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index a606708264ed..40436857ed42 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -333,9 +333,15 @@ nfs_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t
 	result = -EBUSY;
 	if (IS_SWAPFILE(inode))
 		goto out_swapfile;
-	result = nfs_revalidate_inode(NFS_SERVER(inode), inode);
-	if (result)
-		goto out;
+	/*
+	 * O_APPEND implies that we must revalidate the file length.
+	 */
+	if (iocb->ki_filp->f_flags & O_APPEND) {
+		result = nfs_revalidate_file_size(inode, iocb->ki_filp);
+		if (result)
+			goto out;
+	} else
+		nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
 
 	result = count;
 	if (!count)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index b2d16758ced8..a3922f4cc0a8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1062,21 +1062,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	if (verifier == nfsi->cache_change_attribute)
 		nfsi->flags &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME);
 	/* Do the page cache invalidation */
-	if (flags & NFS_INO_INVALID_DATA) {
-		if (S_ISREG(inode->i_mode)) {
-			if (filemap_fdatawrite(inode->i_mapping) == 0)
-				filemap_fdatawait(inode->i_mapping);
-			nfs_wb_all(inode);
-		}
-		nfsi->flags &= ~NFS_INO_INVALID_DATA;
-		invalidate_inode_pages2(inode->i_mapping);
-		memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
-		dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
-				inode->i_sb->s_id,
-				(long long)NFS_FILEID(inode));
-		/* This ensures we revalidate dentries */
-		nfsi->cache_change_attribute++;
-	}
+	nfs_revalidate_mapping(inode, inode->i_mapping);
 	if (flags & NFS_INO_INVALID_ACL)
 		nfs_zap_acl_cache(inode);
 	dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n",
@@ -1115,6 +1101,34 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	return __nfs_revalidate_inode(server, inode);
 }
 
+/**
+ * nfs_revalidate_mapping - Revalidate the pagecache
+ * @inode - pointer to host inode
+ * @mapping - pointer to mapping
+ */
+void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	if (nfsi->flags & NFS_INO_INVALID_DATA) {
+		if (S_ISREG(inode->i_mode)) {
+			if (filemap_fdatawrite(mapping) == 0)
+				filemap_fdatawait(mapping);
+			nfs_wb_all(inode);
+		}
+		invalidate_inode_pages2(mapping);
+		nfsi->flags &= ~NFS_INO_INVALID_DATA;
+		if (S_ISDIR(inode->i_mode)) {
+			memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+			/* This ensures we revalidate child dentries */
+			nfsi->cache_change_attribute++;
+		}
+		dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
+				inode->i_sb->s_id,
+				(long long)NFS_FILEID(inode));
+	}
+}
+
 /**
  * nfs_begin_data_update
  * @inode - pointer to inode
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 211266c56ce5..443103c13e53 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -289,6 +289,7 @@ extern int nfs_release(struct inode *, struct file *);
 extern int nfs_attribute_timeout(struct inode *inode);
 extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
 extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
+extern void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
 extern int nfs_setattr(struct dentry *, struct iattr *);
 extern void nfs_begin_attr_update(struct inode *);
 extern void nfs_end_attr_update(struct inode *);
-- 
cgit v1.2.3-59-g8ed1b


From fe51beecc55d0b0dce289e4758e7c529a642f63e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:30 +0000
Subject: [PATCH] NFS: Ensure that fstat() always returns the correct mtime

 Even if the file is open for writes.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c          | 28 ++++++++++++++++++++++------
 fs/nfs/inode.c         | 24 ++++++++++++++++--------
 include/linux/nfs_fs.h |  1 +
 3 files changed, 39 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 40436857ed42..5621ba9885f4 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -127,6 +127,21 @@ nfs_file_release(struct inode *inode, struct file *filp)
 	return NFS_PROTO(inode)->file_release(inode, filp);
 }
 
+/**
+ * nfs_revalidate_file - Revalidate the page cache & related metadata
+ * @inode - pointer to inode struct
+ * @filp - pointer to file
+ */
+static int nfs_revalidate_file(struct inode *inode, struct file *filp)
+{
+	int retval = 0;
+
+	if ((NFS_FLAGS(inode) & NFS_INO_REVAL_PAGECACHE) || nfs_attribute_timeout(inode))
+		retval = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	nfs_revalidate_mapping(inode, filp->f_mapping);
+	return retval;	/* propagate any revalidation error to the caller */
+}
+
 /**
  * nfs_revalidate_size - Revalidate the file size
  * @inode - pointer to inode struct
@@ -149,7 +164,8 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
 		goto force_reval;
 	if (nfsi->npages != 0)
 		return 0;
-	return nfs_revalidate_inode(server, inode);
+	if (!(NFS_FLAGS(inode) & NFS_INO_REVAL_PAGECACHE) && !nfs_attribute_timeout(inode))
+		return 0;
 force_reval:
 	return __nfs_revalidate_inode(server, inode);
 }
@@ -210,7 +226,7 @@ nfs_file_read(struct kiocb *iocb, char __user * buf, size_t count, loff_t pos)
 		dentry->d_parent->d_name.name, dentry->d_name.name,
 		(unsigned long) count, (unsigned long) pos);
 
-	result = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	result = nfs_revalidate_file(inode, iocb->ki_filp);
 	if (!result)
 		result = generic_file_aio_read(iocb, buf, count, pos);
 	return result;
@@ -228,7 +244,7 @@ nfs_file_sendfile(struct file *filp, loff_t *ppos, size_t count,
 		dentry->d_parent->d_name.name, dentry->d_name.name,
 		(unsigned long) count, (unsigned long long) *ppos);
 
-	res = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	res = nfs_revalidate_file(inode, filp);
 	if (!res)
 		res = generic_file_sendfile(filp, ppos, count, actor, target);
 	return res;
@@ -244,7 +260,7 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
 	dfprintk(VFS, "nfs: mmap(%s/%s)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name);
 
-	status = nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	status = nfs_revalidate_file(inode, file);
 	if (!status)
 		status = generic_file_mmap(file, vma);
 	return status;
@@ -340,8 +356,8 @@ nfs_file_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t
 		result = nfs_revalidate_file_size(inode, iocb->ki_filp);
 		if (result)
 			goto out;
-	} else
-		nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
+	}
+	nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
 
 	result = count;
 	if (!count)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index a3922f4cc0a8..4f545f382ba6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -620,9 +620,9 @@ nfs_zap_caches(struct inode *inode)
 
 	memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
 	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
-		nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+		nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
 	else
-		nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+		nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
 }
 
 static void nfs_zap_acl_cache(struct inode *inode)
@@ -1055,6 +1055,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 		goto out;
 	}
 	flags = nfsi->flags;
+	nfsi->flags &= ~NFS_INO_REVAL_PAGECACHE;
 	/*
 	 * We may need to keep the attributes marked as invalid if
 	 * we raced with nfs_end_attr_update().
@@ -1187,8 +1188,11 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 		if ((fattr->valid & NFS_ATTR_PRE_CHANGE) != 0
 				&& nfsi->change_attr == fattr->pre_change_attr)
 			nfsi->change_attr = fattr->change_attr;
-		if (!data_unstable && nfsi->change_attr != fattr->change_attr)
+		if (nfsi->change_attr != fattr->change_attr) {
 			nfsi->flags |= NFS_INO_INVALID_ATTR;
+			if (!data_unstable)
+				nfsi->flags |= NFS_INO_REVAL_PAGECACHE;
+		}
 	}
 
 	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
@@ -1211,12 +1215,16 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 	}
 
 	/* Verify a few of the more important attributes */
-	if (!data_unstable) {
-		if (!timespec_equal(&inode->i_mtime, &fattr->mtime)
-				|| cur_size != new_isize)
-			nfsi->flags |= NFS_INO_INVALID_ATTR;
-	} else if (new_isize != cur_size && nfsi->npages == 0)
+	if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
 		nfsi->flags |= NFS_INO_INVALID_ATTR;
+		if (!data_unstable)
+			nfsi->flags |= NFS_INO_REVAL_PAGECACHE;
+	}
+	if (cur_size != new_isize) {
+		nfsi->flags |= NFS_INO_INVALID_ATTR;
+		if (nfsi->npages == 0)
+			nfsi->flags |= NFS_INO_REVAL_PAGECACHE;
+	}
 
 	/* Have any file permissions changed? */
 	if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 443103c13e53..2954e44ed498 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -198,6 +198,7 @@ struct nfs_inode {
 #define NFS_INO_INVALID_ATIME	0x0020		/* cached atime is invalid */
 #define NFS_INO_INVALID_ACCESS	0x0040		/* cached access cred invalid */
 #define NFS_INO_INVALID_ACL	0x0080		/* cached acls are invalid */
+#define NFS_INO_REVAL_PAGECACHE	0x1000		/* must revalidate pagecache */
 
 static inline struct nfs_inode *NFS_I(struct inode *inode)
 {
-- 
cgit v1.2.3-59-g8ed1b


From c6a556b88adfacd2af90be84357c8165d716c27d Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:30 +0000
Subject: [PATCH] NFS: Make searching and waiting on busy writeback requests
 more efficient.

 Basically copies the VFS's method for tracking writebacks and applies
 it to the struct nfs_page.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/pagelist.c        | 29 ++++++++++++++++++++++++++++-
 fs/nfs/read.c            |  3 ---
 fs/nfs/write.c           | 19 +++++++++----------
 include/linux/nfs_page.h | 12 ++++++++----
 4 files changed, 45 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 80777f99a58a..356a33bb38a6 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -111,6 +111,33 @@ void nfs_unlock_request(struct nfs_page *req)
 	nfs_release_request(req);
 }
 
+/**
+ * nfs_set_page_writeback_locked - Lock a request for writeback
+ * @req: request to lock and tag as being under writeback
+ */
+int nfs_set_page_writeback_locked(struct nfs_page *req)
+{
+	struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode);
+
+	if (!nfs_lock_request(req))
+		return 0;
+	radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_WRITEBACK);
+	return 1;
+}
+
+/**
+ * nfs_clear_page_writeback - Unlock request and wake up sleepers
+ */
+void nfs_clear_page_writeback(struct nfs_page *req)
+{
+	struct nfs_inode *nfsi = NFS_I(req->wb_context->dentry->d_inode);
+
+	spin_lock(&nfsi->req_lock);
+	radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_WRITEBACK);
+	spin_unlock(&nfsi->req_lock);
+	nfs_unlock_request(req);
+}
+
 /**
  * nfs_clear_request - Free up all resources allocated to the request
  * @req:
@@ -301,7 +328,7 @@ nfs_scan_list(struct list_head *head, struct list_head *dst,
 		if (req->wb_index > idx_end)
 			break;
 
-		if (!nfs_lock_request(req))
+		if (!nfs_set_page_writeback_locked(req))
 			continue;
 		nfs_list_remove_request(req);
 		nfs_list_add_request(req, dst);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index a0042fb58634..6f866b8aa2d5 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -173,7 +173,6 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 	if (len < PAGE_CACHE_SIZE)
 		memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len);
 
-	nfs_lock_request(new);
 	nfs_list_add_request(new, &one_request);
 	nfs_pagein_one(&one_request, inode);
 	return 0;
@@ -185,7 +184,6 @@ static void nfs_readpage_release(struct nfs_page *req)
 
 	nfs_clear_request(req);
 	nfs_release_request(req);
-	nfs_unlock_request(req);
 
 	dprintk("NFS: read done (%s/%Ld %d@%Ld)\n",
 			req->wb_context->dentry->d_inode->i_sb->s_id,
@@ -553,7 +551,6 @@ readpage_async_filler(void *data, struct page *page)
 	}
 	if (len < PAGE_CACHE_SIZE)
 		memclear_highpage_flush(page, len, PAGE_CACHE_SIZE - len);
-	nfs_lock_request(new);
 	nfs_list_add_request(new, desc->head);
 	return 0;
 }
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 79b621a545b2..58a39b0486a7 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -503,13 +503,12 @@ nfs_wait_on_requests(struct inode *inode, unsigned long idx_start, unsigned int
 
 	spin_lock(&nfsi->req_lock);
 	next = idx_start;
-	while (radix_tree_gang_lookup(&nfsi->nfs_page_tree, (void **)&req, next, 1)) {
+	while (radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree, (void **)&req, next, 1, NFS_PAGE_TAG_WRITEBACK)) {
 		if (req->wb_index > idx_end)
 			break;
 
 		next = req->wb_index + 1;
-		if (!NFS_WBACK_BUSY(req))
-			continue;
+		BUG_ON(!NFS_WBACK_BUSY(req));
 
 		atomic_inc(&req->wb_count);
 		spin_unlock(&nfsi->req_lock);
@@ -821,7 +820,7 @@ out:
 #else
 	nfs_inode_remove_request(req);
 #endif
-	nfs_unlock_request(req);
+	nfs_clear_page_writeback(req);
 }
 
 static inline int flush_task_priority(int how)
@@ -952,7 +951,7 @@ out_bad:
 		nfs_writedata_free(data);
 	}
 	nfs_mark_request_dirty(req);
-	nfs_unlock_request(req);
+	nfs_clear_page_writeback(req);
 	return -ENOMEM;
 }
 
@@ -1002,7 +1001,7 @@ static int nfs_flush_one(struct list_head *head, struct inode *inode, int how)
 		struct nfs_page *req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_mark_request_dirty(req);
-		nfs_unlock_request(req);
+		nfs_clear_page_writeback(req);
 	}
 	return -ENOMEM;
 }
@@ -1029,7 +1028,7 @@ nfs_flush_list(struct list_head *head, int wpages, int how)
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_mark_request_dirty(req);
-		nfs_unlock_request(req);
+		nfs_clear_page_writeback(req);
 	}
 	return error;
 }
@@ -1121,7 +1120,7 @@ static void nfs_writeback_done_full(struct nfs_write_data *data, int status)
 		nfs_inode_remove_request(req);
 #endif
 	next:
-		nfs_unlock_request(req);
+		nfs_clear_page_writeback(req);
 	}
 }
 
@@ -1278,7 +1277,7 @@ nfs_commit_list(struct list_head *head, int how)
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_mark_request_commit(req);
-		nfs_unlock_request(req);
+		nfs_clear_page_writeback(req);
 	}
 	return -ENOMEM;
 }
@@ -1324,7 +1323,7 @@ nfs_commit_done(struct rpc_task *task)
 		dprintk(" mismatch\n");
 		nfs_mark_request_dirty(req);
 	next:
-		nfs_unlock_request(req);
+		nfs_clear_page_writeback(req);
 		res++;
 	}
 	sub_page_state(nr_unstable,res);
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 39e4895bcdb4..db40e4590ba2 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -19,6 +19,11 @@
 
 #include <asm/atomic.h>
 
+/*
+ * Valid flags for the radix tree
+ */
+#define NFS_PAGE_TAG_WRITEBACK	1
+
 /*
  * Valid flags for a dirty buffer
  */
@@ -62,6 +67,9 @@ extern	int nfs_coalesce_requests(struct list_head *, struct list_head *,
 				  unsigned int);
 extern  int nfs_wait_on_request(struct nfs_page *);
 extern	void nfs_unlock_request(struct nfs_page *req);
+extern  int nfs_set_page_writeback_locked(struct nfs_page *req);
+extern  void nfs_clear_page_writeback(struct nfs_page *req);
+
 
 /*
  * Lock the page of an asynchronous request without incrementing the wb_count
@@ -96,10 +104,6 @@ nfs_list_remove_request(struct nfs_page *req)
 {
 	if (list_empty(&req->wb_list))
 		return;
-	if (!NFS_WBACK_BUSY(req)) {
-		printk(KERN_ERR "NFS: unlocked request attempted removed from list!\n");
-		BUG();
-	}
 	list_del_init(&req->wb_list);
 	req->wb_list_head = NULL;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 3da28eb1c6545fe73263a24eba0996217490e1eb Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:31 +0000
Subject: [PATCH] NFS: Replace nfs_page insertion sort with a radix sort

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c           |  2 +-
 fs/nfs/pagelist.c        | 86 +++++++++++++++++++++++++++++++-----------------
 fs/nfs/write.c           | 71 ++++++++++++++++++---------------------
 include/linux/nfs_fs.h   |  4 +--
 include/linux/nfs_page.h | 18 ++++++++--
 5 files changed, 107 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 4f545f382ba6..4845911f1c63 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -135,7 +135,7 @@ nfs_write_inode(struct inode *inode, int sync)
 	int flags = sync ? FLUSH_WAIT : 0;
 	int ret;
 
-	ret = nfs_commit_inode(inode, 0, 0, flags);
+	ret = nfs_commit_inode(inode, flags);
 	if (ret < 0)
 		return ret;
 	return 0;
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 356a33bb38a6..d53857b148e2 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -177,36 +177,6 @@ nfs_release_request(struct nfs_page *req)
 	nfs_page_free(req);
 }
 
-/**
- * nfs_list_add_request - Insert a request into a sorted list
- * @req: request
- * @head: head of list into which to insert the request.
- *
- * Note that the wb_list is sorted by page index in order to facilitate
- * coalescing of requests.
- * We use an insertion sort that is optimized for the case of appended
- * writes.
- */
-void
-nfs_list_add_request(struct nfs_page *req, struct list_head *head)
-{
-	struct list_head *pos;
-
-#ifdef NFS_PARANOIA
-	if (!list_empty(&req->wb_list)) {
-		printk(KERN_ERR "NFS: Add to list failed!\n");
-		BUG();
-	}
-#endif
-	list_for_each_prev(pos, head) {
-		struct nfs_page	*p = nfs_list_entry(pos);
-		if (p->wb_index < req->wb_index)
-			break;
-	}
-	list_add(&req->wb_list, pos);
-	req->wb_list_head = head;
-}
-
 static int nfs_wait_bit_interruptible(void *word)
 {
 	int ret = 0;
@@ -291,6 +261,62 @@ nfs_coalesce_requests(struct list_head *head, struct list_head *dst,
 	return npages;
 }
 
+#define NFS_SCAN_MAXENTRIES 16
+/**
+ * nfs_scan_lock_dirty - Scan the radix tree for dirty requests
+ * @nfsi: NFS inode
+ * @dst: Destination list
+ * @idx_start: lower bound of page->index to scan
+ * @npages: idx_start + npages sets the upper bound to scan.
+ *
+ * Moves lockable requests tagged dirty in the radix tree onto @dst.
+ * If npages is set to 0, the entire address_space
+ * starting at index idx_start is scanned.
+ * The requests are *not* checked to ensure that they form a contiguous set.
+ * You must be holding the inode's req_lock when calling this function
+ */
+int
+nfs_scan_lock_dirty(struct nfs_inode *nfsi, struct list_head *dst,
+	      unsigned long idx_start, unsigned int npages)
+{
+	struct nfs_page *pgvec[NFS_SCAN_MAXENTRIES];
+	struct nfs_page *req;
+	unsigned long idx_end;
+	int found, i;
+	int res;
+
+	res = 0;
+	if (npages == 0)
+		idx_end = ~0;
+	else
+		idx_end = idx_start + npages - 1;
+
+	for (;;) {
+		found = radix_tree_gang_lookup_tag(&nfsi->nfs_page_tree,
+				(void **)&pgvec[0], idx_start, NFS_SCAN_MAXENTRIES,
+				NFS_PAGE_TAG_DIRTY);
+		if (found <= 0)
+			break;
+		for (i = 0; i < found; i++) {
+			req = pgvec[i];
+			if (req->wb_index > idx_end)
+				goto out;
+
+			idx_start = req->wb_index + 1;
+
+			if (nfs_set_page_writeback_locked(req)) {
+				radix_tree_tag_clear(&nfsi->nfs_page_tree,
+						req->wb_index, NFS_PAGE_TAG_DIRTY);
+				nfs_list_remove_request(req);
+				nfs_list_add_request(req, dst);
+				res++;
+			}
+		}
+	}
+out:
+	return res;
+}
+
 /**
  * nfs_scan_list - Scan a list for matching requests
  * @head: One of the NFS inode request lists
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 58a39b0486a7..5130eda231d7 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -352,7 +352,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 		if (err < 0)
 			goto out;
 	}
-	err = nfs_commit_inode(inode, 0, 0, wb_priority(wbc));
+	err = nfs_commit_inode(inode, wb_priority(wbc));
 	if (err > 0) {
 		wbc->nr_to_write -= err;
 		err = 0;
@@ -446,6 +446,8 @@ nfs_mark_request_dirty(struct nfs_page *req)
 	struct nfs_inode *nfsi = NFS_I(inode);
 
 	spin_lock(&nfsi->req_lock);
+	radix_tree_tag_set(&nfsi->nfs_page_tree,
+			req->wb_index, NFS_PAGE_TAG_DIRTY);
 	nfs_list_add_request(req, &nfsi->dirty);
 	nfsi->ndirty++;
 	spin_unlock(&nfsi->req_lock);
@@ -537,12 +539,15 @@ static int
 nfs_scan_dirty(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	int	res;
-	res = nfs_scan_list(&nfsi->dirty, dst, idx_start, npages);
-	nfsi->ndirty -= res;
-	sub_page_state(nr_dirty,res);
-	if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty))
-		printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n");
+	int res = 0;
+
+	if (nfsi->ndirty != 0) {
+		res = nfs_scan_lock_dirty(nfsi, dst, idx_start, npages);
+		nfsi->ndirty -= res;
+		sub_page_state(nr_dirty,res);
+		if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty))
+			printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n");
+	}
 	return res;
 }
 
@@ -561,11 +566,14 @@ static int
 nfs_scan_commit(struct inode *inode, struct list_head *dst, unsigned long idx_start, unsigned int npages)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
-	int	res;
-	res = nfs_scan_list(&nfsi->commit, dst, idx_start, npages);
-	nfsi->ncommit -= res;
-	if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
-		printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
+	int res = 0;
+
+	if (nfsi->ncommit != 0) {
+		res = nfs_scan_list(&nfsi->commit, dst, idx_start, npages);
+		nfsi->ncommit -= res;
+		if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
+			printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
+	}
 	return res;
 }
 #endif
@@ -1209,36 +1217,24 @@ static void nfs_commit_rpcsetup(struct list_head *head,
 		struct nfs_write_data *data, int how)
 {
 	struct rpc_task		*task = &data->task;
-	struct nfs_page		*first, *last;
+	struct nfs_page		*first;
 	struct inode		*inode;
-	loff_t			start, end, len;
 
 	/* Set up the RPC argument and reply structs
 	 * NB: take care not to mess about with data->commit et al. */
 
 	list_splice_init(head, &data->pages);
 	first = nfs_list_entry(data->pages.next);
-	last = nfs_list_entry(data->pages.prev);
 	inode = first->wb_context->dentry->d_inode;
 
-	/*
-	 * Determine the offset range of requests in the COMMIT call.
-	 * We rely on the fact that data->pages is an ordered list...
-	 */
-	start = req_offset(first);
-	end = req_offset(last) + last->wb_bytes;
-	len = end - start;
-	/* If 'len' is not a 32-bit quantity, pass '0' in the COMMIT call */
-	if (end >= i_size_read(inode) || len < 0 || len > (~((u32)0) >> 1))
-		len = 0;
-
 	data->inode	  = inode;
 	data->cred	  = first->wb_context->cred;
 
 	data->args.fh     = NFS_FH(data->inode);
-	data->args.offset = start;
-	data->args.count  = len;
-	data->res.count   = len;
+	/* Note: we always request a commit of the entire inode */
+	data->args.offset = 0;
+	data->args.count  = 0;
+	data->res.count   = 0;
 	data->res.fattr   = &data->fattr;
 	data->res.verf    = &data->verf;
 	
@@ -1357,8 +1353,7 @@ static int nfs_flush_inode(struct inode *inode, unsigned long idx_start,
 }
 
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-int nfs_commit_inode(struct inode *inode, unsigned long idx_start,
-		    unsigned int npages, int how)
+int nfs_commit_inode(struct inode *inode, int how)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 	LIST_HEAD(head);
@@ -1366,15 +1361,13 @@ int nfs_commit_inode(struct inode *inode, unsigned long idx_start,
 				error = 0;
 
 	spin_lock(&nfsi->req_lock);
-	res = nfs_scan_commit(inode, &head, idx_start, npages);
+	res = nfs_scan_commit(inode, &head, 0, 0);
+	spin_unlock(&nfsi->req_lock);
 	if (res) {
-		res += nfs_scan_commit(inode, &head, 0, 0);
-		spin_unlock(&nfsi->req_lock);
 		error = nfs_commit_list(&head, how);
-	} else
-		spin_unlock(&nfsi->req_lock);
-	if (error < 0)
-		return error;
+		if (error < 0)
+			return error;
+	}
 	return res;
 }
 #endif
@@ -1396,7 +1389,7 @@ int nfs_sync_inode(struct inode *inode, unsigned long idx_start,
 			error = nfs_flush_inode(inode, idx_start, npages, how);
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
 		if (error == 0)
-			error = nfs_commit_inode(inode, idx_start, npages, how);
+			error = nfs_commit_inode(inode, how);
 #endif
 	} while (error > 0);
 	return error;
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 2954e44ed498..8ea249110fb0 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -395,10 +395,10 @@ extern void nfs_commit_done(struct rpc_task *);
  */
 extern int  nfs_sync_inode(struct inode *, unsigned long, unsigned int, int);
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
-extern int  nfs_commit_inode(struct inode *, unsigned long, unsigned int, int);
+extern int  nfs_commit_inode(struct inode *, int);
 #else
 static inline int
-nfs_commit_inode(struct inode *inode, unsigned long idx_start, unsigned int npages, int how)
+nfs_commit_inode(struct inode *inode, int how)
 {
 	return 0;
 }
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index db40e4590ba2..da2e077b65e2 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -22,6 +22,7 @@
 /*
  * Valid flags for the radix tree
  */
+#define NFS_PAGE_TAG_DIRTY	0
 #define NFS_PAGE_TAG_WRITEBACK	1
 
 /*
@@ -31,6 +32,7 @@
 #define PG_NEED_COMMIT		1
 #define PG_NEED_RESCHED		2
 
+struct nfs_inode;
 struct nfs_page {
 	struct list_head	wb_list,	/* Defines state of page: */
 				*wb_list_head;	/*      read/write/commit */
@@ -59,8 +61,8 @@ extern	void nfs_clear_request(struct nfs_page *req);
 extern	void nfs_release_request(struct nfs_page *req);
 
 
-extern	void nfs_list_add_request(struct nfs_page *, struct list_head *);
-
+extern  int nfs_scan_lock_dirty(struct nfs_inode *nfsi, struct list_head *dst,
+				unsigned long idx_start, unsigned int npages);
 extern	int nfs_scan_list(struct list_head *, struct list_head *,
 			  unsigned long, unsigned int);
 extern	int nfs_coalesce_requests(struct list_head *, struct list_head *,
@@ -94,6 +96,18 @@ nfs_lock_request(struct nfs_page *req)
 	return 1;
 }
 
+/**
+ * nfs_list_add_request - Insert a request into a list
+ * @req: request
+ * @head: head of list into which to insert the request.
+ */
+static inline void
+nfs_list_add_request(struct nfs_page *req, struct list_head *head)
+{
+	list_add_tail(&req->wb_list, head);
+	req->wb_list_head = head;
+}
+
 
 /**
  * nfs_list_remove_request - Remove a request from its wb_list
-- 
cgit v1.2.3-59-g8ed1b


From ecdbf769b2cb8903e07cd482334c714d89fd1146 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:31 +0000
Subject: [PATCH] NLM: fix a client-side race on blocking locks.

 If the lock blocks, the server may send us a GRANTED message that
 races with the reply to our LOCK request. Make sure that we catch
 the GRANTED by queueing up our request on the nlm_blocked list
 before we send off the first LOCK rpc call.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/lockd/clntlock.c         | 99 ++++++++++++++++++++++++++-------------------
 fs/lockd/clntproc.c         | 40 ++++++++++++++----
 include/linux/lockd/lockd.h |  7 +++-
 3 files changed, 96 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 44adb84183b6..006bb9e14579 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -42,23 +42,51 @@ struct nlm_wait {
 static LIST_HEAD(nlm_blocked);
 
 /*
- * Block on a lock
+ * Queue up a lock for blocking so that the GRANTED request can see it
  */
-int
-nlmclnt_block(struct nlm_host *host, struct file_lock *fl, u32 *statp)
+int nlmclnt_prepare_block(struct nlm_rqst *req, struct nlm_host *host, struct file_lock *fl)
+{
+	struct nlm_wait *block;
+
+	BUG_ON(req->a_block != NULL);
+	block = kmalloc(sizeof(*block), GFP_KERNEL);
+	if (block == NULL)
+		return -ENOMEM;
+	block->b_host = host;
+	block->b_lock = fl;
+	init_waitqueue_head(&block->b_wait);
+	block->b_status = NLM_LCK_BLOCKED;
+
+	list_add(&block->b_list, &nlm_blocked);
+	req->a_block = block;
+
+	return 0;
+}
+
+void nlmclnt_finish_block(struct nlm_rqst *req)
 {
-	struct nlm_wait	block, **head;
-	int		err;
-	u32		pstate;
+	struct nlm_wait *block = req->a_block;
 
-	block.b_host   = host;
-	block.b_lock   = fl;
-	init_waitqueue_head(&block.b_wait);
-	block.b_status = NLM_LCK_BLOCKED;
-	list_add(&block.b_list, &nlm_blocked);
+	if (block == NULL)
+		return;
+	req->a_block = NULL;
+	list_del(&block->b_list);
+	kfree(block);
+}
 
-	/* Remember pseudo nsm state */
-	pstate = host->h_state;
+/*
+ * Block on a lock
+ */
+long nlmclnt_block(struct nlm_rqst *req, long timeout)
+{
+	struct nlm_wait	*block = req->a_block;
+	long ret;
+
+	/* A borken server might ask us to block even if we didn't
+	 * request it. Just say no!
+	 */
+	if (!req->a_args.block)
+		return -EAGAIN;
 
 	/* Go to sleep waiting for GRANT callback. Some servers seem
 	 * to lose callbacks, however, so we're going to poll from
@@ -68,23 +96,16 @@ nlmclnt_block(struct nlm_host *host, struct file_lock *fl, u32 *statp)
 	 * a 1 minute timeout would do. See the comment before
 	 * nlmclnt_lock for an explanation.
 	 */
-	sleep_on_timeout(&block.b_wait, 30*HZ);
+	ret = wait_event_interruptible_timeout(block->b_wait,
+			block->b_status != NLM_LCK_BLOCKED,
+			timeout);
 
-	list_del(&block.b_list);
-
-	if (!signalled()) {
-		*statp = block.b_status;
-		return 0;
+	if (block->b_status != NLM_LCK_BLOCKED) {
+		req->a_res.status = block->b_status;
+		block->b_status = NLM_LCK_BLOCKED;
 	}
 
-	/* Okay, we were interrupted. Cancel the pending request
-	 * unless the server has rebooted.
-	 */
-	if (pstate == host->h_state && (err = nlmclnt_cancel(host, fl)) < 0)
-		printk(KERN_NOTICE
-			"lockd: CANCEL call failed (errno %d)\n", -err);
-
-	return -ERESTARTSYS;
+	return ret;
 }
 
 /*
@@ -94,27 +115,23 @@ u32
 nlmclnt_grant(struct nlm_lock *lock)
 {
 	struct nlm_wait	*block;
+	u32 res = nlm_lck_denied;
 
 	/*
 	 * Look up blocked request based on arguments. 
 	 * Warning: must not use cookie to match it!
 	 */
 	list_for_each_entry(block, &nlm_blocked, b_list) {
-		if (nlm_compare_locks(block->b_lock, &lock->fl))
-			break;
+		if (nlm_compare_locks(block->b_lock, &lock->fl)) {
+			/* Alright, we found a lock. Set the return status
+			 * and wake up the caller
+			 */
+			block->b_status = NLM_LCK_GRANTED;
+			wake_up(&block->b_wait);
+			res = nlm_granted;
+		}
 	}
-
-	/* Ooops, no blocked request found. */
-	if (block == NULL)
-		return nlm_lck_denied;
-
-	/* Alright, we found the lock. Set the return status and
-	 * wake up the caller.
-	 */
-	block->b_status = NLM_LCK_GRANTED;
-	wake_up(&block->b_wait);
-
-	return nlm_granted;
+	return res;
 }
 
 /*
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index a4407619b1f1..fd77ed1d710d 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -21,6 +21,7 @@
 
 #define NLMDBG_FACILITY		NLMDBG_CLIENT
 #define NLMCLNT_GRACE_WAIT	(5*HZ)
+#define NLMCLNT_POLL_TIMEOUT	(30*HZ)
 
 static int	nlmclnt_test(struct nlm_rqst *, struct file_lock *);
 static int	nlmclnt_lock(struct nlm_rqst *, struct file_lock *);
@@ -553,7 +554,8 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 {
 	struct nlm_host	*host = req->a_host;
 	struct nlm_res	*resp = &req->a_res;
-	int		status;
+	long timeout;
+	int status;
 
 	if (!host->h_monitored && nsm_monitor(host) < 0) {
 		printk(KERN_NOTICE "lockd: failed to monitor %s\n",
@@ -562,15 +564,32 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 		goto out;
 	}
 
-	do {
-		if ((status = nlmclnt_call(req, NLMPROC_LOCK)) >= 0) {
-			if (resp->status != NLM_LCK_BLOCKED)
-				break;
-			status = nlmclnt_block(host, fl, &resp->status);
-		}
+	if (req->a_args.block) {
+		status = nlmclnt_prepare_block(req, host, fl);
 		if (status < 0)
 			goto out;
-	} while (resp->status == NLM_LCK_BLOCKED && req->a_args.block);
+	}
+	for(;;) {
+		status = nlmclnt_call(req, NLMPROC_LOCK);
+		if (status < 0)
+			goto out_unblock;
+		if (resp->status != NLM_LCK_BLOCKED)
+			break;
+		/* Wait on an NLM blocking lock */
+		timeout = nlmclnt_block(req, NLMCLNT_POLL_TIMEOUT);
+		/* Did a reclaimer thread notify us of a server reboot? */
+		if (resp->status ==  NLM_LCK_DENIED_GRACE_PERIOD)
+			continue;
+		if (resp->status != NLM_LCK_BLOCKED)
+			break;
+		if (timeout >= 0)
+			continue;
+		/* We were interrupted. Send a CANCEL request to the server
+		 * and exit
+		 */
+		status = (int)timeout;
+		goto out_unblock;
+	}
 
 	if (resp->status == NLM_LCK_GRANTED) {
 		fl->fl_u.nfs_fl.state = host->h_state;
@@ -579,6 +598,11 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 		do_vfs_lock(fl);
 	}
 	status = nlm_stat_to_errno(resp->status);
+out_unblock:
+	nlmclnt_finish_block(req);
+	/* Cancel the blocked request if it is still pending */
+	if (resp->status == NLM_LCK_BLOCKED)
+		nlmclnt_cancel(host, fl);
 out:
 	nlmclnt_release_lockargs(req);
 	return status;
diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
index 0d9d22578212..16d4e5a08e1d 100644
--- a/include/linux/lockd/lockd.h
+++ b/include/linux/lockd/lockd.h
@@ -72,6 +72,8 @@ struct nlm_lockowner {
 	uint32_t pid;
 };
 
+struct nlm_wait;
+
 /*
  * Memory chunk for NLM client RPC request.
  */
@@ -81,6 +83,7 @@ struct nlm_rqst {
 	struct nlm_host *	a_host;		/* host handle */
 	struct nlm_args		a_args;		/* arguments */
 	struct nlm_res		a_res;		/* result */
+	struct nlm_wait *	a_block;
 	char			a_owner[NLMCLNT_OHSIZE];
 };
 
@@ -142,7 +145,9 @@ extern unsigned long		nlmsvc_timeout;
  * Lockd client functions
  */
 struct nlm_rqst * nlmclnt_alloc_call(void);
-int		  nlmclnt_block(struct nlm_host *, struct file_lock *, u32 *);
+int		  nlmclnt_prepare_block(struct nlm_rqst *req, struct nlm_host *host, struct file_lock *fl);
+void		  nlmclnt_finish_block(struct nlm_rqst *req);
+long		  nlmclnt_block(struct nlm_rqst *req, long timeout);
 int		  nlmclnt_cancel(struct nlm_host *, struct file_lock *);
 u32		  nlmclnt_grant(struct nlm_lock *);
 void		  nlmclnt_recovery(struct nlm_host *, u32);
-- 
cgit v1.2.3-59-g8ed1b


From 8d0a8a9d0ec790086c64d210af413ac351d89e35 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 22 Jun 2005 17:16:32 +0000
Subject: [PATCH] NFSv4: Clean up nfs4 lock state accounting

 Ensure that lock owner structures are not released prematurely.

 Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h         |   9 +--
 fs/nfs/nfs4proc.c        |  69 ++++++++----------
 fs/nfs/nfs4state.c       | 178 +++++++++++++++++++++--------------------------
 include/linux/fs.h       |   1 +
 include/linux/nfs_fs_i.h |   5 ++
 5 files changed, 118 insertions(+), 144 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 7c6f1d668fbd..ec1a22d7b876 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -128,6 +128,7 @@ struct nfs4_state_owner {
 
 struct nfs4_lock_state {
 	struct list_head	ls_locks;	/* Other lock stateids */
+	struct nfs4_state *	ls_state;	/* Pointer to open state */
 	fl_owner_t		ls_owner;	/* POSIX lock owner */
 #define NFS_LOCK_INITIALIZED 1
 	int			ls_flags;
@@ -153,7 +154,7 @@ struct nfs4_state {
 
 	unsigned long flags;		/* Do we hold any locks? */
 	struct semaphore lock_sema;	/* Serializes file locking operations */
-	rwlock_t state_lock;		/* Protects the lock_states list */
+	spinlock_t state_lock;		/* Protects the lock_states list */
 
 	nfs4_stateid stateid;
 
@@ -225,12 +226,8 @@ extern void nfs4_close_state(struct nfs4_state *, mode_t);
 extern struct nfs4_state *nfs4_find_state(struct inode *, struct rpc_cred *, mode_t mode);
 extern void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp);
 extern void nfs4_schedule_state_recovery(struct nfs4_client *);
-extern struct nfs4_lock_state *nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t);
-extern struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t);
-extern void nfs4_put_lock_state(struct nfs4_lock_state *state);
+extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
 extern void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *ls);
-extern void nfs4_notify_setlk(struct nfs4_state *, struct file_lock *, struct nfs4_lock_state *);
-extern void nfs4_notify_unlck(struct nfs4_state *, struct file_lock *, struct nfs4_lock_state *);
 extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
 
 extern const nfs4_stateid zero_stateid;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index af80b5981486..0ddc20102d46 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2626,14 +2626,11 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 	down_read(&clp->cl_sem);
 	nlo.clientid = clp->cl_clientid;
 	down(&state->lock_sema);
-	lsp = nfs4_find_lock_state(state, request->fl_owner);
-	if (lsp)
-		nlo.id = lsp->ls_id; 
-	else {
-		spin_lock(&clp->cl_lock);
-		nlo.id = nfs4_alloc_lockowner_id(clp);
-		spin_unlock(&clp->cl_lock);
-	}
+	status = nfs4_set_lock_state(state, request);
+	if (status != 0)
+		goto out;
+	lsp = request->fl_u.nfs4_fl.owner;
+	nlo.id = lsp->ls_id; 
 	arg.u.lockt = &nlo;
 	status = rpc_call_sync(server->client, &msg, 0);
 	if (!status) {
@@ -2654,8 +2651,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 		request->fl_pid = 0;
 		status = 0;
 	}
-	if (lsp)
-		nfs4_put_lock_state(lsp);
+out:
 	up(&state->lock_sema);
 	up_read(&clp->cl_sem);
 	return status;
@@ -2715,28 +2711,26 @@ static int _nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock
 	};
 	struct nfs4_lock_state *lsp;
 	struct nfs_locku_opargs luargs;
-	int status = 0;
+	int status;
 			
 	down_read(&clp->cl_sem);
 	down(&state->lock_sema);
-	lsp = nfs4_find_lock_state(state, request->fl_owner);
-	if (!lsp)
+	status = nfs4_set_lock_state(state, request);
+	if (status != 0)
 		goto out;
+	lsp = request->fl_u.nfs4_fl.owner;
 	/* We might have lost the locks! */
-	if ((lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) {
-		luargs.seqid = lsp->ls_seqid;
-		memcpy(&luargs.stateid, &lsp->ls_stateid, sizeof(luargs.stateid));
-		arg.u.locku = &luargs;
-		status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR);
-		nfs4_increment_lock_seqid(status, lsp);
-	}
+	if ((lsp->ls_flags & NFS_LOCK_INITIALIZED) == 0)
+		goto out;
+	luargs.seqid = lsp->ls_seqid;
+	memcpy(&luargs.stateid, &lsp->ls_stateid, sizeof(luargs.stateid));
+	arg.u.locku = &luargs;
+	status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR);
+	nfs4_increment_lock_seqid(status, lsp);
 
-	if (status == 0) {
+	if (status == 0)
 		memcpy(&lsp->ls_stateid,  &res.u.stateid, 
 				sizeof(lsp->ls_stateid));
-		nfs4_notify_unlck(state, request, lsp);
-	}
-	nfs4_put_lock_state(lsp);
 out:
 	up(&state->lock_sema);
 	if (status == 0)
@@ -2762,7 +2756,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r
 {
 	struct inode *inode = state->inode;
 	struct nfs_server *server = NFS_SERVER(inode);
-	struct nfs4_lock_state *lsp;
+	struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner;
 	struct nfs_lockargs arg = {
 		.fh = NFS_FH(inode),
 		.type = nfs4_lck_type(cmd, request),
@@ -2784,9 +2778,6 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r
 	};
 	int status;
 
-	lsp = nfs4_get_lock_state(state, request->fl_owner);
-	if (lsp == NULL)
-		return -ENOMEM;
 	if (!(lsp->ls_flags & NFS_LOCK_INITIALIZED)) {
 		struct nfs4_state_owner *owner = state->owner;
 		struct nfs_open_to_lock otl = {
@@ -2808,27 +2799,26 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *r
 		* seqid mutating errors */
 		nfs4_increment_seqid(status, owner);
 		up(&owner->so_sema);
+		if (status == 0) {
+			lsp->ls_flags |= NFS_LOCK_INITIALIZED;
+			lsp->ls_seqid++;
+		}
 	} else {
 		struct nfs_exist_lock el = {
 			.seqid = lsp->ls_seqid,
 		};
 		memcpy(&el.stateid, &lsp->ls_stateid, sizeof(el.stateid));
 		largs.u.exist_lock = &el;
-		largs.new_lock_owner = 0;
 		arg.u.lock = &largs;
 		status = rpc_call_sync(server->client, &msg, RPC_TASK_NOINTR);
+		/* increment seqid on success, and * seqid mutating errors*/
+		nfs4_increment_lock_seqid(status, lsp);
 	}
-	/* increment seqid on success, and * seqid mutating errors*/
-	nfs4_increment_lock_seqid(status, lsp);
 	/* save the returned stateid. */
-	if (status == 0) {
+	if (status == 0)
 		memcpy(&lsp->ls_stateid, &res.u.stateid, sizeof(nfs4_stateid));
-		lsp->ls_flags |= NFS_LOCK_INITIALIZED;
-		if (!reclaim)
-			nfs4_notify_setlk(state, request, lsp);
-	} else if (status == -NFS4ERR_DENIED)
+	else if (status == -NFS4ERR_DENIED)
 		status = -EAGAIN;
-	nfs4_put_lock_state(lsp);
 	return status;
 }
 
@@ -2869,7 +2859,9 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 
 	down_read(&clp->cl_sem);
 	down(&state->lock_sema);
-	status = _nfs4_do_setlk(state, cmd, request, 0);
+	status = nfs4_set_lock_state(state, request);
+	if (status == 0)
+		status = _nfs4_do_setlk(state, cmd, request, 0);
 	up(&state->lock_sema);
 	if (status == 0) {
 		/* Note: we always want to sleep here! */
@@ -2927,7 +2919,6 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
 		if (signalled())
 			break;
 	} while(status < 0);
-
 	return status;
 }
 
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 591ad1d51880..afe587d82f1e 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -360,7 +360,7 @@ nfs4_alloc_open_state(void)
 	atomic_set(&state->count, 1);
 	INIT_LIST_HEAD(&state->lock_states);
 	init_MUTEX(&state->lock_sema);
-	rwlock_init(&state->state_lock);
+	spin_lock_init(&state->state_lock);
 	return state;
 }
 
@@ -542,16 +542,6 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
 	return NULL;
 }
 
-struct nfs4_lock_state *
-nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
-{
-	struct nfs4_lock_state *lsp;
-	read_lock(&state->state_lock);
-	lsp = __nfs4_find_lock_state(state, fl_owner);
-	read_unlock(&state->state_lock);
-	return lsp;
-}
-
 /*
  * Return a compatible lock_state. If no initialized lock_state structure
  * exists, return an uninitialized one.
@@ -568,14 +558,13 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 		return NULL;
 	lsp->ls_flags = 0;
 	lsp->ls_seqid = 0;	/* arbitrary */
-	lsp->ls_id = -1; 
 	memset(lsp->ls_stateid.data, 0, sizeof(lsp->ls_stateid.data));
 	atomic_set(&lsp->ls_count, 1);
 	lsp->ls_owner = fl_owner;
-	INIT_LIST_HEAD(&lsp->ls_locks);
 	spin_lock(&clp->cl_lock);
 	lsp->ls_id = nfs4_alloc_lockowner_id(clp);
 	spin_unlock(&clp->cl_lock);
+	INIT_LIST_HEAD(&lsp->ls_locks);
 	return lsp;
 }
 
@@ -585,121 +574,112 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
  *
  * The caller must be holding state->lock_sema and clp->cl_sem
  */
-struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
+static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
 {
-	struct nfs4_lock_state * lsp;
+	struct nfs4_lock_state *lsp, *new = NULL;
 	
-	lsp = nfs4_find_lock_state(state, owner);
-	if (lsp == NULL)
-		lsp = nfs4_alloc_lock_state(state, owner);
+	for(;;) {
+		spin_lock(&state->state_lock);
+		lsp = __nfs4_find_lock_state(state, owner);
+		if (lsp != NULL)
+			break;
+		if (new != NULL) {
+			new->ls_state = state;
+			list_add(&new->ls_locks, &state->lock_states);
+			set_bit(LK_STATE_IN_USE, &state->flags);
+			lsp = new;
+			new = NULL;
+			break;
+		}
+		spin_unlock(&state->state_lock);
+		new = nfs4_alloc_lock_state(state, owner);
+		if (new == NULL)
+			return NULL;
+	}
+	spin_unlock(&state->state_lock);
+	kfree(new);
 	return lsp;
 }
 
 /*
- * Byte-range lock aware utility to initialize the stateid of read/write
- * requests.
+ * Release reference to lock_state, and free it if we see that
+ * it is no longer in use
  */
-void
-nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner)
+static void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
 {
-	if (test_bit(LK_STATE_IN_USE, &state->flags)) {
-		struct nfs4_lock_state *lsp;
+	struct nfs4_state *state;
 
-		lsp = nfs4_find_lock_state(state, fl_owner);
-		if (lsp) {
-			memcpy(dst, &lsp->ls_stateid, sizeof(*dst));
-			nfs4_put_lock_state(lsp);
-			return;
-		}
-	}
-	memcpy(dst, &state->stateid, sizeof(*dst));
+	if (lsp == NULL)
+		return;
+	state = lsp->ls_state;
+	if (!atomic_dec_and_lock(&lsp->ls_count, &state->state_lock))
+		return;
+	list_del(&lsp->ls_locks);
+	if (list_empty(&state->lock_states))
+		clear_bit(LK_STATE_IN_USE, &state->flags);
+	spin_unlock(&state->state_lock);
+	kfree(lsp);
 }
 
-/*
-* Called with state->lock_sema and clp->cl_sem held.
-*/
-void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *lsp)
+static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
 {
-	if (status == NFS_OK || seqid_mutating_err(-status))
-		lsp->ls_seqid++;
-}
+	struct nfs4_lock_state *lsp = src->fl_u.nfs4_fl.owner;
 
-/* 
-* Check to see if the request lock (type FL_UNLK) effects the fl lock.
-*
-* fl and request must have the same posix owner
-*
-* return: 
-* 0 -> fl not effected by request
-* 1 -> fl consumed by request
-*/
+	dst->fl_u.nfs4_fl.owner = lsp;
+	atomic_inc(&lsp->ls_count);
+}
 
-static int
-nfs4_check_unlock(struct file_lock *fl, struct file_lock *request)
+static void nfs4_fl_release_lock(struct file_lock *fl)
 {
-	if (fl->fl_start >= request->fl_start && fl->fl_end <= request->fl_end)
-		return 1;
-	return 0;
+	nfs4_put_lock_state(fl->fl_u.nfs4_fl.owner);
 }
 
-/*
- * Post an initialized lock_state on the state->lock_states list.
- */
-void nfs4_notify_setlk(struct nfs4_state *state, struct file_lock *request, struct nfs4_lock_state *lsp)
+static struct file_lock_operations nfs4_fl_lock_ops = {
+	.fl_copy_lock = nfs4_fl_copy_lock,
+	.fl_release_private = nfs4_fl_release_lock,
+};
+
+int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
 {
-	if (!list_empty(&lsp->ls_locks))
-		return;
-	atomic_inc(&lsp->ls_count);
-	write_lock(&state->state_lock);
-	list_add(&lsp->ls_locks, &state->lock_states);
-	set_bit(LK_STATE_IN_USE, &state->flags);
-	write_unlock(&state->state_lock);
+	struct nfs4_lock_state *lsp;
+
+	if (fl->fl_ops != NULL)
+		return 0;
+	lsp = nfs4_get_lock_state(state, fl->fl_owner);
+	if (lsp == NULL)
+		return -ENOMEM;
+	fl->fl_u.nfs4_fl.owner = lsp;
+	fl->fl_ops = &nfs4_fl_lock_ops;
+	return 0;
 }
 
-/* 
- * to decide to 'reap' lock state:
- * 1) search i_flock for file_locks with fl.lock_state = to ls.
- * 2) determine if unlock will consume found lock. 
- * 	if so, reap
- *
- * 	else, don't reap.
- *
+/*
+ * Byte-range lock aware utility to initialize the stateid of read/write
+ * requests.
  */
-void
-nfs4_notify_unlck(struct nfs4_state *state, struct file_lock *request, struct nfs4_lock_state *lsp)
+void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner)
 {
-	struct inode *inode = state->inode;
-	struct file_lock *fl;
+	struct nfs4_lock_state *lsp;
 
-	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
-		if (!(fl->fl_flags & FL_POSIX))
-			continue;
-		if (fl->fl_owner != lsp->ls_owner)
-			continue;
-		/* Exit if we find at least one lock which is not consumed */
-		if (nfs4_check_unlock(fl,request) == 0)
-			return;
-	}
+	memcpy(dst, &state->stateid, sizeof(*dst));
+	if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
+		return;
 
-	write_lock(&state->state_lock);
-	list_del_init(&lsp->ls_locks);
-	if (list_empty(&state->lock_states))
-		clear_bit(LK_STATE_IN_USE, &state->flags);
-	write_unlock(&state->state_lock);
+	spin_lock(&state->state_lock);
+	lsp = __nfs4_find_lock_state(state, fl_owner);
+	if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
+		memcpy(dst, &lsp->ls_stateid, sizeof(*dst));
+	spin_unlock(&state->state_lock);
 	nfs4_put_lock_state(lsp);
 }
 
 /*
- * Release reference to lock_state, and free it if we see that
- * it is no longer in use
- */
-void
-nfs4_put_lock_state(struct nfs4_lock_state *lsp)
+* Called with state->lock_sema and clp->cl_sem held.
+*/
+void nfs4_increment_lock_seqid(int status, struct nfs4_lock_state *lsp)
 {
-	if (!atomic_dec_and_test(&lsp->ls_count))
-		return;
-	BUG_ON (!list_empty(&lsp->ls_locks));
-	kfree(lsp);
+	if (status == NFS_OK || seqid_mutating_err(-status))
+		lsp->ls_seqid++;
 }
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9b8b696d4f15..e5a8db00df29 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -674,6 +674,7 @@ struct file_lock {
 	struct lock_manager_operations *fl_lmops;	/* Callbacks for lockmanagers */
 	union {
 		struct nfs_lock_info	nfs_fl;
+		struct nfs4_lock_info	nfs4_fl;
 	} fl_u;
 };
 
diff --git a/include/linux/nfs_fs_i.h b/include/linux/nfs_fs_i.h
index e9a749588a7b..e2c18dabff86 100644
--- a/include/linux/nfs_fs_i.h
+++ b/include/linux/nfs_fs_i.h
@@ -16,6 +16,11 @@ struct nfs_lock_info {
 	struct nlm_lockowner *owner;
 };
 
+struct nfs4_lock_state;
+struct nfs4_lock_info {
+	struct nfs4_lock_state *owner;
+};
+
 /*
  * Lock flag values
  */
-- 
cgit v1.2.3-59-g8ed1b


From 6ca4f65e6b390d09e1de7280cf9fd4f5d8e4b48b Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Wed, 22 Jun 2005 22:04:55 -0700
Subject: [NETPOLL]: Set poll_owner to -1 before unlocking in
 netpoll_poll_unlock()

This trivial patch moves the assignment of poll_owner to -1 inside of
the lock.  This fixes a potential SMP race in the code.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netpoll.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index c0d8b90c5202..449a4fde6587 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -53,8 +53,8 @@ static inline void netpoll_poll_lock(struct net_device *dev)
 static inline void netpoll_poll_unlock(struct net_device *dev)
 {
 	if (dev->np) {
-		spin_unlock(&dev->np->poll_lock);
 		dev->np->poll_owner = -1;
+		spin_unlock(&dev->np->poll_lock);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 115c1d6e61b70851d9a363328c3b8d4c2559a1d3 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Wed, 22 Jun 2005 22:05:31 -0700
Subject: [NETPOLL]: Introduce a netpoll_info struct

This patch introduces a netpoll_info structure, which the struct net_device
will now point to instead of pointing to a struct netpoll.  The reason for
this is two-fold: 1) fields such as the rx_flags, poll_owner, and poll_lock
should be maintained per net_device, not per netpoll;  and 2) this is a first
step in providing support for multiple netpoll clients to register against the
same net_device.

The struct netpoll is now pointed to by the netpoll_info structure.  As
such, the previous behaviour of the code is preserved.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  4 ++--
 include/linux/netpoll.h   | 25 ++++++++++++++-------
 net/core/netpoll.c        | 57 +++++++++++++++++++++++++++++++----------------
 3 files changed, 57 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index ba5d1236aa17..d6afd440cf7b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -41,7 +41,7 @@
 struct divert_blk;
 struct vlan_group;
 struct ethtool_ops;
-struct netpoll;
+struct netpoll_info;
 					/* source back-compat hooks */
 #define SET_ETHTOOL_OPS(netdev,ops) \
 	( (netdev)->ethtool_ops = (ops) )
@@ -468,7 +468,7 @@ struct net_device
 						     unsigned char *haddr);
 	int			(*neigh_setup)(struct net_device *dev, struct neigh_parms *);
 #ifdef CONFIG_NETPOLL
-	struct netpoll		*np;
+	struct netpoll_info	*npinfo;
 #endif
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	void                    (*poll_controller)(struct net_device *dev);
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 449a4fde6587..388cd91bc7a6 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -16,14 +16,18 @@ struct netpoll;
 struct netpoll {
 	struct net_device *dev;
 	char dev_name[16], *name;
-	int rx_flags;
 	void (*rx_hook)(struct netpoll *, int, char *, int);
 	void (*drop)(struct sk_buff *skb);
 	u32 local_ip, remote_ip;
 	u16 local_port, remote_port;
 	unsigned char local_mac[6], remote_mac[6];
+};
+
+struct netpoll_info {
 	spinlock_t poll_lock;
 	int poll_owner;
+	int rx_flags;
+	struct netpoll *np;
 };
 
 void netpoll_poll(struct netpoll *np);
@@ -39,22 +43,27 @@ void netpoll_queue(struct sk_buff *skb);
 #ifdef CONFIG_NETPOLL
 static inline int netpoll_rx(struct sk_buff *skb)
 {
-	return skb->dev->np && skb->dev->np->rx_flags && __netpoll_rx(skb);
+	struct netpoll_info *npinfo = skb->dev->npinfo;
+
+	if (!npinfo || !npinfo->rx_flags)
+		return 0;
+
+	return npinfo->np && __netpoll_rx(skb);
 }
 
 static inline void netpoll_poll_lock(struct net_device *dev)
 {
-	if (dev->np) {
-		spin_lock(&dev->np->poll_lock);
-		dev->np->poll_owner = smp_processor_id();
+	if (dev->npinfo) {
+		spin_lock(&dev->npinfo->poll_lock);
+		dev->npinfo->poll_owner = smp_processor_id();
 	}
 }
 
 static inline void netpoll_poll_unlock(struct net_device *dev)
 {
-	if (dev->np) {
-		dev->np->poll_owner = -1;
-		spin_unlock(&dev->np->poll_lock);
+	if (dev->npinfo) {
+		dev->npinfo->poll_owner = -1;
+		spin_unlock(&dev->npinfo->poll_lock);
 	}
 }
 
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a119696d5521..ab3c0c9713b0 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -130,19 +130,20 @@ static int checksum_udp(struct sk_buff *skb, struct udphdr *uh,
  */
 static void poll_napi(struct netpoll *np)
 {
+	struct netpoll_info *npinfo = np->dev->npinfo;
 	int budget = 16;
 
 	if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) &&
-	    np->poll_owner != smp_processor_id() &&
-	    spin_trylock(&np->poll_lock)) {
-		np->rx_flags |= NETPOLL_RX_DROP;
+	    npinfo->poll_owner != smp_processor_id() &&
+	    spin_trylock(&npinfo->poll_lock)) {
+		npinfo->rx_flags |= NETPOLL_RX_DROP;
 		atomic_inc(&trapped);
 
 		np->dev->poll(np->dev, &budget);
 
 		atomic_dec(&trapped);
-		np->rx_flags &= ~NETPOLL_RX_DROP;
-		spin_unlock(&np->poll_lock);
+		npinfo->rx_flags &= ~NETPOLL_RX_DROP;
+		spin_unlock(&npinfo->poll_lock);
 	}
 }
 
@@ -245,6 +246,7 @@ repeat:
 static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 {
 	int status;
+	struct netpoll_info *npinfo;
 
 repeat:
 	if(!np || !np->dev || !netif_running(np->dev)) {
@@ -253,8 +255,9 @@ repeat:
 	}
 
 	/* avoid recursion */
-	if(np->poll_owner == smp_processor_id() ||
-	   np->dev->xmit_lock_owner == smp_processor_id()) {
+	npinfo = np->dev->npinfo;
+	if (npinfo->poll_owner == smp_processor_id() ||
+	    np->dev->xmit_lock_owner == smp_processor_id()) {
 		if (np->drop)
 			np->drop(skb);
 		else
@@ -341,14 +344,18 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 
 static void arp_reply(struct sk_buff *skb)
 {
+	struct netpoll_info *npinfo = skb->dev->npinfo;
 	struct arphdr *arp;
 	unsigned char *arp_ptr;
 	int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
 	u32 sip, tip;
 	struct sk_buff *send_skb;
-	struct netpoll *np = skb->dev->np;
+	struct netpoll *np = NULL;
 
-	if (!np) return;
+	if (npinfo)
+		np = npinfo->np;
+	if (!np)
+		return;
 
 	/* No arp on this interface */
 	if (skb->dev->flags & IFF_NOARP)
@@ -429,7 +436,7 @@ int __netpoll_rx(struct sk_buff *skb)
 	int proto, len, ulen;
 	struct iphdr *iph;
 	struct udphdr *uh;
-	struct netpoll *np = skb->dev->np;
+	struct netpoll *np = skb->dev->npinfo->np;
 
 	if (!np->rx_hook)
 		goto out;
@@ -611,9 +618,7 @@ int netpoll_setup(struct netpoll *np)
 {
 	struct net_device *ndev = NULL;
 	struct in_device *in_dev;
-
-	np->poll_lock = SPIN_LOCK_UNLOCKED;
-	np->poll_owner = -1;
+	struct netpoll_info *npinfo;
 
 	if (np->dev_name)
 		ndev = dev_get_by_name(np->dev_name);
@@ -624,7 +629,16 @@ int netpoll_setup(struct netpoll *np)
 	}
 
 	np->dev = ndev;
-	ndev->np = np;
+	if (!ndev->npinfo) {
+		npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
+		if (!npinfo)
+			goto release;
+
+		npinfo->np = NULL;
+		npinfo->poll_lock = SPIN_LOCK_UNLOCKED;
+		npinfo->poll_owner = -1;
+	} else
+		npinfo = ndev->npinfo;
 
 	if (!ndev->poll_controller) {
 		printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
@@ -693,12 +707,15 @@ int netpoll_setup(struct netpoll *np)
 	}
 
 	if(np->rx_hook)
-		np->rx_flags = NETPOLL_RX_ENABLED;
+		npinfo->rx_flags = NETPOLL_RX_ENABLED;
+	npinfo->np = np;
+	ndev->npinfo = npinfo;
 
 	return 0;
 
  release:
-	ndev->np = NULL;
+	if (!ndev->npinfo)
+		kfree(npinfo);
 	np->dev = NULL;
 	dev_put(ndev);
 	return -1;
@@ -706,9 +723,11 @@ int netpoll_setup(struct netpoll *np)
 
 void netpoll_cleanup(struct netpoll *np)
 {
-	if (np->dev)
-		np->dev->np = NULL;
-	dev_put(np->dev);
+	if (np->dev) {
+		if (np->dev->npinfo)
+			np->dev->npinfo->np = NULL;
+		dev_put(np->dev);
+	}
 	np->dev = NULL;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From fbeec2e1552949002065435c9829dc244ad85407 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Wed, 22 Jun 2005 22:05:59 -0700
Subject: [NETPOLL]: allow multiple netpoll_clients to register against one
 interface

This patch provides support for registering multiple netpoll clients to the
same network device.  Only one of these clients may register an rx_hook,
however.  In practice, this restriction has not been problematic.  It is
worth mentioning, though, that the current design can be easily extended to
allow for the registration of multiple rx_hooks.

The basic idea of the patch is that the rx_np pointer in the netpoll_info
structure points to the struct netpoll that has rx_hook filled in.  Aside
from this one case, there is no need for a pointer from the struct
net_device to an individual struct netpoll.

A lock is introduced to protect the setting and clearing of the rx_np
pointer.  The pointer will only be cleared upon netpoll client module
removal, and the lock should be uncontested.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netpoll.h | 15 ++++++++++++---
 net/core/netpoll.c      | 39 +++++++++++++++++++++++++++++----------
 2 files changed, 41 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 388cd91bc7a6..bcd0ac33f592 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -27,7 +27,8 @@ struct netpoll_info {
 	spinlock_t poll_lock;
 	int poll_owner;
 	int rx_flags;
-	struct netpoll *np;
+	spinlock_t rx_lock;
+	struct netpoll *rx_np; /* netpoll that registered an rx_hook */
 };
 
 void netpoll_poll(struct netpoll *np);
@@ -44,11 +45,19 @@ void netpoll_queue(struct sk_buff *skb);
 static inline int netpoll_rx(struct sk_buff *skb)
 {
 	struct netpoll_info *npinfo = skb->dev->npinfo;
+	unsigned long flags;
+	int ret = 0;
 
-	if (!npinfo || !npinfo->rx_flags)
+	if (!npinfo || (!npinfo->rx_np && !npinfo->rx_flags))
 		return 0;
 
-	return npinfo->np && __netpoll_rx(skb);
+	spin_lock_irqsave(&npinfo->rx_lock, flags);
+	/* check rx_flags again with the lock held */
+	if (npinfo->rx_flags && __netpoll_rx(skb))
+		ret = 1;
+	spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+
+	return ret;
 }
 
 static inline void netpoll_poll_lock(struct net_device *dev)
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index ab3c0c9713b0..c327c9edadc5 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -349,11 +349,15 @@ static void arp_reply(struct sk_buff *skb)
 	unsigned char *arp_ptr;
 	int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
 	u32 sip, tip;
+	unsigned long flags;
 	struct sk_buff *send_skb;
 	struct netpoll *np = NULL;
 
-	if (npinfo)
-		np = npinfo->np;
+	spin_lock_irqsave(&npinfo->rx_lock, flags);
+	if (npinfo->rx_np && npinfo->rx_np->dev == skb->dev)
+		np = npinfo->rx_np;
+	spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+
 	if (!np)
 		return;
 
@@ -436,9 +440,9 @@ int __netpoll_rx(struct sk_buff *skb)
 	int proto, len, ulen;
 	struct iphdr *iph;
 	struct udphdr *uh;
-	struct netpoll *np = skb->dev->npinfo->np;
+	struct netpoll *np = skb->dev->npinfo->rx_np;
 
-	if (!np->rx_hook)
+	if (!np)
 		goto out;
 	if (skb->dev->type != ARPHRD_ETHER)
 		goto out;
@@ -619,6 +623,7 @@ int netpoll_setup(struct netpoll *np)
 	struct net_device *ndev = NULL;
 	struct in_device *in_dev;
 	struct netpoll_info *npinfo;
+	unsigned long flags;
 
 	if (np->dev_name)
 		ndev = dev_get_by_name(np->dev_name);
@@ -634,9 +639,10 @@ int netpoll_setup(struct netpoll *np)
 		if (!npinfo)
 			goto release;
 
-		npinfo->np = NULL;
+		npinfo->rx_np = NULL;
 		npinfo->poll_lock = SPIN_LOCK_UNLOCKED;
 		npinfo->poll_owner = -1;
+		npinfo->rx_lock = SPIN_LOCK_UNLOCKED;
 	} else
 		npinfo = ndev->npinfo;
 
@@ -706,9 +712,13 @@ int netpoll_setup(struct netpoll *np)
 		       np->name, HIPQUAD(np->local_ip));
 	}
 
-	if(np->rx_hook)
-		npinfo->rx_flags = NETPOLL_RX_ENABLED;
-	npinfo->np = np;
+	if (np->rx_hook) {
+		spin_lock_irqsave(&npinfo->rx_lock, flags);
+		npinfo->rx_flags |= NETPOLL_RX_ENABLED;
+		npinfo->rx_np = np;
+		spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+	}
+	/* last thing to do is link it to the net device structure */
 	ndev->npinfo = npinfo;
 
 	return 0;
@@ -723,11 +733,20 @@ int netpoll_setup(struct netpoll *np)
 
 void netpoll_cleanup(struct netpoll *np)
 {
+	struct netpoll_info *npinfo;
+	unsigned long flags;
+
 	if (np->dev) {
-		if (np->dev->npinfo)
-			np->dev->npinfo->np = NULL;
+		npinfo = np->dev->npinfo;
+		if (npinfo && npinfo->rx_np == np) {
+			spin_lock_irqsave(&npinfo->rx_lock, flags);
+			npinfo->rx_np = NULL;
+			npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
+			spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+		}
 		dev_put(np->dev);
 	}
+
 	np->dev = NULL;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From cb65d506c34c86df5bcef939ce5a8666a451bd8b Mon Sep 17 00:00:00 2001
From: Shaun Pereira <spereira@tusc.com.au>
Date: Wed, 22 Jun 2005 22:15:01 -0700
Subject: [X25]: Selective sub-address matching with call user data.

From: Shaun Pereira <spereira@tusc.com.au>

This is the first (independent of the second) patch of two that I am
working on with x25 on linux (tested with xot on a cisco router).  Details
are as follows.

Current state of module:

A server using the current implementation (2.6.11.7) of the x25 module will
accept a call request/ incoming call packet at the listening x.25 address,
from all callers to that address, as long as NO call user data is present
in the packet header.

If the server needs to choose to accept a particular call request/ incoming
call packet arriving at its listening x25 address, then the kernel has to
allow a match of call user data present in the call request packet with its
own.  This is required when multiple servers listen at the same x25 address
and device interface.  The kernel currently matches ALL call user data, if
present.

Current Changes:

This patch is a follow up to the patch submitted previously by Andrew
Hendry, and allows the user to selectively control the number of octets of
call user data in the call request packet, that the kernel will match.  By
default no call user data is matched, even if call user data is present.
To allow call user data matching, a cudmatchlength > 0 has to be passed
into the kernel after which the passed number of octets will be matched.
Otherwise the kernel behavior is exactly as the original implementation.

This patch also ensures that, as is normally the case, no call user data
will be present in the call accepted / call connected packet sent back to
the caller.

Future Changes on next patch:

There are cases however when call user data may be present in the call
accepted packet.  According to the X.25 recommendation (ITU-T 10/96)
section 5.2.3.2 call user data may be present in the call accepted packet
provided the fast select facility is used.  My next patch will include this
fast select utility and the ability to send up to 128 octets call user data
in the call accepted packet provided the fast select facility is used.  I
am currently testing this, again with xot on linux and cisco.

Signed-off-by: Shaun Pereira <spereira@tusc.com.au>

(With a fix from Alexey Dobriyan <adobriyan@gmail.com>)
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/x25.h | 10 ++++++++
 include/net/x25.h   |  3 +--
 net/x25/af_x25.c    | 73 +++++++++++++++++++++++++++++++++++------------------
 net/x25/x25_subr.c  | 18 -------------
 4 files changed, 59 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/x25.h b/include/linux/x25.h
index 7531cfed5885..6f43b3d20248 100644
--- a/include/linux/x25.h
+++ b/include/linux/x25.h
@@ -4,6 +4,8 @@
  * 	History
  *	mar/20/00	Daniela Squassoni Disabling/enabling of facilities 
  *					  negotiation.
+ *	apr/02/05	Shaun Pereira Selective sub address matching with
+ *					call user data
  */
 
 #ifndef	X25_KERNEL_H
@@ -16,6 +18,7 @@
 #define	SIOCX25GCALLUSERDATA	(SIOCPROTOPRIVATE + 4)
 #define	SIOCX25SCALLUSERDATA	(SIOCPROTOPRIVATE + 5)
 #define	SIOCX25GCAUSEDIAG	(SIOCPROTOPRIVATE + 6)
+#define SIOCX25SCUDMATCHLEN	(SIOCPROTOPRIVATE + 7)
 
 /*
  *	Values for {get,set}sockopt.
@@ -109,4 +112,11 @@ struct x25_causediag {
 	unsigned char	diagnostic;
 };
 
+/*
+ *	Further optional call user data match length selection
+ */
+struct x25_subaddr {
+	unsigned int cudmatchlength;
+};
+
 #endif
diff --git a/include/net/x25.h b/include/net/x25.h
index 7a1ba5bbb868..9dd70dd4a9b7 100644
--- a/include/net/x25.h
+++ b/include/net/x25.h
@@ -134,7 +134,7 @@ struct x25_sock {
 	struct sock		sk;
 	struct x25_address	source_addr, dest_addr;
 	struct x25_neigh	*neighbour;
-	unsigned int		lci;
+	unsigned int		lci, cudmatchlength;
 	unsigned char		state, condition, qbitincl, intflag;
 	unsigned short		vs, vr, va, vl;
 	unsigned long		t2, t21, t22, t23;
@@ -242,7 +242,6 @@ extern int  x25_validate_nr(struct sock *, unsigned short);
 extern void x25_write_internal(struct sock *, int);
 extern int  x25_decode(struct sock *, struct sk_buff *, int *, int *, int *, int *, int *);
 extern void x25_disconnect(struct sock *, int, unsigned char, unsigned char);
-extern int x25_check_calluserdata(struct x25_calluserdata *,struct x25_calluserdata *);
 
 /* x25_timer.c */
 extern void x25_start_heartbeat(struct sock *);
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 2a24b243b841..e17d84a55d5e 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -29,6 +29,8 @@
  *	2000-11-14	Henner Eisen    Closing datalink from NETDEV_GOING_DOWN
  *	2002-10-06	Arnaldo C. Melo Get rid of cli/sti, move proc stuff to
  *					x25_proc.c, using seq_file
+ *	2005-04-02	Shaun Pereira	Selective sub address matching
+ *					with call user data
  */
 
 #include <linux/config.h>
@@ -219,7 +221,8 @@ static void x25_insert_socket(struct sock *sk)
  *	Note: if a listening socket has cud set it must only get calls
  *	with matching cud.
  */
-static struct sock *x25_find_listener(struct x25_address *addr, struct x25_calluserdata *calluserdata)
+static struct sock *x25_find_listener(struct x25_address *addr,
+					struct sk_buff *skb)
 {
 	struct sock *s;
 	struct sock *next_best;
@@ -230,22 +233,23 @@ static struct sock *x25_find_listener(struct x25_address *addr, struct x25_callu
 
 	sk_for_each(s, node, &x25_list)
 		if ((!strcmp(addr->x25_addr,
-			     x25_sk(s)->source_addr.x25_addr) ||
-		     !strcmp(addr->x25_addr,
-			     null_x25_address.x25_addr)) &&
-		     s->sk_state == TCP_LISTEN) {
-
+			x25_sk(s)->source_addr.x25_addr) ||
+				!strcmp(addr->x25_addr,
+					null_x25_address.x25_addr)) &&
+					s->sk_state == TCP_LISTEN) {
 			/*
 			 * Found a listening socket, now check the incoming
 			 * call user data vs this sockets call user data
 			 */
-			if (x25_check_calluserdata(&x25_sk(s)->calluserdata, calluserdata)) {
-				sock_hold(s);
-				goto found;
-			}
-			if (x25_sk(s)->calluserdata.cudlength == 0) {
+			if(skb->len > 0 && x25_sk(s)->cudmatchlength > 0) {
+			 	if((memcmp(x25_sk(s)->calluserdata.cuddata,
+			 		skb->data,
+					x25_sk(s)->cudmatchlength)) == 0) {
+					sock_hold(s);
+					goto found;
+				 }
+			} else
 				next_best = s;
-			}
 		}
 	if (next_best) {
 		s = next_best;
@@ -497,6 +501,7 @@ static int x25_create(struct socket *sock, int protocol)
 	x25->t23   = sysctl_x25_clear_request_timeout;
 	x25->t2    = sysctl_x25_ack_holdback_timeout;
 	x25->state = X25_STATE_0;
+	x25->cudmatchlength = 0;
 
 	x25->facilities.winsize_in  = X25_DEFAULT_WINDOW_SIZE;
 	x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE;
@@ -545,6 +550,7 @@ static struct sock *x25_make_new(struct sock *osk)
 	x25->t2         = ox25->t2;
 	x25->facilities = ox25->facilities;
 	x25->qbitincl   = ox25->qbitincl;
+	x25->cudmatchlength = ox25->cudmatchlength;
 
 	x25_init_timers(sk);
 out:
@@ -822,7 +828,6 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 	struct x25_sock *makex25;
 	struct x25_address source_addr, dest_addr;
 	struct x25_facilities facilities;
-	struct x25_calluserdata calluserdata;
 	int len, rc;
 
 	/*
@@ -844,20 +849,11 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 	len = skb->data[0] + 1;
 	skb_pull(skb,len);
 
-	/*
-	 *	Incoming Call User Data.
-	 */
-	if (skb->len >= 0) {
-		memcpy(calluserdata.cuddata, skb->data, skb->len);
-		calluserdata.cudlength = skb->len;
-	}
-
-	skb_push(skb,len);
-
 	/*
 	 *	Find a listener for the particular address/cud pair.
 	 */
-	sk = x25_find_listener(&source_addr,&calluserdata);
+	sk = x25_find_listener(&source_addr,skb);
+	skb_push(skb,len);
 
 	/*
 	 *	We can't accept the Call Request.
@@ -900,12 +896,22 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 	makex25->neighbour     = nb;
 	makex25->facilities    = facilities;
 	makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask;
-	makex25->calluserdata  = calluserdata;
+	/* ensure no reverse facil on accept */
+	makex25->vc_facil_mask &= ~X25_MASK_REVERSE;
+	makex25->cudmatchlength = x25_sk(sk)->cudmatchlength;
 
 	x25_write_internal(make, X25_CALL_ACCEPTED);
 
 	makex25->state = X25_STATE_3;
 
+	/*
+	 *	Incoming Call User Data.
+	 */
+	if (skb->len >= 0) {
+		memcpy(makex25->calluserdata.cuddata, skb->data, skb->len);
+		makex25->calluserdata.cudlength = skb->len;
+	}
+
 	sk->sk_ack_backlog++;
 
 	x25_insert_socket(make);
@@ -1325,6 +1331,23 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 			break;
 		}
 
+		case SIOCX25SCUDMATCHLEN: {
+			struct x25_subaddr sub_addr;
+			rc = -EINVAL;
+			if(sk->sk_state != TCP_CLOSE)
+				break;
+			rc = -EFAULT;
+			if (copy_from_user(&sub_addr, argp,
+					sizeof(sub_addr)))
+				break;
+		 	rc = -EINVAL;
+			if(sub_addr.cudmatchlength > X25_MAX_CUD_LEN)
+				break;
+			x25->cudmatchlength = sub_addr.cudmatchlength;
+			rc = 0;
+			break;
+		}
+
  		default:
 			rc = dev_ioctl(cmd, argp);
 			break;
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index 183fea3bba67..c349bbd61684 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -354,21 +354,3 @@ void x25_check_rbuf(struct sock *sk)
 	}
 }
 
-/*
- * Compare 2 calluserdata structures, used to find correct listening sockets
- * when call user data is used.
- */
-int x25_check_calluserdata(struct x25_calluserdata *ours, struct x25_calluserdata *theirs)
-{
-	int i;
-	if (ours->cudlength != theirs->cudlength)
-		return 0;
-
-	for (i=0;i<ours->cudlength;i++) {
-		if (ours->cuddata[i] != theirs->cuddata[i]) {
-			return 0;
-		}
-	}
-	return 1;
-}
-
-- 
cgit v1.2.3-59-g8ed1b


From ebc3f64b864fc16a594c2e63bf55a55c7d42084b Mon Sep 17 00:00:00 2001
From: Shaun Pereira <spereira@tusc.com.au>
Date: Wed, 22 Jun 2005 22:16:17 -0700
Subject: [X25]: Fast select with no restriction on response

This patch is a follow up to patch 1 regarding "Selective Sub Address
matching with call user data".  It allows use of the Fast-Select-Acceptance
optional user facility for X.25.

This patch just implements fast select with no restriction on response
(NRR).  What this means (according to ITU-T Recommendation 10/96 section
6.16) is that if in an incoming call packet, the relevant facility bits are
set for fast-select-NRR, then the called DTE can issue a direct response to
the incoming packet using a call-accepted packet that contains
call-user-data.  This patch allows such a response.

The called DTE can also respond with a clear-request packet that contains
call-user-data.  However, this feature is currently not implemented by the
patch.

How is Fast Select Acceptance used?
By default, the system does not allow fast select acceptance (as before).
To enable a response to fast select acceptance,
after a listen socket is created and bound as follows
	socket(AF_X25, SOCK_SEQPACKET, 0);
	bind(call_soc, (struct sockaddr *)&locl_addr, sizeof(locl_addr));
but before a listen system call is made, the following ioctl should be used.
	ioctl(call_soc,SIOCX25CALLACCPTAPPRV);
Now the listen system call can be made
	listen(call_soc, 4);
After this, an incoming-call packet will be accepted, but no call-accepted
packet will be sent back until the following system call is made on the socket
that accepts the call
	ioctl(vc_soc,SIOCX25SENDCALLACCPT);
The network (or cisco xot router used for testing here) will allow the
application server's call-user-data in the call-accepted packet,
provided the call-request was made with Fast-select NRR.

Signed-off-by: Shaun Pereira <spereira@tusc.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/x25.h      |  2 ++
 include/net/x25.h        |  6 ++++--
 net/x25/af_x25.c         | 37 +++++++++++++++++++++++++++++++++----
 net/x25/x25_facilities.c | 34 +++++++++++++++++++++++++++++-----
 net/x25/x25_subr.c       | 23 ++++++++++++++++++-----
 5 files changed, 86 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/x25.h b/include/linux/x25.h
index 6f43b3d20248..16d44931afa0 100644
--- a/include/linux/x25.h
+++ b/include/linux/x25.h
@@ -19,6 +19,8 @@
 #define	SIOCX25SCALLUSERDATA	(SIOCPROTOPRIVATE + 5)
 #define	SIOCX25GCAUSEDIAG	(SIOCPROTOPRIVATE + 6)
 #define SIOCX25SCUDMATCHLEN	(SIOCPROTOPRIVATE + 7)
+#define SIOCX25CALLACCPTAPPRV   (SIOCPROTOPRIVATE + 8)
+#define SIOCX25SENDCALLACCPT    (SIOCPROTOPRIVATE + 9)
 
 /*
  *	Values for {get,set}sockopt.
diff --git a/include/net/x25.h b/include/net/x25.h
index 9dd70dd4a9b7..8b39b98876e8 100644
--- a/include/net/x25.h
+++ b/include/net/x25.h
@@ -79,6 +79,8 @@ enum {
 #define	X25_DEFAULT_PACKET_SIZE	X25_PS128		/* Default Packet Size */
 #define	X25_DEFAULT_THROUGHPUT	0x0A			/* Deafult Throughput */
 #define	X25_DEFAULT_REVERSE	0x00			/* Default Reverse Charging */
+#define X25_DENY_ACCPT_APPRV   0x01			/* Default value */
+#define X25_ALLOW_ACCPT_APPRV  0x00			/* Control enabled */
 
 #define X25_SMODULUS 		8
 #define	X25_EMODULUS		128
@@ -94,7 +96,7 @@ enum {
 #define	X25_FAC_CLASS_C		0x80
 #define	X25_FAC_CLASS_D		0xC0
 
-#define	X25_FAC_REVERSE		0x01
+#define	X25_FAC_REVERSE		0x01			/* also fast select */
 #define	X25_FAC_THROUGHPUT	0x02
 #define	X25_FAC_PACKET_SIZE	0x42
 #define	X25_FAC_WINDOW_SIZE	0x43
@@ -135,7 +137,7 @@ struct x25_sock {
 	struct x25_address	source_addr, dest_addr;
 	struct x25_neigh	*neighbour;
 	unsigned int		lci, cudmatchlength;
-	unsigned char		state, condition, qbitincl, intflag;
+	unsigned char		state, condition, qbitincl, intflag, accptapprv;
 	unsigned short		vs, vr, va, vl;
 	unsigned long		t2, t21, t22, t23;
 	unsigned short		fraglen;
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index e17d84a55d5e..04bec047fa9a 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -31,6 +31,8 @@
  *					x25_proc.c, using seq_file
  *	2005-04-02	Shaun Pereira	Selective sub address matching
  *					with call user data
+ *	2005-04-15	Shaun Pereira	Fast select with no restriction on
+ *					response
  */
 
 #include <linux/config.h>
@@ -502,6 +504,8 @@ static int x25_create(struct socket *sock, int protocol)
 	x25->t2    = sysctl_x25_ack_holdback_timeout;
 	x25->state = X25_STATE_0;
 	x25->cudmatchlength = 0;
+	x25->accptapprv = X25_DENY_ACCPT_APPRV;		/* normally no cud  */
+							/* on call accept   */
 
 	x25->facilities.winsize_in  = X25_DEFAULT_WINDOW_SIZE;
 	x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE;
@@ -551,6 +555,7 @@ static struct sock *x25_make_new(struct sock *osk)
 	x25->facilities = ox25->facilities;
 	x25->qbitincl   = ox25->qbitincl;
 	x25->cudmatchlength = ox25->cudmatchlength;
+	x25->accptapprv = ox25->accptapprv;
 
 	x25_init_timers(sk);
 out:
@@ -900,9 +905,11 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
 	makex25->vc_facil_mask &= ~X25_MASK_REVERSE;
 	makex25->cudmatchlength = x25_sk(sk)->cudmatchlength;
 
-	x25_write_internal(make, X25_CALL_ACCEPTED);
-
-	makex25->state = X25_STATE_3;
+	/* Normally all calls are accepted immediatly */
+	if(makex25->accptapprv & X25_DENY_ACCPT_APPRV) {
+		x25_write_internal(make, X25_CALL_ACCEPTED);
+		makex25->state = X25_STATE_3;
+	}
 
 	/*
 	 *	Incoming Call User Data.
@@ -1294,7 +1301,8 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 			if (facilities.throughput < 0x03 ||
 			    facilities.throughput > 0xDD)
 				break;
-			if (facilities.reverse && facilities.reverse != 1)
+			if (facilities.reverse &&
+				(facilities.reverse | 0x81)!= 0x81)
 				break;
 			x25->facilities = facilities;
 			rc = 0;
@@ -1348,6 +1356,27 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 			break;
 		}
 
+		case SIOCX25CALLACCPTAPPRV: {
+			rc = -EINVAL;
+			if (sk->sk_state != TCP_CLOSE)
+				break;
+			x25->accptapprv = X25_ALLOW_ACCPT_APPRV;
+			rc = 0;
+			break;
+		}
+
+		case SIOCX25SENDCALLACCPT:  {
+			rc = -EINVAL;
+			if (sk->sk_state != TCP_ESTABLISHED)
+				break;
+			if (x25->accptapprv)	/* must call accptapprv above */
+				break;
+			x25_write_internal(sk, X25_CALL_ACCEPTED);
+			x25->state = X25_STATE_3;
+			rc = 0;
+			break;
+		}
+
  		default:
 			rc = dev_ioctl(cmd, argp);
 			break;
diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c
index a21bdb95f9a8..54278b962f4c 100644
--- a/net/x25/x25_facilities.c
+++ b/net/x25/x25_facilities.c
@@ -17,6 +17,8 @@
  *	X.25 001	Split from x25_subr.c
  *	mar/20/00	Daniela Squassoni Disabling/enabling of facilities 
  *					  negotiation.
+ *	apr/14/05	Shaun Pereira - Allow fast select with no restriction
+ *					on response.
  */
 
 #include <linux/kernel.h>
@@ -43,9 +45,31 @@ int x25_parse_facilities(struct sk_buff *skb,
 		case X25_FAC_CLASS_A:
 			switch (*p) {
 			case X25_FAC_REVERSE:
-				facilities->reverse = p[1] & 0x01;
-				*vc_fac_mask |= X25_MASK_REVERSE;
-				break;
+				if((p[1] & 0x81) == 0x81) {
+					facilities->reverse = p[1] & 0x81;
+					*vc_fac_mask |= X25_MASK_REVERSE;
+					break;
+				}
+
+				if((p[1] & 0x01) == 0x01) {
+					facilities->reverse = p[1] & 0x01;
+					*vc_fac_mask |= X25_MASK_REVERSE;
+					break;
+				}
+
+				if((p[1] & 0x80) == 0x80) {
+					facilities->reverse = p[1] & 0x80;
+					*vc_fac_mask |= X25_MASK_REVERSE;
+					break;
+				}
+
+				if(p[1] == 0x00) {
+					facilities->reverse
+						= X25_DEFAULT_REVERSE;
+					*vc_fac_mask |= X25_MASK_REVERSE;
+					break;
+				}
+
 			case X25_FAC_THROUGHPUT:
 				facilities->throughput = p[1];
 				*vc_fac_mask |= X25_MASK_THROUGHPUT;
@@ -122,7 +146,7 @@ int x25_create_facilities(unsigned char *buffer,
 
 	if (facilities->reverse && (facil_mask & X25_MASK_REVERSE)) {
 		*p++ = X25_FAC_REVERSE;
-		*p++ = !!facilities->reverse;
+		*p++ = facilities->reverse;
 	}
 
 	if (facilities->throughput && (facil_mask & X25_MASK_THROUGHPUT)) {
@@ -171,7 +195,7 @@ int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk,
 	/*
 	 *	They want reverse charging, we won't accept it.
 	 */
-	if (theirs.reverse && ours->reverse) {
+	if ((theirs.reverse & 0x01 ) && (ours->reverse & 0x01)) {
 		SOCK_DEBUG(sk, "X.25: rejecting reverse charging request");
 		return -1;
 	}
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index c349bbd61684..7fd872ad0c20 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -19,6 +19,8 @@
  *	mar/20/00	Daniela Squassoni Disabling/enabling of facilities
  *					  negotiation.
  *	jun/24/01	Arnaldo C. Melo	  use skb_queue_purge, cleanups
+ *	apr/04/15	Shaun Pereira		Fast select with no
+ *						restriction on response.
  */
 
 #include <linux/kernel.h>
@@ -127,8 +129,12 @@ void x25_write_internal(struct sock *sk, int frametype)
 			len += 1 + X25_ADDR_LEN + X25_MAX_FAC_LEN +
 			       X25_MAX_CUD_LEN;
 			break;
-		case X25_CALL_ACCEPTED:
-			len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN;
+		case X25_CALL_ACCEPTED: /* fast sel with no restr on resp */
+			if(x25->facilities.reverse & 0x80) {
+				len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN;
+			} else {
+				len += 1 + X25_MAX_FAC_LEN;
+			}
 			break;
 		case X25_CLEAR_REQUEST:
 		case X25_RESET_REQUEST:
@@ -203,9 +209,16 @@ void x25_write_internal(struct sock *sk, int frametype)
 							x25->vc_facil_mask);
 			dptr    = skb_put(skb, len);
 			memcpy(dptr, facilities, len);
-			dptr = skb_put(skb, x25->calluserdata.cudlength);
-			memcpy(dptr, x25->calluserdata.cuddata,
-			       x25->calluserdata.cudlength);
+
+			/* fast select with no restriction on response
+				allows call user data. Userland must
+				ensure it is ours and not theirs */
+			if(x25->facilities.reverse & 0x80) {
+				dptr = skb_put(skb,
+					x25->calluserdata.cudlength);
+				memcpy(dptr, x25->calluserdata.cuddata,
+				       x25->calluserdata.cudlength);
+			}
 			x25->calluserdata.cudlength = 0;
 			break;
 
-- 
cgit v1.2.3-59-g8ed1b


From 408fde81c1bff15c875a3618481e93a01dcc79ea Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Thu, 23 Jun 2005 00:07:37 -0700
Subject: [PATCH] remove non-DISCONTIG use of pgdat->node_mem_map

This patch effectively eliminates direct use of pgdat->node_mem_map outside
of the DISCONTIG code.  On a flat memory system, these fields aren't
currently used, neither are they on a sparsemem system.

There was also a node_mem_map(nid) macro on many architectures.  Its use
along with the use of ->node_mem_map itself was not consistent.  It has
been removed in favor of two new, more explicit, arch-independent macros:

	pgdat_page_nr(pgdat, pagenr)
	nid_page_nr(nid, pagenr)

I called them "pgdat" and "nid" because we overload the term "node" to mean
"NUMA node", "DISCONTIG node" or "pg_data_t" in very confusing ways.  I
believe the newer names are much clearer.

These macros can be overridden in the sparsemem case with a theoretically
slower operation using node_start_pfn and pfn_to_page(), instead.  We could
make this the only behavior if people want, but I don't want to change too
much at once.  One thing at a time.

This patch removes more code than it adds.

Compile tested on alpha, alpha discontig, arm, arm-discontig, i386, i386
generic, NUMAQ, Summit, ppc64, ppc64 discontig, and x86_64.  Full list
here: http://sr71.net/patches/2.6.12/2.6.12-rc1-mhp2/configs/

Boot tested on NUMAQ, x86 SMP and ppc64 power4/5 LPARs.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin J. Bligh <mbligh@aracnet.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/mm/numa.c             | 16 +++++++---------
 arch/i386/mm/pgtable.c           |  2 +-
 arch/ia64/mm/discontig.c         |  9 +++++----
 arch/m32r/mm/init.c              |  4 ++--
 arch/mips/sgi-ip27/ip27-memory.c |  5 ++---
 arch/parisc/mm/init.c            |  2 +-
 arch/ppc64/mm/init.c             |  4 ++--
 include/asm-alpha/mmzone.h       |  3 +--
 include/asm-i386/mmzone.h        |  3 +--
 include/asm-m32r/mmzone.h        |  3 +--
 include/asm-parisc/mmzone.h      |  3 +--
 include/asm-ppc64/mmzone.h       |  3 +--
 include/asm-x86_64/mmzone.h      |  5 +----
 include/linux/mmzone.h           |  2 ++
 14 files changed, 28 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index ba81c4422aaf..c7481d59b6df 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -327,8 +327,6 @@ void __init mem_init(void)
 	extern char _text, _etext, _data, _edata;
 	extern char __init_begin, __init_end;
 	unsigned long nid, i;
-	struct page * lmem_map;
-
 	high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT);
 
 	reservedpages = 0;
@@ -338,10 +336,10 @@ void __init mem_init(void)
 		 */
 		totalram_pages += free_all_bootmem_node(NODE_DATA(nid));
 
-		lmem_map = node_mem_map(nid);
 		pfn = NODE_DATA(nid)->node_start_pfn;
 		for (i = 0; i < node_spanned_pages(nid); i++, pfn++)
-			if (page_is_ram(pfn) && PageReserved(lmem_map+i))
+			if (page_is_ram(pfn) &&
+			    PageReserved(nid_page_nr(nid, i)))
 				reservedpages++;
 	}
 
@@ -373,18 +371,18 @@ show_mem(void)
 	show_free_areas();
 	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_online_node(nid) {
-		struct page * lmem_map = node_mem_map(nid);
 		i = node_spanned_pages(nid);
 		while (i-- > 0) {
+			struct page *page = nid_page_nr(nid, i);
 			total++;
-			if (PageReserved(lmem_map+i))
+			if (PageReserved(page))
 				reserved++;
-			else if (PageSwapCache(lmem_map+i))
+			else if (PageSwapCache(page))
 				cached++;
-			else if (!page_count(lmem_map+i))
+			else if (!page_count(page))
 				free++;
 			else
-				shared += page_count(lmem_map + i) - 1;
+				shared += page_count(page) - 1;
 		}
 	}
 	printk("%ld pages of RAM\n",total);
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index dd81479ff88a..80c84cdf22ef 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -36,7 +36,7 @@ void show_mem(void)
 	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_pgdat(pgdat) {
 		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
-			page = pgdat->node_mem_map + i;
+			page = pgdat_page_nr(pgdat, i);
 			total++;
 			if (PageHighMem(page))
 				highmem++;
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index c00710929390..f3fd528ead3b 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -560,14 +560,15 @@ void show_mem(void)
 		int shared = 0, cached = 0, reserved = 0;
 		printk("Node ID: %d\n", pgdat->node_id);
 		for(i = 0; i < pgdat->node_spanned_pages; i++) {
+			struct page *page = pgdat_page_nr(pgdat, i);
 			if (!ia64_pfn_valid(pgdat->node_start_pfn+i))
 				continue;
-			if (PageReserved(pgdat->node_mem_map+i))
+			if (PageReserved(page))
 				reserved++;
-			else if (PageSwapCache(pgdat->node_mem_map+i))
+			else if (PageSwapCache(page))
 				cached++;
-			else if (page_count(pgdat->node_mem_map+i))
-				shared += page_count(pgdat->node_mem_map+i)-1;
+			else if (page_count(page))
+				shared += page_count(page)-1;
 		}
 		total_present += present;
 		total_reserved += reserved;
diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c
index bc423d838fb8..d9a40b1fe8ba 100644
--- a/arch/m32r/mm/init.c
+++ b/arch/m32r/mm/init.c
@@ -49,7 +49,7 @@ void show_mem(void)
 	printk("Free swap:       %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_pgdat(pgdat) {
 		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
-			page = pgdat->node_mem_map + i;
+			page = pgdat_page_nr(pgdat, i);
 			total++;
 			if (PageHighMem(page))
 				highmem++;
@@ -152,7 +152,7 @@ int __init reservedpages_count(void)
 	reservedpages = 0;
 	for_each_online_node(nid)
 		for (i = 0 ; i < MAX_LOW_PFN(nid) - START_PFN(nid) ; i++)
-			if (PageReserved(NODE_DATA(nid)->node_mem_map + i))
+			if (PageReserved(nid_page_nr(nid, i)))
 				reservedpages++;
 
 	return reservedpages;
diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index 0a44a98d7adc..a160d04f7dbe 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -549,9 +549,8 @@ void __init mem_init(void)
 		 */
 		numslots = node_getlastslot(node);
 		for (slot = 1; slot <= numslots; slot++) {
-			p = NODE_DATA(node)->node_mem_map +
-				(slot_getbasepfn(node, slot) -
-				 slot_getbasepfn(node, 0));
+			p = nid_page_nr(node, slot_getbasepfn(node, slot) -
+					      slot_getbasepfn(node, 0));
 
 			/*
 			 * Free valid memory in current slot.
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index cac37589e35c..2886ad70db48 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -506,7 +506,7 @@ void show_mem(void)
 		for (j = node_start_pfn(i); j < node_end_pfn(i); j++) {
 			struct page *p;
 
-			p = node_mem_map(i) + j - node_start_pfn(i);
+			p = nid_page_nr(i, j) - node_start_pfn(i);
 
 			total++;
 			if (PageReserved(p))
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
index 6fa1e6490b57..29dbe084c21f 100644
--- a/arch/ppc64/mm/init.c
+++ b/arch/ppc64/mm/init.c
@@ -98,7 +98,7 @@ void show_mem(void)
 	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
 	for_each_pgdat(pgdat) {
 		for (i = 0; i < pgdat->node_spanned_pages; i++) {
-			page = pgdat->node_mem_map + i;
+			page = pgdat_page_nr(pgdat, i);
 			total++;
 			if (PageReserved(page))
 				reserved++;
@@ -654,7 +654,7 @@ void __init mem_init(void)
 
 	for_each_pgdat(pgdat) {
 		for (i = 0; i < pgdat->node_spanned_pages; i++) {
-			page = pgdat->node_mem_map + i;
+			page = pgdat_page_nr(pgdat, i);
 			if (PageReserved(page))
 				reservedpages++;
 		}
diff --git a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h
index 726c150dcbe4..a011ef4cf3d3 100644
--- a/include/asm-alpha/mmzone.h
+++ b/include/asm-alpha/mmzone.h
@@ -57,7 +57,6 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n)
  * Given a kernel address, find the home node of the underlying memory.
  */
 #define kvaddr_to_nid(kaddr)	pa_to_nid(__pa(kaddr))
-#define node_mem_map(nid)	(NODE_DATA(nid)->node_mem_map)
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 
 #define local_mapnr(kvaddr) \
@@ -108,7 +107,7 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n)
 #define pfn_to_page(pfn)						\
 ({									\
  	unsigned long kaddr = (unsigned long)__va((pfn) << PAGE_SHIFT);	\
-	(node_mem_map(kvaddr_to_nid(kaddr)) + local_mapnr(kaddr));	\
+	(NODE_DATA(kvaddr_to_nid(kaddr))->node_mem_map + local_mapnr(kaddr));	\
 })
 
 #define page_to_pfn(page)						\
diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
index 13830ae67cac..9cec191f462c 100644
--- a/include/asm-i386/mmzone.h
+++ b/include/asm-i386/mmzone.h
@@ -79,7 +79,6 @@ static inline int pfn_to_nid(unsigned long pfn)
  */
 #define kvaddr_to_nid(kaddr)	pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
 
-#define node_mem_map(nid)	(NODE_DATA(nid)->node_mem_map)
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)						\
 ({									\
@@ -100,7 +99,7 @@ static inline int pfn_to_nid(unsigned long pfn)
 ({									\
 	unsigned long __pfn = pfn;					\
 	int __node  = pfn_to_nid(__pfn);				\
-	&node_mem_map(__node)[node_localnr(__pfn,__node)];		\
+	&NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)];	\
 })
 
 #define page_to_pfn(pg)							\
diff --git a/include/asm-m32r/mmzone.h b/include/asm-m32r/mmzone.h
index ebf0228fec42..d58878ec899e 100644
--- a/include/asm-m32r/mmzone.h
+++ b/include/asm-m32r/mmzone.h
@@ -14,7 +14,6 @@ extern struct pglist_data *node_data[];
 #define NODE_DATA(nid)		(node_data[nid])
 
 #define node_localnr(pfn, nid)	((pfn) - NODE_DATA(nid)->node_start_pfn)
-#define node_mem_map(nid)	(NODE_DATA(nid)->node_mem_map)
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)						\
 ({									\
@@ -32,7 +31,7 @@ extern struct pglist_data *node_data[];
 ({									\
 	unsigned long __pfn = pfn;					\
 	int __node  = pfn_to_nid(__pfn);				\
-	&node_mem_map(__node)[node_localnr(__pfn,__node)];		\
+	&NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)];	\
 })
 
 #define page_to_pfn(pg)							\
diff --git a/include/asm-parisc/mmzone.h b/include/asm-parisc/mmzone.h
index 928bf50c4693..595d3dce120a 100644
--- a/include/asm-parisc/mmzone.h
+++ b/include/asm-parisc/mmzone.h
@@ -19,7 +19,6 @@ extern struct node_map_data node_data[];
  */
 #define kvaddr_to_nid(kaddr)	pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
 
-#define node_mem_map(nid)	(NODE_DATA(nid)->node_mem_map)
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)						\
 ({									\
@@ -38,7 +37,7 @@ extern struct node_map_data node_data[];
 ({									\
 	unsigned long __pfn = (pfn);					\
 	int __node  = pfn_to_nid(__pfn);				\
-	&node_mem_map(__node)[node_localnr(__pfn,__node)];		\
+	&NODE_DATA(__node)->node_mem_map[node_localnr(__pfn,__node)];	\
 })
 
 #define page_to_pfn(pg)							\
diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h
index 0619a41a3c9d..cbfc5ecfe875 100644
--- a/include/asm-ppc64/mmzone.h
+++ b/include/asm-ppc64/mmzone.h
@@ -65,7 +65,6 @@ static inline int pa_to_nid(unsigned long pa)
  */
 #define kvaddr_to_nid(kaddr)	pa_to_nid(__pa(kaddr))
 
-#define node_mem_map(nid)	(NODE_DATA(nid)->node_mem_map)
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)	(NODE_DATA(nid)->node_end_pfn)
 
@@ -76,7 +75,7 @@ static inline int pa_to_nid(unsigned long pa)
 #define discontigmem_pfn_to_page(pfn) \
 ({ \
 	unsigned long __tmp = pfn; \
-	(node_mem_map(pfn_to_nid(__tmp)) + \
+	(NODE_DATA(pfn_to_nid(__tmp))->node_mem_map + \
 	 node_localnr(__tmp, pfn_to_nid(__tmp))); \
 })
 
diff --git a/include/asm-x86_64/mmzone.h b/include/asm-x86_64/mmzone.h
index d95b7c240831..ca4fc3fe0dee 100644
--- a/include/asm-x86_64/mmzone.h
+++ b/include/asm-x86_64/mmzone.h
@@ -35,9 +35,6 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
 #define kvaddr_to_nid(kaddr)	phys_to_nid(__pa(kaddr))
 #define NODE_DATA(nid)		(node_data[nid])
 
-#define node_mem_map(nid)	(NODE_DATA(nid)->node_mem_map)
-
-#define node_mem_map(nid)	(NODE_DATA(nid)->node_mem_map)
 #define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
 #define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn + \
 				 NODE_DATA(nid)->node_spanned_pages)
@@ -50,7 +47,7 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
    (2.4 used to). */
 #define pfn_to_page(pfn) ({ \
 	int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT); 	\
-	((pfn) - node_start_pfn(nid)) + node_mem_map(nid);		\
+	((pfn) - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map;	\
 })
 
 #define page_to_pfn(page) \
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4733d35d8223..b79633d3a97b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -284,6 +284,8 @@ typedef struct pglist_data {
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
 #define node_spanned_pages(nid)	(NODE_DATA(nid)->node_spanned_pages)
+#define pgdat_page_nr(pgdat, pagenr)	((pgdat)->node_mem_map + (pagenr))
+#define nid_page_nr(nid, pagenr) 	pgdat_page_nr(NODE_DATA(nid),(pagenr))
 
 extern struct pglist_data *pgdat_list;
 
-- 
cgit v1.2.3-59-g8ed1b


From 6f167ec721108c9282d54424516a12c805e3c306 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Thu, 23 Jun 2005 00:07:39 -0700
Subject: [PATCH] sparsemem base: simple NUMA remap space allocator

Introduce a simple allocator for the NUMA remap space.  This space is very
scarce, used for structures which are best allocated node local.

This mechanism is also used on non-NUMA ia64 systems with a vmem_map to keep
the pgdat->node_mem_map initialized in a consistent place for all
architectures.

Issues:
o alloc_remap takes a node_id where we might expect a pgdat.  This was intended
  to allow us to allocate the pgdats using this mechanism, which we do not yet
  do.  Could have alloc_remap_node() and alloc_remap_nid() for this purpose.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig        |  5 ++++
 arch/i386/mm/discontig.c | 59 +++++++++++++++++++++++++-----------------------
 include/linux/bootmem.h  |  9 ++++++++
 mm/page_alloc.c          |  6 ++++-
 4 files changed, 50 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index dfd904f6883b..35ca3a17ed20 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -803,6 +803,11 @@ config NEED_NODE_MEMMAP_SIZE
 	depends on DISCONTIGMEM
 	default y
 
+config HAVE_ARCH_ALLOC_REMAP
+	bool
+	depends on NUMA
+	default y
+
 config HIGHPTE
 	bool "Allocate 3rd-level pagetables from highmem"
 	depends on HIGHMEM4G || HIGHMEM64G
diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index 85d2fcbe1079..dcc71f969b01 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -108,6 +108,9 @@ unsigned long node_remap_offset[MAX_NUMNODES];
 void *node_remap_start_vaddr[MAX_NUMNODES];
 void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
+void *node_remap_end_vaddr[MAX_NUMNODES];
+void *node_remap_alloc_vaddr[MAX_NUMNODES];
+
 /*
  * FLAT - support for basic PC memory model with discontig enabled, essentially
  *        a single node with all available processors in it with a flat
@@ -178,6 +181,21 @@ static void __init allocate_pgdat(int nid)
 	}
 }
 
+void *alloc_remap(int nid, unsigned long size)
+{
+	void *allocation = node_remap_alloc_vaddr[nid];
+
+	size = ALIGN(size, L1_CACHE_BYTES);
+
+	if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid])
+		return 0;
+
+	node_remap_alloc_vaddr[nid] += size;
+	memset(allocation, 0, size);
+
+	return allocation;
+}
+
 void __init remap_numa_kva(void)
 {
 	void *vaddr;
@@ -185,8 +203,6 @@ void __init remap_numa_kva(void)
 	int node;
 
 	for_each_online_node(node) {
-		if (node == 0)
-			continue;
 		for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
 			vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
 			set_pmd_pfn((ulong) vaddr, 
@@ -202,11 +218,6 @@ static unsigned long calculate_numa_remap_pages(void)
 	unsigned long size, reserve_pages = 0;
 
 	for_each_online_node(nid) {
-		if (nid == 0)
-			continue;
-		if (!node_remap_size[nid])
-			continue;
-
 		/*
 		 * The acpi/srat node info can show hot-add memroy zones
 		 * where memory could be added but not currently present.
@@ -226,8 +237,8 @@ static unsigned long calculate_numa_remap_pages(void)
 		printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
 				size, nid);
 		node_remap_size[nid] = size;
-		reserve_pages += size;
 		node_remap_offset[nid] = reserve_pages;
+		reserve_pages += size;
 		printk("Shrinking node %d from %ld pages to %ld pages\n",
 			nid, node_end_pfn[nid], node_end_pfn[nid] - size);
 		node_end_pfn[nid] -= size;
@@ -280,12 +291,18 @@ unsigned long __init setup_memory(void)
 			(ulong) pfn_to_kaddr(max_low_pfn));
 	for_each_online_node(nid) {
 		node_remap_start_vaddr[nid] = pfn_to_kaddr(
-			(highstart_pfn + reserve_pages) - node_remap_offset[nid]);
+				highstart_pfn + node_remap_offset[nid]);
+		/* Init the node remap allocator */
+		node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
+			(node_remap_size[nid] * PAGE_SIZE);
+		node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
+			ALIGN(sizeof(pg_data_t), PAGE_SIZE);
+
 		allocate_pgdat(nid);
 		printk ("node %d will remap to vaddr %08lx - %08lx\n", nid,
 			(ulong) node_remap_start_vaddr[nid],
-			(ulong) pfn_to_kaddr(highstart_pfn + reserve_pages
-			    - node_remap_offset[nid] + node_remap_size[nid]));
+			(ulong) pfn_to_kaddr(highstart_pfn
+			   + node_remap_offset[nid] + node_remap_size[nid]));
 	}
 	printk("High memory starts at vaddr %08lx\n",
 			(ulong) pfn_to_kaddr(highstart_pfn));
@@ -348,23 +365,9 @@ void __init zone_sizes_init(void)
 		}
 
 		zholes_size = get_zholes_size(nid);
-		/*
-		 * We let the lmem_map for node 0 be allocated from the
-		 * normal bootmem allocator, but other nodes come from the
-		 * remapped KVA area - mbligh
-		 */
-		if (!nid)
-			free_area_init_node(nid, NODE_DATA(nid),
-					zones_size, start, zholes_size);
-		else {
-			unsigned long lmem_map;
-			lmem_map = (unsigned long)node_remap_start_vaddr[nid];
-			lmem_map += sizeof(pg_data_t) + PAGE_SIZE - 1;
-			lmem_map &= PAGE_MASK;
-			NODE_DATA(nid)->node_mem_map = (struct page *)lmem_map;
-			free_area_init_node(nid, NODE_DATA(nid), zones_size,
-				start, zholes_size);
-		}
+
+		free_area_init_node(nid, NODE_DATA(nid), zones_size, start,
+				zholes_size);
 	}
 	return;
 }
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 0dd8ca1a3d5a..500f451ce0c0 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -67,6 +67,15 @@ extern void * __init __alloc_bootmem_node (pg_data_t *pgdat, unsigned long size,
 	__alloc_bootmem_node((pgdat), (x), PAGE_SIZE, 0)
 #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
 
+#ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
+extern void *alloc_remap(int nid, unsigned long size);
+#else
+static inline void *alloc_remap(int nid, unsigned long size)
+{
+	return NULL;
+}
+#endif
+
 extern unsigned long __initdata nr_kernel_pages;
 extern unsigned long __initdata nr_all_pages;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 559336de9687..bf1dd8819097 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1936,6 +1936,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 {
 	unsigned long size;
+	struct page *map;
 
 	/* Skip empty nodes */
 	if (!pgdat->node_spanned_pages)
@@ -1944,7 +1945,10 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 	/* ia64 gets its own node_mem_map, before this, without bootmem */
 	if (!pgdat->node_mem_map) {
 		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
-		pgdat->node_mem_map = alloc_bootmem_node(pgdat, size);
+		map = alloc_remap(pgdat->node_id, size);
+		if (!map)
+			map = alloc_bootmem_node(pgdat, size);
+		pgdat->node_mem_map = map;
 	}
 #ifndef CONFIG_DISCONTIGMEM
 	/*
-- 
cgit v1.2.3-59-g8ed1b


From 348f8b6c4837a07304d2f72b11ce8d96588065e0 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Thu, 23 Jun 2005 00:07:40 -0700
Subject: [PATCH] sparsemem base: reorganize page->flags bit operations

Generify the value fields in the page_flags.  The aim is to allow the location
and size of these fields to be varied.  Additionally we want to move away from
fixed allocations per field whilst still enforcing the overall bit utilisation
limits.  We rely on the compiler to spot and optimise the accessor functions.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h     | 53 +++++++++++++++++++++++++++++++++++++++++---------
 include/linux/mmzone.h | 19 +++++++-----------
 mm/page_alloc.c        |  2 +-
 3 files changed, 52 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1813b162b0a8..57b2ead51dba 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -395,19 +395,41 @@ static inline void put_page(struct page *page)
 /*
  * The zone field is never updated after free_area_init_core()
  * sets it, so none of the operations on it need to be atomic.
- * We'll have up to (MAX_NUMNODES * MAX_NR_ZONES) zones total,
- * so we use (MAX_NODES_SHIFT + MAX_ZONES_SHIFT) here to get enough bits.
  */
-#define NODEZONE_SHIFT (sizeof(page_flags_t)*8 - MAX_NODES_SHIFT - MAX_ZONES_SHIFT)
+
+/* Page flags: | NODE | ZONE | ... | FLAGS | */
+#define NODES_PGOFF		((sizeof(page_flags_t)*8) - NODES_SHIFT)
+#define ZONES_PGOFF		(NODES_PGOFF - ZONES_SHIFT)
+
+/*
+ * Define the bit shifts to access each section.  For non-existant
+ * sections we define the shift as 0; that plus a 0 mask ensures
+ * the compiler will optimise away reference to them.
+ */
+#define NODES_PGSHIFT		(NODES_PGOFF * (NODES_SHIFT != 0))
+#define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_SHIFT != 0))
+
+/* NODE:ZONE is used to lookup the zone from a page. */
+#define ZONETABLE_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
+#define ZONETABLE_PGSHIFT	ZONES_PGSHIFT
+
+#if NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED
+#error NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED
+#endif
+
 #define NODEZONE(node, zone)	((node << ZONES_SHIFT) | zone)
 
+#define ZONES_MASK		((1UL << ZONES_SHIFT) - 1)
+#define NODES_MASK		((1UL << NODES_SHIFT) - 1)
+#define ZONETABLE_MASK		((1UL << ZONETABLE_SHIFT) - 1)
+
 static inline unsigned long page_zonenum(struct page *page)
 {
-	return (page->flags >> NODEZONE_SHIFT) & (~(~0UL << ZONES_SHIFT));
+	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
 static inline unsigned long page_to_nid(struct page *page)
 {
-	return (page->flags >> (NODEZONE_SHIFT + ZONES_SHIFT));
+	return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
 }
 
 struct zone;
@@ -415,13 +437,26 @@ extern struct zone *zone_table[];
 
 static inline struct zone *page_zone(struct page *page)
 {
-	return zone_table[page->flags >> NODEZONE_SHIFT];
+	return zone_table[(page->flags >> ZONETABLE_PGSHIFT) &
+			ZONETABLE_MASK];
+}
+
+static inline void set_page_zone(struct page *page, unsigned long zone)
+{
+	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
+	page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
+}
+static inline void set_page_node(struct page *page, unsigned long node)
+{
+	page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
+	page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
 }
 
-static inline void set_page_zone(struct page *page, unsigned long nodezone_num)
+static inline void set_page_links(struct page *page, unsigned long zone,
+	unsigned long node)
 {
-	page->flags &= ~(~0UL << NODEZONE_SHIFT);
-	page->flags |= nodezone_num << NODEZONE_SHIFT;
+	set_page_zone(page, zone);
+	set_page_node(page, node);
 }
 
 #ifndef CONFIG_DISCONTIGMEM
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b79633d3a97b..39e912708e2a 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -414,30 +414,25 @@ extern struct pglist_data contig_page_data;
 
 #include <asm/mmzone.h>
 
+#endif /* !CONFIG_DISCONTIGMEM */
+
 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
 /*
  * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
  * there are 3 zones (2 bits) and this leaves 8-2=6 bits for nodes.
  */
-#define MAX_NODES_SHIFT		6
+#define FLAGS_RESERVED		8
+
 #elif BITS_PER_LONG == 64
 /*
  * with 64 bit flags field, there's plenty of room.
  */
-#define MAX_NODES_SHIFT		10
-#endif
+#define FLAGS_RESERVED		32
 
-#endif /* !CONFIG_DISCONTIGMEM */
-
-#if NODES_SHIFT > MAX_NODES_SHIFT
-#error NODES_SHIFT > MAX_NODES_SHIFT
-#endif
+#else
 
-/* There are currently 3 zones: DMA, Normal & Highmem, thus we need 2 bits */
-#define MAX_ZONES_SHIFT		2
+#error BITS_PER_LONG not defined
 
-#if ZONES_SHIFT > MAX_ZONES_SHIFT
-#error ZONES_SHIFT > MAX_ZONES_SHIFT
 #endif
 
 #endif /* !__ASSEMBLY__ */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bf1dd8819097..1958358e29b0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1653,7 +1653,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 	struct page *page;
 
 	for (page = start; page < (start + size); page++) {
-		set_page_zone(page, NODEZONE(nid, zone));
+		set_page_links(page, zone, nid);
 		set_page_count(page, 0);
 		reset_page_mapcount(page);
 		SetPageReserved(page);
-- 
cgit v1.2.3-59-g8ed1b


From 93b7504e3e6c1d98586854806e51bea329ea3aa9 Mon Sep 17 00:00:00 2001
From: Dave Hansen <haveblue@us.ibm.com>
Date: Thu, 23 Jun 2005 00:07:47 -0700
Subject: [PATCH] Introduce new Kconfig option for NUMA or DISCONTIG

There is some confusion that arose when working on the SPARSEMEM patch between
what is needed for DISCONTIG vs. NUMA.

Multiple pg_data_t's are needed for DISCONTIGMEM or NUMA, independently.
All of the current NUMA implementations require an implementation of
DISCONTIG.  Because of this, quite a lot of code which is really needed for
NUMA is actually under DISCONTIG #ifdefs.  For SPARSEMEM, we changed some
of these #ifdefs to CONFIG_NUMA, but that broke the DISCONTIG=y and NUMA=n
case.

Introducing this new NEED_MULTIPLE_NODES config option allows code that is
needed for both NUMA or DISCONTIG to be separated out from code that is
specific to DISCONTIG.

One great advantage of this approach is that it doesn't require every
architecture to be converted over.  All of the current implementations
should "just work", only the ones implementing SPARSEMEM will have to be
fixed up.

The change to free_area_init() makes it work inside, or out of the new
config option.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 6 +++---
 mm/Kconfig             | 8 ++++++++
 mm/page_alloc.c        | 6 +++---
 3 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 39e912708e2a..95f4a780ea66 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -402,7 +402,7 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
 /* Returns the number of the current Node. */
 #define numa_node_id()		(cpu_to_node(raw_smp_processor_id()))
 
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
 
 extern struct pglist_data contig_page_data;
 #define NODE_DATA(nid)		(&contig_page_data)
@@ -410,11 +410,11 @@ extern struct pglist_data contig_page_data;
 #define MAX_NODES_SHIFT		1
 #define pfn_to_nid(pfn)		(0)
 
-#else /* CONFIG_DISCONTIGMEM */
+#else /* CONFIG_NEED_MULTIPLE_NODES */
 
 #include <asm/mmzone.h>
 
-#endif /* !CONFIG_DISCONTIGMEM */
+#endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
 /*
diff --git a/mm/Kconfig b/mm/Kconfig
index 69caa9d8674e..15c131393639 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -23,3 +23,11 @@ config DISCONTIGMEM
 
 endchoice
 
+#
+# Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
+# to represent different areas of memory.  This variable allows
+# those dependencies to exist individually.
+#
+config NEED_MULTIPLE_NODES
+	def_bool y
+	depends on DISCONTIGMEM || NUMA
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1958358e29b0..20e239599db0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1972,18 +1972,18 @@ void __init free_area_init_node(int nid, struct pglist_data *pgdat,
 	free_area_init_core(pgdat, zones_size, zholes_size);
 }
 
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
 static bootmem_data_t contig_bootmem_data;
 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
 
 EXPORT_SYMBOL(contig_page_data);
+#endif
 
 void __init free_area_init(unsigned long *zones_size)
 {
-	free_area_init_node(0, &contig_page_data, zones_size,
+	free_area_init_node(0, NODE_DATA(0), zones_size,
 			__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
 }
-#endif
 
 #ifdef CONFIG_PROC_FS
 
-- 
cgit v1.2.3-59-g8ed1b


From b159d43fbf7eaaac6ecc647f51cf4257332db47b Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Thu, 23 Jun 2005 00:07:52 -0700
Subject: [PATCH] generify early_pfn_to_nid

Provide a default implementation for early_pfn_to_nid returning node 0.  Allow
architectures to override this with their own implementation out of
asm/mmzone.h.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig         | 4 ++++
 include/asm-i386/mmzone.h | 3 +++
 include/linux/mmzone.h    | 4 ++++
 3 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 8e5242c8e09d..a8128f997339 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -810,6 +810,10 @@ config HAVE_ARCH_ALLOC_REMAP
 
 source "mm/Kconfig"
 
+config HAVE_ARCH_EARLY_PFN_TO_NID
+	bool
+	default y
+
 config HIGHPTE
 	bool "Allocate 3rd-level pagetables from highmem"
 	depends on HIGHMEM4G || HIGHMEM64G
diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h
index 9cec191f462c..48e46d403aa6 100644
--- a/include/asm-i386/mmzone.h
+++ b/include/asm-i386/mmzone.h
@@ -143,4 +143,7 @@ static inline void get_memcfg_numa(void)
 }
 
 #endif /* CONFIG_DISCONTIGMEM */
+
+extern int early_pfn_to_nid(unsigned long pfn);
+
 #endif /* _ASM_MMZONE_H_ */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 95f4a780ea66..6ef07de98d69 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -435,6 +435,10 @@ extern struct pglist_data contig_page_data;
 
 #endif
 
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+#define early_pfn_to_nid(nid)  (0UL)
+#endif
+
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MMZONE_H */
-- 
cgit v1.2.3-59-g8ed1b


From d41dee369bff3b9dcb6328d4d822926c28cc2594 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Thu, 23 Jun 2005 00:07:54 -0700
Subject: [PATCH] sparsemem memory model

Sparsemem abstracts the use of discontiguous mem_maps[].  This kind of
mem_map[] is needed by discontiguous memory machines (like in the old
CONFIG_DISCONTIGMEM case) as well as memory hotplug systems.  Sparsemem
replaces DISCONTIGMEM when enabled, and it is hoped that it can eventually
become a complete replacement.

A significant advantage over DISCONTIGMEM is that it's completely separated
from CONFIG_NUMA.  When producing this patch, it became apparent that NUMA
and DISCONTIG are often confused.

Another advantage is that sparse doesn't require each NUMA node's ranges to be
contiguous.  It can handle overlapping ranges between nodes with no problems,
where DISCONTIGMEM currently throws away that memory.

Sparsemem uses an array to provide different pfn_to_page() translations for
each SECTION_SIZE area of physical memory.  This is what allows the mem_map[]
to be chopped up.

In order to do quick pfn_to_page() operations, the section number of the page
is encoded in page->flags.  Part of the sparsemem infrastructure enables
sharing of these bits more dynamically (at compile-time) between the
page_zone() and sparsemem operations.  However, on 32-bit architectures, the
number of bits is quite limited, and may require growing the size of the
page->flags type in certain conditions.  Several things might force this to
occur: a decrease in the SECTION_SIZE (if you want to hotplug smaller areas of
memory), an increase in the physical address space, or an increase in the
number of used page->flags.

One thing to note is that, once sparsemem is present, the NUMA node
information no longer needs to be stored in the page->flags.  It might provide
speed increases on certain platforms and will be stored there if there is
room.  But, if out of room, an alternate (theoretically slower) mechanism is
used.

This patch introduces CONFIG_FLATMEM.  It is used in almost all cases where
there used to be an #ifndef DISCONTIG, because SPARSEMEM and DISCONTIGMEM
often have to compile out the same areas of code.

Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Martin Bligh <mbligh@aracnet.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig      |  1 +
 include/linux/mm.h     | 92 ++++++++++++++++++++++++++++++++++++++---------
 include/linux/mmzone.h | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/numa.h   |  2 +-
 mm/Kconfig             | 38 ++++++++++++++++++--
 mm/Makefile            |  1 +
 mm/bootmem.c           |  9 +++--
 mm/memory.c            |  2 +-
 mm/page_alloc.c        | 39 +++++++++++++++-----
 mm/sparse.c            | 85 ++++++++++++++++++++++++++++++++++++++++++++
 10 files changed, 332 insertions(+), 33 deletions(-)
 create mode 100644 mm/sparse.c

(limited to 'include/linux')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 3b7248126d29..f0064b5e3702 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -813,6 +813,7 @@ source "mm/Kconfig"
 config HAVE_ARCH_EARLY_PFN_TO_NID
 	bool
 	default y
+	depends on NUMA
 
 config HIGHPTE
 	bool "Allocate 3rd-level pagetables from highmem"
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 57b2ead51dba..6eb7f48317f8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -397,40 +397,80 @@ static inline void put_page(struct page *page)
  * sets it, so none of the operations on it need to be atomic.
  */
 
-/* Page flags: | NODE | ZONE | ... | FLAGS | */
-#define NODES_PGOFF		((sizeof(page_flags_t)*8) - NODES_SHIFT)
-#define ZONES_PGOFF		(NODES_PGOFF - ZONES_SHIFT)
+
+/*
+ * page->flags layout:
+ *
+ * There are three possibilities for how page->flags get
+ * laid out.  The first is for the normal case, without
+ * sparsemem.  The second is for sparsemem when there is
+ * plenty of space for node and section.  The last is when
+ * we have run out of space and have to fall back to an
+ * alternate (slower) way of determining the node.
+ *
+ *        No sparsemem: |       NODE     | ZONE | ... | FLAGS |
+ * with space for node: | SECTION | NODE | ZONE | ... | FLAGS |
+ *   no space for node: | SECTION |     ZONE    | ... | FLAGS |
+ */
+#ifdef CONFIG_SPARSEMEM
+#define SECTIONS_WIDTH		SECTIONS_SHIFT
+#else
+#define SECTIONS_WIDTH		0
+#endif
+
+#define ZONES_WIDTH		ZONES_SHIFT
+
+#if SECTIONS_WIDTH+ZONES_WIDTH+NODES_SHIFT <= FLAGS_RESERVED
+#define NODES_WIDTH		NODES_SHIFT
+#else
+#define NODES_WIDTH		0
+#endif
+
+/* Page flags: | [SECTION] | [NODE] | ZONE | ... | FLAGS | */
+#define SECTIONS_PGOFF		((sizeof(page_flags_t)*8) - SECTIONS_WIDTH)
+#define NODES_PGOFF		(SECTIONS_PGOFF - NODES_WIDTH)
+#define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
+
+/*
+ * We are going to use the flags for the page to node mapping if its in
+ * there.  This includes the case where there is no node, so it is implicit.
+ */
+#define FLAGS_HAS_NODE		(NODES_WIDTH > 0 || NODES_SHIFT == 0)
+
+#ifndef PFN_SECTION_SHIFT
+#define PFN_SECTION_SHIFT 0
+#endif
 
 /*
  * Define the bit shifts to access each section.  For non-existant
  * sections we define the shift as 0; that plus a 0 mask ensures
  * the compiler will optimise away reference to them.
  */
-#define NODES_PGSHIFT		(NODES_PGOFF * (NODES_SHIFT != 0))
-#define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_SHIFT != 0))
+#define SECTIONS_PGSHIFT	(SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
+#define NODES_PGSHIFT		(NODES_PGOFF * (NODES_WIDTH != 0))
+#define ZONES_PGSHIFT		(ZONES_PGOFF * (ZONES_WIDTH != 0))
 
-/* NODE:ZONE is used to lookup the zone from a page. */
+/* NODE:ZONE or SECTION:ZONE is used to lookup the zone from a page. */
+#if FLAGS_HAS_NODE
 #define ZONETABLE_SHIFT		(NODES_SHIFT + ZONES_SHIFT)
+#else
+#define ZONETABLE_SHIFT		(SECTIONS_SHIFT + ZONES_SHIFT)
+#endif
 #define ZONETABLE_PGSHIFT	ZONES_PGSHIFT
 
-#if NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED
-#error NODES_SHIFT+ZONES_SHIFT > FLAGS_RESERVED
+#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
+#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > FLAGS_RESERVED
 #endif
 
-#define NODEZONE(node, zone)	((node << ZONES_SHIFT) | zone)
-
-#define ZONES_MASK		((1UL << ZONES_SHIFT) - 1)
-#define NODES_MASK		((1UL << NODES_SHIFT) - 1)
+#define ZONES_MASK		((1UL << ZONES_WIDTH) - 1)
+#define NODES_MASK		((1UL << NODES_WIDTH) - 1)
+#define SECTIONS_MASK		((1UL << SECTIONS_WIDTH) - 1)
 #define ZONETABLE_MASK		((1UL << ZONETABLE_SHIFT) - 1)
 
 static inline unsigned long page_zonenum(struct page *page)
 {
 	return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
 }
-static inline unsigned long page_to_nid(struct page *page)
-{
-	return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
-}
 
 struct zone;
 extern struct zone *zone_table[];
@@ -441,6 +481,18 @@ static inline struct zone *page_zone(struct page *page)
 			ZONETABLE_MASK];
 }
 
+static inline unsigned long page_to_nid(struct page *page)
+{
+	if (FLAGS_HAS_NODE)
+		return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
+	else
+		return page_zone(page)->zone_pgdat->node_id;
+}
+static inline unsigned long page_to_section(struct page *page)
+{
+	return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
+}
+
 static inline void set_page_zone(struct page *page, unsigned long zone)
 {
 	page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
@@ -451,12 +503,18 @@ static inline void set_page_node(struct page *page, unsigned long node)
 	page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
 	page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
 }
+static inline void set_page_section(struct page *page, unsigned long section)
+{
+	page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
+	page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
+}
 
 static inline void set_page_links(struct page *page, unsigned long zone,
-	unsigned long node)
+	unsigned long node, unsigned long pfn)
 {
 	set_page_zone(page, zone);
 	set_page_node(page, node);
+	set_page_section(page, pfn_to_section_nr(pfn));
 }
 
 #ifndef CONFIG_DISCONTIGMEM
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 6ef07de98d69..19860d317ec2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -269,7 +269,9 @@ typedef struct pglist_data {
 	struct zone node_zones[MAX_NR_ZONES];
 	struct zonelist node_zonelists[GFP_ZONETYPES];
 	int nr_zones;
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
 	struct page *node_mem_map;
+#endif
 	struct bootmem_data *bdata;
 	unsigned long node_start_pfn;
 	unsigned long node_present_pages; /* total number of physical pages */
@@ -284,7 +286,11 @@ typedef struct pglist_data {
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
 #define node_spanned_pages(nid)	(NODE_DATA(nid)->node_spanned_pages)
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
 #define pgdat_page_nr(pgdat, pagenr)	((pgdat)->node_mem_map + (pagenr))
+#else
+#define pgdat_page_nr(pgdat, pagenr)	pfn_to_page((pgdat)->node_start_pfn + (pagenr))
+#endif
 #define nid_page_nr(nid, pagenr) 	pgdat_page_nr(NODE_DATA(nid),(pagenr))
 
 extern struct pglist_data *pgdat_list;
@@ -416,6 +422,10 @@ extern struct pglist_data contig_page_data;
 
 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
+#ifdef CONFIG_SPARSEMEM
+#include <asm/sparsemem.h>
+#endif
+
 #if BITS_PER_LONG == 32 || defined(ARCH_HAS_ATOMIC_UNSIGNED)
 /*
  * with 32 bit page->flags field, we reserve 8 bits for node/zone info.
@@ -439,6 +449,92 @@ extern struct pglist_data contig_page_data;
 #define early_pfn_to_nid(nid)  (0UL)
 #endif
 
+#define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT)
+#define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT)
+
+#ifdef CONFIG_SPARSEMEM
+
+/*
+ * SECTIONS_SHIFT   		#bits space required to store a section #
+ *
+ * PA_SECTION_SHIFT		physical address to/from section number
+ * PFN_SECTION_SHIFT		pfn to/from section number
+ */
+#define SECTIONS_SHIFT		(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
+
+#define PA_SECTION_SHIFT	(SECTION_SIZE_BITS)
+#define PFN_SECTION_SHIFT	(SECTION_SIZE_BITS - PAGE_SHIFT)
+
+#define NR_MEM_SECTIONS		(1UL << SECTIONS_SHIFT)
+
+#define PAGES_PER_SECTION       (1UL << PFN_SECTION_SHIFT)
+#define PAGE_SECTION_MASK	(~(PAGES_PER_SECTION-1))
+
+#if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS
+#error Allocator MAX_ORDER exceeds SECTION_SIZE
+#endif
+
+struct page;
+struct mem_section {
+	struct page *section_mem_map;
+};
+
+extern struct mem_section mem_section[NR_MEM_SECTIONS];
+
+/*
+ * Given a kernel address, find the home node of the underlying memory.
+ */
+#define kvaddr_to_nid(kaddr)	pfn_to_nid(__pa(kaddr) >> PAGE_SHIFT)
+
+static inline struct mem_section *__pfn_to_section(unsigned long pfn)
+{
+	return &mem_section[pfn_to_section_nr(pfn)];
+}
+
+#define pfn_to_page(pfn) 						\
+({ 									\
+	unsigned long __pfn = (pfn);					\
+	__pfn_to_section(__pfn)->section_mem_map + __pfn;		\
+})
+#define page_to_pfn(page)						\
+({									\
+	page - mem_section[page_to_section(page)].section_mem_map;	\
+})
+
+static inline int pfn_valid(unsigned long pfn)
+{
+	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
+		return 0;
+	return mem_section[pfn_to_section_nr(pfn)].section_mem_map != 0;
+}
+
+/*
+ * These are _only_ used during initialisation, therefore they
+ * can use __initdata ...  They could have names to indicate
+ * this restriction.
+ */
+#ifdef CONFIG_NUMA
+#define pfn_to_nid		early_pfn_to_nid
+#endif
+
+#define pfn_to_pgdat(pfn)						\
+({									\
+	NODE_DATA(pfn_to_nid(pfn));					\
+})
+
+#define early_pfn_valid(pfn)	pfn_valid(pfn)
+void sparse_init(void);
+#else
+#define sparse_init()	do {} while (0)
+#endif /* CONFIG_SPARSEMEM */
+
+#ifndef early_pfn_valid
+#define early_pfn_valid(pfn)	(1)
+#endif
+
+void memory_present(int nid, unsigned long start, unsigned long end);
+unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long);
+
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MMZONE_H */
diff --git a/include/linux/numa.h b/include/linux/numa.h
index bd0c8c4e9a95..f0c539bd3cfc 100644
--- a/include/linux/numa.h
+++ b/include/linux/numa.h
@@ -3,7 +3,7 @@
 
 #include <linux/config.h>
 
-#ifdef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_FLATMEM
 #include <asm/numnodes.h>
 #endif
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 5127441561b4..cd379936cac6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -6,6 +6,7 @@ choice
 	prompt "Memory model"
 	depends on SELECT_MEMORY_MODEL
 	default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
+	default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
 	default FLATMEM_MANUAL
 
 config FLATMEM_MANUAL
@@ -17,7 +18,15 @@ config FLATMEM_MANUAL
 	  only have one option here: FLATMEM.  This is normal
 	  and a correct option.
 
-	  If unsure, choose this option over any other.
+	  Some users of more advanced features like NUMA and
+	  memory hotplug may have different options here.
+	  DISCONTIGMEM is a more mature, better tested system,
+	  but is incompatible with memory hotplug and may suffer
+	  decreased performance over SPARSEMEM.  If unsure between
+	  "Sparse Memory" and "Discontiguous Memory", choose
+	  "Discontiguous Memory".
+
+	  If unsure, choose this option (Flat Memory) over any other.
 
 config DISCONTIGMEM_MANUAL
 	bool "Discontigious Memory"
@@ -35,15 +44,38 @@ config DISCONTIGMEM_MANUAL
 
 	  If unsure, choose "Flat Memory" over this option.
 
+config SPARSEMEM_MANUAL
+	bool "Sparse Memory"
+	depends on ARCH_SPARSEMEM_ENABLE
+	help
+	  This will be the only option for some systems, including
+	  memory hotplug systems.  This is normal.
+
+	  For many other systems, this will be an alternative to
+	  "Discontigious Memory".  This option provides some potential
+	  performance benefits, along with decreased code complexity,
+	  but it is newer, and more experimental.
+
+	  If unsure, choose "Discontiguous Memory" or "Flat Memory"
+	  over this option.
+
 endchoice
 
 config DISCONTIGMEM
 	def_bool y
 	depends on (!SELECT_MEMORY_MODEL && ARCH_DISCONTIGMEM_ENABLE) || DISCONTIGMEM_MANUAL
 
+config SPARSEMEM
+	def_bool y
+	depends on SPARSEMEM_MANUAL
+
 config FLATMEM
 	def_bool y
-	depends on !DISCONTIGMEM || FLATMEM_MANUAL
+	depends on (!DISCONTIGMEM && !SPARSEMEM) || FLATMEM_MANUAL
+
+config FLAT_NODE_MEM_MAP
+	def_bool y
+	depends on !SPARSEMEM
 
 #
 # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
@@ -56,4 +88,4 @@ config NEED_MULTIPLE_NODES
 
 config HAVE_MEMORY_PRESENT
 	def_bool y
-	depends on ARCH_HAVE_MEMORY_PRESENT
+	depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
diff --git a/mm/Makefile b/mm/Makefile
index 097408064f6a..8f70ffd763c8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,6 +15,7 @@ obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
+obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 260e703850d8..f82f7aebbee3 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -256,6 +256,7 @@ found:
 static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 {
 	struct page *page;
+	unsigned long pfn;
 	bootmem_data_t *bdata = pgdat->bdata;
 	unsigned long i, count, total = 0;
 	unsigned long idx;
@@ -266,7 +267,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 
 	count = 0;
 	/* first extant page of the node */
-	page = virt_to_page(phys_to_virt(bdata->node_boot_start));
+	pfn = bdata->node_boot_start >> PAGE_SHIFT;
 	idx = bdata->node_low_pfn - (bdata->node_boot_start >> PAGE_SHIFT);
 	map = bdata->node_bootmem_map;
 	/* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
@@ -275,9 +276,11 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 		gofast = 1;
 	for (i = 0; i < idx; ) {
 		unsigned long v = ~map[i / BITS_PER_LONG];
+
 		if (gofast && v == ~0UL) {
 			int j, order;
 
+			page = pfn_to_page(pfn);
 			count += BITS_PER_LONG;
 			__ClearPageReserved(page);
 			order = ffs(BITS_PER_LONG) - 1;
@@ -292,6 +295,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 			page += BITS_PER_LONG;
 		} else if (v) {
 			unsigned long m;
+
+			page = pfn_to_page(pfn);
 			for (m = 1; m && i < idx; m<<=1, page++, i++) {
 				if (v & m) {
 					count++;
@@ -302,8 +307,8 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 			}
 		} else {
 			i+=BITS_PER_LONG;
-			page += BITS_PER_LONG;
 		}
+		pfn += BITS_PER_LONG;
 	}
 	total += count;
 
diff --git a/mm/memory.c b/mm/memory.c
index da91b7bf9986..30975ef48722 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -58,7 +58,7 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
 struct page *mem_map;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 20e239599db0..5c1b8982a6da 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages);
  * Used by page_zone() to look up the address of the struct zone whose
  * id is encoded in the upper bits of page->flags
  */
-struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
+struct zone *zone_table[1 << ZONETABLE_SHIFT];
 EXPORT_SYMBOL(zone_table);
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
@@ -1649,11 +1649,15 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
 void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		unsigned long start_pfn)
 {
-	struct page *start = pfn_to_page(start_pfn);
 	struct page *page;
+	int end_pfn = start_pfn + size;
+	int pfn;
 
-	for (page = start; page < (start + size); page++) {
-		set_page_links(page, zone, nid);
+	for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
+		if (!early_pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		set_page_links(page, zone, nid, pfn);
 		set_page_count(page, 0);
 		reset_page_mapcount(page);
 		SetPageReserved(page);
@@ -1677,6 +1681,20 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 	}
 }
 
+#define ZONETABLE_INDEX(x, zone_nr)	((x << ZONES_SHIFT) | zone_nr)
+void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
+		unsigned long size)
+{
+	unsigned long snum = pfn_to_section_nr(pfn);
+	unsigned long end = pfn_to_section_nr(pfn + size);
+
+	if (FLAGS_HAS_NODE)
+		zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
+	else
+		for (; snum <= end; snum++)
+			zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
+}
+
 #ifndef __HAVE_ARCH_MEMMAP_INIT
 #define memmap_init(size, nid, zone, start_pfn) \
 	memmap_init_zone((size), (nid), (zone), (start_pfn))
@@ -1861,7 +1879,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 		unsigned long size, realsize;
 		unsigned long batch;
 
-		zone_table[NODEZONE(nid, j)] = zone;
 		realsize = size = zones_size[j];
 		if (zholes_size)
 			realsize -= zholes_size[j];
@@ -1927,6 +1944,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
 		memmap_init(size, nid, j, zone_start_pfn);
 
+		zonetable_add(zone, nid, j, zone_start_pfn, size);
+
 		zone_start_pfn += size;
 
 		zone_init_free_lists(pgdat, zone, zone->spanned_pages);
@@ -1935,28 +1954,30 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
 
 static void __init alloc_node_mem_map(struct pglist_data *pgdat)
 {
-	unsigned long size;
-	struct page *map;
-
 	/* Skip empty nodes */
 	if (!pgdat->node_spanned_pages)
 		return;
 
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
 	/* ia64 gets its own node_mem_map, before this, without bootmem */
 	if (!pgdat->node_mem_map) {
+		unsigned long size;
+		struct page *map;
+
 		size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
 		map = alloc_remap(pgdat->node_id, size);
 		if (!map)
 			map = alloc_bootmem_node(pgdat, size);
 		pgdat->node_mem_map = map;
 	}
-#ifndef CONFIG_DISCONTIGMEM
+#ifdef CONFIG_FLATMEM
 	/*
 	 * With no DISCONTIG, the global mem_map is just set as node 0's
 	 */
 	if (pgdat == NODE_DATA(0))
 		mem_map = NODE_DATA(0)->node_mem_map;
 #endif
+#endif /* CONFIG_FLAT_NODE_MEM_MAP */
 }
 
 void __init free_area_init_node(int nid, struct pglist_data *pgdat,
diff --git a/mm/sparse.c b/mm/sparse.c
new file mode 100644
index 000000000000..f888385b9e14
--- /dev/null
+++ b/mm/sparse.c
@@ -0,0 +1,85 @@
+/*
+ * sparse memory mappings.
+ */
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <asm/dma.h>
+
+/*
+ * Permanent SPARSEMEM data:
+ *
+ * 1) mem_section	- memory sections, mem_map's for valid memory
+ */
+struct mem_section mem_section[NR_MEM_SECTIONS];
+EXPORT_SYMBOL(mem_section);
+
+/* Record a memory area against a node. */
+void memory_present(int nid, unsigned long start, unsigned long end)
+{
+	unsigned long pfn;
+
+	start &= PAGE_SECTION_MASK;
+	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
+		unsigned long section = pfn_to_section_nr(pfn);
+		if (!mem_section[section].section_mem_map)
+			mem_section[section].section_mem_map = (void *) -1;
+	}
+}
+
+/*
+ * Only used by the i386 NUMA architectures, but relatively
+ * generic code.
+ */
+unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
+						     unsigned long end_pfn)
+{
+	unsigned long pfn;
+	unsigned long nr_pages = 0;
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		if (nid != early_pfn_to_nid(pfn))
+			continue;
+
+		if (pfn_valid(pfn))
+			nr_pages += PAGES_PER_SECTION;
+	}
+
+	return nr_pages * sizeof(struct page);
+}
+
+/*
+ * Allocate the accumulated non-linear sections, allocate a mem_map
+ * for each and record the physical to section mapping.
+ */
+void sparse_init(void)
+{
+	unsigned long pnum;
+	struct page *map;
+	int nid;
+
+	for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
+		if (!mem_section[pnum].section_mem_map)
+			continue;
+
+		nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
+		map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
+		if (!map)
+			map = alloc_bootmem_node(NODE_DATA(nid),
+				sizeof(struct page) * PAGES_PER_SECTION);
+		if (!map) {
+			mem_section[pnum].section_mem_map = 0;
+			continue;
+		}
+
+		/*
+		 * Subtle, we encode the real pfn into the mem_map such that
+		 * the identity pfn - section_mem_map will return the actual
+		 * physical page frame number.
+		 */
+		mem_section[pnum].section_mem_map = map -
+						section_nr_to_pfn(pnum);
+	}
+}
-- 
cgit v1.2.3-59-g8ed1b


From 641c767389b19859a45e6de46d8e18cd935bdb60 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Thu, 23 Jun 2005 00:07:59 -0700
Subject: [PATCH] sparsemem swiss cheese numa layouts

The part of the sparsemem patch which modifies memmap_init_zone() has recently
become a problem.  It changes behavior so that there is a call to
pfn_to_page() for each individual page inside of a node's range:
node_start_pfn through node_end_pfn.  It used to simply do this once, at the
beginning of the node, but having sparsemem's non-contiguous mem_map[]s inside
of a node made it necessary to change.

Mike Kravetz recently wrote a patch which made the NUMA code accept some new
kinds of layouts.  The system's memory was laid out like this, with node 0's
memory in two pieces: one before and one after node 1's memory:

	Node 0: +++++     +++++
	Node 1:      +++++

Previous behavior before Mike's patch was to assign nodes like this:

	Node 0: 00000     XXXXX
	Node 1:      11111

Where the 'X' areas were simply thrown away.  The new behavior was to make the
pg_data_t span node 0 across all of its areas, including areas that are really
node 1's:

	Node 0: 000000000000000
	Node 1:      11111

This wastes a little bit of mem_map space, but ends up being OK, and more
fully utilizes the system's memory.  memmap_init_zone() initializes all of the
"struct page"s for node 0, even for the "hole", but those never get used,
because there is no pfn_to_page() that resolves to those pages.  However, only
calling pfn_to_page() once, memmap_init_zone() always uses the pages that were
allocated for node0->node_mem_map because:

	struct page *start = pfn_to_page(start_pfn);
	// effectively start = &node->node_mem_map[0]
	for (page = start; page < (start + size); page++) {
		init_page_here();...
		page++;
	}

Slow, and wasteful, but generally harmless.

But, modify that to call pfn_to_page() for each loop iteration (like sparsemem
does):

	for (pfn = start_pfn; pfn < (start_pfn + size); pfn++) {
		page = pfn_to_page(pfn);
	}

And you end up trying to initialize node 1's pages too early, along with bogus
data from node 0.  This patch checks for those weird layouts and declines to
touch the pages, making the more frequent pfn_to_page() calls OK to do.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc64/Kconfig     | 12 ++++++++++++
 include/linux/mmzone.h |  6 ++++++
 mm/page_alloc.c        |  2 ++
 3 files changed, 20 insertions(+)

(limited to 'include/linux')

diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig
index 011b5c0bf1d0..85f8fcf44b6c 100644
--- a/arch/ppc64/Kconfig
+++ b/arch/ppc64/Kconfig
@@ -211,6 +211,18 @@ config ARCH_FLATMEM_ENABLE
 
 source "mm/Kconfig"
 
+# Some NUMA nodes have memory ranges that span
+# other nodes.  Even though a pfn is valid and
+# between a node's start and end pfns, it may not
+# reside on that node.
+#
+# This is a relatively temporary hack that should
+# be able to go away when sparsemem is fully in
+# place
+config NODES_SPAN_OTHER_NODES
+	def_bool y
+	depends on NEED_MULTIPLE_NODES
+
 config NUMA
 	bool "NUMA support"
 	depends on DISCONTIGMEM
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 19860d317ec2..746b57e3d370 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -528,6 +528,12 @@ void sparse_init(void);
 #define sparse_init()	do {} while (0)
 #endif /* CONFIG_SPARSEMEM */
 
+#ifdef CONFIG_NODES_SPAN_OTHER_NODES
+#define early_pfn_in_nid(pfn, nid)	(early_pfn_to_nid(pfn) == (nid))
+#else
+#define early_pfn_in_nid(pfn, nid)	(1)
+#endif
+
 #ifndef early_pfn_valid
 #define early_pfn_valid(pfn)	(1)
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5c1b8982a6da..1eb683f9b3af 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1656,6 +1656,8 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 	for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
 		if (!early_pfn_valid(pfn))
 			continue;
+		if (!early_pfn_in_nid(pfn, nid))
+			continue;
 		page = pfn_to_page(pfn);
 		set_page_links(page, zone, nid, pfn);
 		set_page_count(page, 0);
-- 
cgit v1.2.3-59-g8ed1b


From 29751f6991e845f7d002a6ae520bf996b38c8dcd Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@shadowen.org>
Date: Thu, 23 Jun 2005 00:08:00 -0700
Subject: [PATCH] sparsemem hotplug base

Make sparse's initialization be accessible at runtime.  This allows sparse
mappings to be created after boot in a hotplug situation.

This patch is separated from the previous one just to give an indication how
much of the sparse infrastructure is *just* for hotplug memory.

The section_mem_map doesn't really store a pointer.  It stores something that
is convenient to do some math against to get a pointer.  It isn't valid to
just do *section_mem_map, so I don't think it should be stored as a pointer.

There are a couple of things I'd like to store about a section.  First of all,
the fact that it is !NULL does not mean that it is present.  There could be
such a combination where section_mem_map *is* NULL, but the math gets you
properly to a real mem_map.  So, I don't think that check is safe.

Since we're storing 32-bit-aligned structures, we have a few bits in the
bottom of the pointer to play with.  Use one bit to encode whether there's
really a mem_map there, and the other one to tell whether there's a valid
section there.  We need to distinguish between the two because sometimes
there's a gap between when a section is discovered to be present and when we
can get the mem_map for it.

Signed-off-by: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Jack Steiner <steiner@sgi.com>
Signed-off-by: Bob Picco <bob.picco@hp.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 56 +++++++++++++++++++++++++++---
 mm/page_alloc.c        |  4 +--
 mm/sparse.c            | 92 +++++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 125 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 746b57e3d370..6c90461ed99f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -476,11 +476,56 @@ extern struct pglist_data contig_page_data;
 
 struct page;
 struct mem_section {
-	struct page *section_mem_map;
+	/*
+	 * This is, logically, a pointer to an array of struct
+	 * pages.  However, it is stored with some other magic.
+	 * (see sparse.c::sparse_init_one_section())
+	 *
+	 * Making it a UL at least makes someone do a cast
+	 * before using it wrong.
+	 */
+	unsigned long section_mem_map;
 };
 
 extern struct mem_section mem_section[NR_MEM_SECTIONS];
 
+static inline struct mem_section *__nr_to_section(unsigned long nr)
+{
+	return &mem_section[nr];
+}
+
+/*
+ * We use the lower bits of the mem_map pointer to store
+ * a little bit of information.  There should be at least
+ * 3 bits here due to 32-bit alignment.
+ */
+#define	SECTION_MARKED_PRESENT	(1UL<<0)
+#define SECTION_HAS_MEM_MAP	(1UL<<1)
+#define SECTION_MAP_LAST_BIT	(1UL<<2)
+#define SECTION_MAP_MASK	(~(SECTION_MAP_LAST_BIT-1))
+
+static inline struct page *__section_mem_map_addr(struct mem_section *section)
+{
+	unsigned long map = section->section_mem_map;
+	map &= SECTION_MAP_MASK;
+	return (struct page *)map;
+}
+
+static inline int valid_section(struct mem_section *section)
+{
+	return (section->section_mem_map & SECTION_MARKED_PRESENT);
+}
+
+static inline int section_has_mem_map(struct mem_section *section)
+{
+	return (section->section_mem_map & SECTION_HAS_MEM_MAP);
+}
+
+static inline int valid_section_nr(unsigned long nr)
+{
+	return valid_section(__nr_to_section(nr));
+}
+
 /*
  * Given a kernel address, find the home node of the underlying memory.
  */
@@ -488,24 +533,25 @@ extern struct mem_section mem_section[NR_MEM_SECTIONS];
 
 static inline struct mem_section *__pfn_to_section(unsigned long pfn)
 {
-	return &mem_section[pfn_to_section_nr(pfn)];
+	return __nr_to_section(pfn_to_section_nr(pfn));
 }
 
 #define pfn_to_page(pfn) 						\
 ({ 									\
 	unsigned long __pfn = (pfn);					\
-	__pfn_to_section(__pfn)->section_mem_map + __pfn;		\
+	__section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn;	\
 })
 #define page_to_pfn(page)						\
 ({									\
-	page - mem_section[page_to_section(page)].section_mem_map;	\
+	page - __section_mem_map_addr(__nr_to_section(			\
+		page_to_section(page)));				\
 })
 
 static inline int pfn_valid(unsigned long pfn)
 {
 	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
 		return 0;
-	return mem_section[pfn_to_section_nr(pfn)].section_mem_map != 0;
+	return valid_section(__nr_to_section(pfn_to_section_nr(pfn)));
 }
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1eb683f9b3af..7ee675ad101e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1650,8 +1650,8 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		unsigned long start_pfn)
 {
 	struct page *page;
-	int end_pfn = start_pfn + size;
-	int pfn;
+	unsigned long end_pfn = start_pfn + size;
+	unsigned long pfn;
 
 	for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
 		if (!early_pfn_valid(pfn))
diff --git a/mm/sparse.c b/mm/sparse.c
index f888385b9e14..b54e304df4a7 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -25,7 +25,7 @@ void memory_present(int nid, unsigned long start, unsigned long end)
 	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
 		unsigned long section = pfn_to_section_nr(pfn);
 		if (!mem_section[section].section_mem_map)
-			mem_section[section].section_mem_map = (void *) -1;
+			mem_section[section].section_mem_map = SECTION_MARKED_PRESENT;
 	}
 }
 
@@ -50,6 +50,56 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
 	return nr_pages * sizeof(struct page);
 }
 
+/*
+ * Subtle, we encode the real pfn into the mem_map such that
+ * the identity pfn - section_mem_map will return the actual
+ * physical page frame number.
+ */
+static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
+{
+	return (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
+}
+
+/*
+ * We need this if we ever free the mem_maps.  While not implemented yet,
+ * this function is included for parity with its sibling.
+ */
+static __attribute((unused))
+struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
+{
+	return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
+}
+
+static int sparse_init_one_section(struct mem_section *ms,
+		unsigned long pnum, struct page *mem_map)
+{
+	if (!valid_section(ms))
+		return -EINVAL;
+
+	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
+
+	return 1;
+}
+
+static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
+{
+	struct page *map;
+	int nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
+
+	map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
+	if (map)
+		return map;
+
+	map = alloc_bootmem_node(NODE_DATA(nid),
+			sizeof(struct page) * PAGES_PER_SECTION);
+	if (map)
+		return map;
+
+	printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
+	mem_section[pnum].section_mem_map = 0;
+	return NULL;
+}
+
 /*
  * Allocate the accumulated non-linear sections, allocate a mem_map
  * for each and record the physical to section mapping.
@@ -58,28 +108,30 @@ void sparse_init(void)
 {
 	unsigned long pnum;
 	struct page *map;
-	int nid;
 
 	for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
-		if (!mem_section[pnum].section_mem_map)
+		if (!valid_section_nr(pnum))
 			continue;
 
-		nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
-		map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
-		if (!map)
-			map = alloc_bootmem_node(NODE_DATA(nid),
-				sizeof(struct page) * PAGES_PER_SECTION);
-		if (!map) {
-			mem_section[pnum].section_mem_map = 0;
-			continue;
-		}
-
-		/*
-		 * Subtle, we encode the real pfn into the mem_map such that
-		 * the identity pfn - section_mem_map will return the actual
-		 * physical page frame number.
-		 */
-		mem_section[pnum].section_mem_map = map -
-						section_nr_to_pfn(pnum);
+		map = sparse_early_mem_map_alloc(pnum);
+		if (map)
+			sparse_init_one_section(&mem_section[pnum], pnum, map);
 	}
 }
+
+/*
+ * returns the number of sections whose mem_maps were properly
+ * set.  If this is <=0, then that means that the passed-in
+ * map was not consumed and must be freed.
+ */
+int sparse_add_one_section(unsigned long start_pfn, int nr_pages, struct page *map)
+{
+	struct mem_section *ms = __pfn_to_section(start_pfn);
+
+	if (ms->section_mem_map & SECTION_MARKED_PRESENT)
+		return -EEXIST;
+
+	ms->section_mem_map |= SECTION_MARKED_PRESENT;
+
+	return sparse_init_one_section(ms, pfn_to_section_nr(start_pfn), map);
+}
-- 
cgit v1.2.3-59-g8ed1b


From 1946089a109251655c5438d92c539bd2930e71ea Mon Sep 17 00:00:00 2001
From: Christoph Lameter <christoph@lameter.com>
Date: Thu, 23 Jun 2005 00:08:19 -0700
Subject: [PATCH] NUMA aware block device control structure allocation

Patch to allocate the control structures for IDE devices on the node of
the device itself (for NUMA systems).  The patch depends on the Slab API
change patch by Manfred and me (in mm) and the pcidev_to_node patch that I
posted today.

Does some realignment too.

Signed-off-by: Justin M. Forbes <jmforbes@linuxtx.org>
Signed-off-by: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Pravin Shelar <pravin@calsoftinc.com>
Signed-off-by: Shobhit Dayal <shobhit@calsoftinc.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/as-iosched.c       |  8 +++++---
 drivers/block/deadline-iosched.c |  8 +++++---
 drivers/block/genhd.c            | 13 ++++++++++---
 drivers/block/ll_rw_blk.c        | 30 +++++++++++++++++++++++-------
 drivers/ide/ide-disk.c           |  3 ++-
 drivers/ide/ide-probe.c          |  8 +++++---
 include/linux/blkdev.h           |  6 +++++-
 include/linux/genhd.h            |  1 +
 include/linux/ide.h              |  2 +-
 include/linux/mempool.h          | 11 ++++++++---
 mm/mempool.c                     | 17 ++++++++++++-----
 11 files changed, 77 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/as-iosched.c b/drivers/block/as-iosched.c
index 638db06de2be..3410b4d294b9 100644
--- a/drivers/block/as-iosched.c
+++ b/drivers/block/as-iosched.c
@@ -1871,20 +1871,22 @@ static int as_init_queue(request_queue_t *q, elevator_t *e)
 	if (!arq_pool)
 		return -ENOMEM;
 
-	ad = kmalloc(sizeof(*ad), GFP_KERNEL);
+	ad = kmalloc_node(sizeof(*ad), GFP_KERNEL, q->node);
 	if (!ad)
 		return -ENOMEM;
 	memset(ad, 0, sizeof(*ad));
 
 	ad->q = q; /* Identify what queue the data belongs to */
 
-	ad->hash = kmalloc(sizeof(struct list_head)*AS_HASH_ENTRIES,GFP_KERNEL);
+	ad->hash = kmalloc_node(sizeof(struct list_head)*AS_HASH_ENTRIES,
+				GFP_KERNEL, q->node);
 	if (!ad->hash) {
 		kfree(ad);
 		return -ENOMEM;
 	}
 
-	ad->arq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, arq_pool);
+	ad->arq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
+				mempool_free_slab, arq_pool, q->node);
 	if (!ad->arq_pool) {
 		kfree(ad->hash);
 		kfree(ad);
diff --git a/drivers/block/deadline-iosched.c b/drivers/block/deadline-iosched.c
index 7f79f3dd0165..4bc2fea73273 100644
--- a/drivers/block/deadline-iosched.c
+++ b/drivers/block/deadline-iosched.c
@@ -711,18 +711,20 @@ static int deadline_init_queue(request_queue_t *q, elevator_t *e)
 	if (!drq_pool)
 		return -ENOMEM;
 
-	dd = kmalloc(sizeof(*dd), GFP_KERNEL);
+	dd = kmalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
 	if (!dd)
 		return -ENOMEM;
 	memset(dd, 0, sizeof(*dd));
 
-	dd->hash = kmalloc(sizeof(struct list_head)*DL_HASH_ENTRIES,GFP_KERNEL);
+	dd->hash = kmalloc_node(sizeof(struct list_head)*DL_HASH_ENTRIES,
+				GFP_KERNEL, q->node);
 	if (!dd->hash) {
 		kfree(dd);
 		return -ENOMEM;
 	}
 
-	dd->drq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, drq_pool);
+	dd->drq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
+					mempool_free_slab, drq_pool, q->node);
 	if (!dd->drq_pool) {
 		kfree(dd->hash);
 		kfree(dd);
diff --git a/drivers/block/genhd.c b/drivers/block/genhd.c
index 53f7d846b747..43805e4d31e9 100644
--- a/drivers/block/genhd.c
+++ b/drivers/block/genhd.c
@@ -582,10 +582,16 @@ struct seq_operations diskstats_op = {
 	.show	= diskstats_show
 };
 
-
 struct gendisk *alloc_disk(int minors)
 {
-	struct gendisk *disk = kmalloc(sizeof(struct gendisk), GFP_KERNEL);
+	return alloc_disk_node(minors, -1);
+}
+
+struct gendisk *alloc_disk_node(int minors, int node_id)
+{
+	struct gendisk *disk;
+
+	disk = kmalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
 	if (disk) {
 		memset(disk, 0, sizeof(struct gendisk));
 		if (!init_disk_stats(disk)) {
@@ -594,7 +600,7 @@ struct gendisk *alloc_disk(int minors)
 		}
 		if (minors > 1) {
 			int size = (minors - 1) * sizeof(struct hd_struct *);
-			disk->part = kmalloc(size, GFP_KERNEL);
+			disk->part = kmalloc_node(size, GFP_KERNEL, node_id);
 			if (!disk->part) {
 				kfree(disk);
 				return NULL;
@@ -610,6 +616,7 @@ struct gendisk *alloc_disk(int minors)
 }
 
 EXPORT_SYMBOL(alloc_disk);
+EXPORT_SYMBOL(alloc_disk_node);
 
 struct kobject *get_disk(struct gendisk *disk)
 {
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 81fe3a0c1fe7..cd8cf302068c 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
+#include <linux/blkdev.h>
 
 /*
  * for max sense size
@@ -1645,7 +1646,8 @@ static int blk_init_free_list(request_queue_t *q)
 	init_waitqueue_head(&rl->wait[WRITE]);
 	init_waitqueue_head(&rl->drain);
 
-	rl->rq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, request_cachep);
+	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
+				mempool_free_slab, request_cachep, q->node);
 
 	if (!rl->rq_pool)
 		return -ENOMEM;
@@ -1657,8 +1659,15 @@ static int __make_request(request_queue_t *, struct bio *);
 
 request_queue_t *blk_alloc_queue(int gfp_mask)
 {
-	request_queue_t *q = kmem_cache_alloc(requestq_cachep, gfp_mask);
+	return blk_alloc_queue_node(gfp_mask, -1);
+}
+EXPORT_SYMBOL(blk_alloc_queue);
+
+request_queue_t *blk_alloc_queue_node(int gfp_mask, int node_id)
+{
+	request_queue_t *q;
 
+	q = kmem_cache_alloc_node(requestq_cachep, gfp_mask, node_id);
 	if (!q)
 		return NULL;
 
@@ -1671,8 +1680,7 @@ request_queue_t *blk_alloc_queue(int gfp_mask)
 
 	return q;
 }
-
-EXPORT_SYMBOL(blk_alloc_queue);
+EXPORT_SYMBOL(blk_alloc_queue_node);
 
 /**
  * blk_init_queue  - prepare a request queue for use with a block device
@@ -1705,13 +1713,22 @@ EXPORT_SYMBOL(blk_alloc_queue);
  *    blk_init_queue() must be paired with a blk_cleanup_queue() call
  *    when the block device is deactivated (such as at module unload).
  **/
+
 request_queue_t *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
 {
-	request_queue_t *q = blk_alloc_queue(GFP_KERNEL);
+	return blk_init_queue_node(rfn, lock, -1);
+}
+EXPORT_SYMBOL(blk_init_queue);
+
+request_queue_t *
+blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
+{
+	request_queue_t *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
 
 	if (!q)
 		return NULL;
 
+	q->node = node_id;
 	if (blk_init_free_list(q))
 		goto out_init;
 
@@ -1754,8 +1771,7 @@ out_init:
 	kmem_cache_free(requestq_cachep, q);
 	return NULL;
 }
-
-EXPORT_SYMBOL(blk_init_queue);
+EXPORT_SYMBOL(blk_init_queue_node);
 
 int blk_get_queue(request_queue_t *q)
 {
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index 3302cd8eab4c..d6f934886b04 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -1215,7 +1215,8 @@ static int ide_disk_probe(struct device *dev)
 	if (!idkp)
 		goto failed;
 
-	g = alloc_disk(1 << PARTN_BITS);
+	g = alloc_disk_node(1 << PARTN_BITS,
+			pcibus_to_node(drive->hwif->pci_dev->bus));
 	if (!g)
 		goto out_free_idkp;
 
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 5d876f53c697..7df85af75371 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -977,8 +977,9 @@ static int ide_init_queue(ide_drive_t *drive)
 	 *	limits and LBA48 we could raise it but as yet
 	 *	do not.
 	 */
-	 
-	q = blk_init_queue(do_ide_request, &ide_lock);
+
+	q = blk_init_queue_node(do_ide_request, &ide_lock,
+				pcibus_to_node(drive->hwif->pci_dev->bus));
 	if (!q)
 		return 1;
 
@@ -1095,7 +1096,8 @@ static int init_irq (ide_hwif_t *hwif)
 		hwgroup->hwif->next = hwif;
 		spin_unlock_irq(&ide_lock);
 	} else {
-		hwgroup = kmalloc(sizeof(ide_hwgroup_t),GFP_KERNEL);
+		hwgroup = kmalloc_node(sizeof(ide_hwgroup_t), GFP_KERNEL,
+			pcibus_to_node(hwif->drives[0].hwif->pci_dev->bus));
 		if (!hwgroup)
 	       		goto out_up;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4a99b76c5a33..235c3414d268 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -396,6 +396,7 @@ struct request_queue
 	 */
 	unsigned int		sg_timeout;
 	unsigned int		sg_reserved_size;
+	int			node;
 
 	struct list_head	drain_list;
 
@@ -615,6 +616,8 @@ static inline void blkdev_dequeue_request(struct request *req)
 /*
  * Access functions for manipulating queue properties
  */
+extern request_queue_t *blk_init_queue_node(request_fn_proc *rfn,
+					spinlock_t *lock, int node_id);
 extern request_queue_t *blk_init_queue(request_fn_proc *, spinlock_t *);
 extern void blk_cleanup_queue(request_queue_t *);
 extern void blk_queue_make_request(request_queue_t *, make_request_fn *);
@@ -646,7 +649,8 @@ extern void blk_wait_queue_drained(request_queue_t *, int);
 extern void blk_finish_queue_drain(request_queue_t *);
 
 int blk_get_queue(request_queue_t *);
-request_queue_t *blk_alloc_queue(int);
+request_queue_t *blk_alloc_queue(int gfp_mask);
+request_queue_t *blk_alloc_queue_node(int,int);
 #define blk_put_queue(q) blk_cleanup_queue((q))
 
 /*
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 47dedaf971d6..af26dc718ef6 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -403,6 +403,7 @@ extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev);
 extern void add_partition(struct gendisk *, int, sector_t, sector_t);
 extern void delete_partition(struct gendisk *, int);
 
+extern struct gendisk *alloc_disk_node(int minors, int node_id);
 extern struct gendisk *alloc_disk(int minors);
 extern struct kobject *get_disk(struct gendisk *disk);
 extern void put_disk(struct gendisk *disk);
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 336d6e509f59..92129078d4f3 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -917,7 +917,7 @@ typedef struct hwif_s {
 	unsigned dma;
 
 	void (*led_act)(void *data, int rw);
-} ide_hwif_t;
+} ____cacheline_maxaligned_in_smp ide_hwif_t;
 
 /*
  *  internal ide interrupt handler type
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 4a36edf1c974..796220ce47cc 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -20,9 +20,14 @@ typedef struct mempool_s {
 	mempool_free_t *free;
 	wait_queue_head_t wait;
 } mempool_t;
-extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
-				 mempool_free_t *free_fn, void *pool_data);
-extern int mempool_resize(mempool_t *pool, int new_min_nr, unsigned int __nocast gfp_mask);
+
+extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+			mempool_free_t *free_fn, void *pool_data);
+extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
+			mempool_free_t *free_fn, void *pool_data, int nid);
+
+extern int mempool_resize(mempool_t *pool, int new_min_nr,
+			unsigned int __nocast gfp_mask);
 extern void mempool_destroy(mempool_t *pool);
 extern void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask);
 extern void mempool_free(void *element, mempool_t *pool);
diff --git a/mm/mempool.c b/mm/mempool.c
index c9f3d4620428..920c8c3ab1b8 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -51,16 +51,23 @@ static void free_pool(mempool_t *pool)
  * functions might sleep - as long as the mempool_alloc function is not called
  * from IRQ contexts.
  */
-mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
+mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
 				mempool_free_t *free_fn, void *pool_data)
 {
-	mempool_t *pool;
+	return  mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1);
+}
+EXPORT_SYMBOL(mempool_create);
 
-	pool = kmalloc(sizeof(*pool), GFP_KERNEL);
+mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
+			mempool_free_t *free_fn, void *pool_data, int node_id)
+{
+	mempool_t *pool;
+	pool = kmalloc_node(sizeof(*pool), GFP_KERNEL, node_id);
 	if (!pool)
 		return NULL;
 	memset(pool, 0, sizeof(*pool));
-	pool->elements = kmalloc(min_nr * sizeof(void *), GFP_KERNEL);
+	pool->elements = kmalloc_node(min_nr * sizeof(void *),
+					GFP_KERNEL, node_id);
 	if (!pool->elements) {
 		kfree(pool);
 		return NULL;
@@ -87,7 +94,7 @@ mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
 	}
 	return pool;
 }
-EXPORT_SYMBOL(mempool_create);
+EXPORT_SYMBOL(mempool_create_node);
 
 /**
  * mempool_resize - resize an existing memory pool
-- 
cgit v1.2.3-59-g8ed1b


From fa72b903f75e4f0f0b2c2feed093005167da4023 Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 23 Jun 2005 00:08:49 -0700
Subject: [PATCH] blk: remove blk_queue_tag->real_max_depth optimization

blk_queue_tag->real_max_depth was used to optimize out unnecessary
allocations/frees on tag resize.  However, the whole thing was very broken:
tag_map was never allocated to real_max_depth, resulting in access beyond the
end of the map, and bits in [max_depth..real_max_depth] were set when
initializing a map and copied when resizing, resulting in pre-occupied tags.

As the gain of the optimization is very small, well, almost nil, remove the
whole thing.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/ll_rw_blk.c | 35 ++++++++++-------------------------
 include/linux/blkdev.h    |  1 -
 2 files changed, 10 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 808390c74200..896d17c28f42 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -717,7 +717,7 @@ struct request *blk_queue_find_tag(request_queue_t *q, int tag)
 {
 	struct blk_queue_tag *bqt = q->queue_tags;
 
-	if (unlikely(bqt == NULL || tag >= bqt->real_max_depth))
+	if (unlikely(bqt == NULL || tag >= bqt->max_depth))
 		return NULL;
 
 	return bqt->tag_index[tag];
@@ -775,9 +775,9 @@ EXPORT_SYMBOL(blk_queue_free_tags);
 static int
 init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
 {
-	int bits, i;
 	struct request **tag_index;
 	unsigned long *tag_map;
+	int nr_ulongs;
 
 	if (depth > q->nr_requests * 2) {
 		depth = q->nr_requests * 2;
@@ -789,24 +789,17 @@ init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
 	if (!tag_index)
 		goto fail;
 
-	bits = (depth / BLK_TAGS_PER_LONG) + 1;
-	tag_map = kmalloc(bits * sizeof(unsigned long), GFP_ATOMIC);
+	nr_ulongs = ALIGN(depth, BLK_TAGS_PER_LONG) / BLK_TAGS_PER_LONG;
+	tag_map = kmalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
 	if (!tag_map)
 		goto fail;
 
 	memset(tag_index, 0, depth * sizeof(struct request *));
-	memset(tag_map, 0, bits * sizeof(unsigned long));
+	memset(tag_map, 0, nr_ulongs * sizeof(unsigned long));
 	tags->max_depth = depth;
-	tags->real_max_depth = bits * BITS_PER_LONG;
 	tags->tag_index = tag_index;
 	tags->tag_map = tag_map;
 
-	/*
-	 * set the upper bits if the depth isn't a multiple of the word size
-	 */
-	for (i = depth; i < bits * BLK_TAGS_PER_LONG; i++)
-		__set_bit(i, tag_map);
-
 	return 0;
 fail:
 	kfree(tag_index);
@@ -871,32 +864,24 @@ int blk_queue_resize_tags(request_queue_t *q, int new_depth)
 	struct blk_queue_tag *bqt = q->queue_tags;
 	struct request **tag_index;
 	unsigned long *tag_map;
-	int bits, max_depth;
+	int max_depth, nr_ulongs;
 
 	if (!bqt)
 		return -ENXIO;
 
-	/*
-	 * don't bother sizing down
-	 */
-	if (new_depth <= bqt->real_max_depth) {
-		bqt->max_depth = new_depth;
-		return 0;
-	}
-
 	/*
 	 * save the old state info, so we can copy it back
 	 */
 	tag_index = bqt->tag_index;
 	tag_map = bqt->tag_map;
-	max_depth = bqt->real_max_depth;
+	max_depth = bqt->max_depth;
 
 	if (init_tag_map(q, bqt, new_depth))
 		return -ENOMEM;
 
 	memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
-	bits = max_depth / BLK_TAGS_PER_LONG;
-	memcpy(bqt->tag_map, tag_map, bits * sizeof(unsigned long));
+	nr_ulongs = ALIGN(max_depth, BLK_TAGS_PER_LONG) / BLK_TAGS_PER_LONG;
+	memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
 
 	kfree(tag_index);
 	kfree(tag_map);
@@ -926,7 +911,7 @@ void blk_queue_end_tag(request_queue_t *q, struct request *rq)
 
 	BUG_ON(tag == -1);
 
-	if (unlikely(tag >= bqt->real_max_depth))
+	if (unlikely(tag >= bqt->max_depth))
 		return;
 
 	if (unlikely(!__test_and_clear_bit(tag, bqt->tag_map))) {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 235c3414d268..8d7e2f4151d0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -294,7 +294,6 @@ struct blk_queue_tag {
 	struct list_head busy_list;	/* fifo list of busy tags */
 	int busy;			/* current depth */
 	int max_depth;			/* what we will send to device */
-	int real_max_depth;		/* what the array can hold */
 	atomic_t refcnt;		/* map can be shared */
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From f7d37d028dfba90b1b747f8ac685bf0959aeda8b Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Thu, 23 Jun 2005 00:08:50 -0700
Subject: [PATCH] blk: remove BLK_TAGS_{PER_LONG|MASK}

Replace BLK_TAGS_PER_LONG with BITS_PER_LONG and remove unused BLK_TAGS_MASK.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Acked-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/ll_rw_blk.c | 4 ++--
 include/linux/blkdev.h    | 3 ---
 2 files changed, 2 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 896d17c28f42..99afeec1031f 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -789,7 +789,7 @@ init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
 	if (!tag_index)
 		goto fail;
 
-	nr_ulongs = ALIGN(depth, BLK_TAGS_PER_LONG) / BLK_TAGS_PER_LONG;
+	nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
 	tag_map = kmalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
 	if (!tag_map)
 		goto fail;
@@ -880,7 +880,7 @@ int blk_queue_resize_tags(request_queue_t *q, int new_depth)
 		return -ENOMEM;
 
 	memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
-	nr_ulongs = ALIGN(max_depth, BLK_TAGS_PER_LONG) / BLK_TAGS_PER_LONG;
+	nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
 	memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
 
 	kfree(tag_index);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8d7e2f4151d0..60272141ff19 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -285,9 +285,6 @@ enum blk_queue_state {
 	Queue_up,
 };
 
-#define BLK_TAGS_PER_LONG	(sizeof(unsigned long) * 8)
-#define BLK_TAGS_MASK		(BLK_TAGS_PER_LONG - 1)
-
 struct blk_queue_tag {
 	struct request **tag_index;	/* map of busy tags */
 	unsigned long *tag_map;		/* bit map of free/busy tags */
-- 
cgit v1.2.3-59-g8ed1b


From 55c888d6d09a0df236adfaf8ccf06ff5d0646775 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Thu, 23 Jun 2005 00:08:56 -0700
Subject: [PATCH] timers fixes/improvements

This patch tries to solve the following problems:

1. del_timer_sync() is racy. The timer can be fired again after
   del_timer_sync() has checked all cpus and before it rechecks
   timer_pending().

2. It has scalability problems. All cpus are scanned to determine
   if the timer is running on that cpu.

   With this patch del_timer_sync is O(1) and no slower than plain
   del_timer(pending_timer), unless it has to actually wait for
   completion of the currently running timer.

   The only restriction is that the recurring timer should not use
   add_timer_on().

3. A timer is not serialized wrt itself.

   If CPU_0 does mod_timer(jiffies+1) while the timer is currently
   running on CPU 1, it is quite possible that local interrupt on
   CPU_0 will start that timer before it finished on CPU_1.

4. The timers locking is suboptimal. __mod_timer() takes 3 locks
   at once and still requires wmb() in del_timer/run_timers.

   The new implementation takes 2 locks sequentially and does not
   need memory barriers.

Currently ->base != NULL means that the timer is pending. In that case
->base.lock is used to lock the timer. __mod_timer also takes timer->lock
because ->base can be == NULL.

This patch uses timer->entry.next != NULL as indication that the timer is
pending. So it does __list_del(), entry->next = NULL instead of list_del()
when the timer is deleted.

The ->base field is used for hashed locking only, it is initialized
in init_timer() which sets ->base = per_cpu(tvec_bases). When the
tvec_bases.lock is locked, it means that all timers which are tied
to this base via timer->base are locked, and the base itself is locked
too.

So __run_timers/migrate_timers can safely modify all timers which could
be found on ->tvX lists (pending timers).

When the timer's base is locked, and the timer is removed from the ->entry list
(which means that __run_timers/migrate_timers can't see this timer), it is
possible to set timer->base = NULL and drop the lock: the timer remains
locked.

This patch adds lock_timer_base() helper, which waits for ->base != NULL,
locks the ->base, and checks it is still the same.

__mod_timer() schedules the timer on the local CPU and changes its base.
However, it does not lock both old and new bases at once. It locks the
timer via lock_timer_base(), deletes the timer, sets ->base = NULL, and
unlocks old base. Then __mod_timer() locks new_base, sets ->base = new_base,
and adds this timer. This simplifies the code, because AB-BA deadlock is not
possible. __mod_timer() also ensures that the timer's base is not changed
while the timer's handler is running on the old base.

__run_timers(), del_timer() do not change ->base anymore, they only clear
pending flag.

So del_timer_sync() can test timer->base->running_timer == timer to detect
whether it is running or not.

We don't need timer_list->lock anymore, this patch kills it.

We also don't need barriers. del_timer() and __run_timers() used smp_wmb()
before clearing timer's pending flag. It was needed because __mod_timer()
did not lock old_base if the timer is not pending, so __mod_timer()->list_add()
could race with del_timer()->list_del(). With this patch these functions are
serialized through base->lock.

One problem. TIMER_INITIALIZER can't use per_cpu(tvec_bases). So this patch
adds global

        struct timer_base_s {
                spinlock_t lock;
                struct timer_list *running_timer;
        } __init_timer_base;

which is used by TIMER_INITIALIZER. The corresponding fields in tvec_t_base_s
struct are replaced by struct timer_base_s t_base.

It is indeed ugly. But this can't have scalability problems. The global
__init_timer_base.lock is used only when __mod_timer() is called for the first
time AND the timer was compile time initialized. After that the timer migrates
to the local CPU.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Renaud Lienhart <renaud.lienhart@free.fr>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timer.h |  30 ++---
 kernel/timer.c        | 328 ++++++++++++++++++++++++--------------------------
 2 files changed, 166 insertions(+), 192 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 90db1cc62ddd..2e78fedfc069 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -6,45 +6,33 @@
 #include <linux/spinlock.h>
 #include <linux/stddef.h>
 
-struct tvec_t_base_s;
+struct timer_base_s;
 
 struct timer_list {
 	struct list_head entry;
 	unsigned long expires;
 
-	spinlock_t lock;
 	unsigned long magic;
 
 	void (*function)(unsigned long);
 	unsigned long data;
 
-	struct tvec_t_base_s *base;
+	struct timer_base_s *base;
 };
 
 #define TIMER_MAGIC	0x4b87ad6e
 
+extern struct timer_base_s __init_timer_base;
+
 #define TIMER_INITIALIZER(_function, _expires, _data) {		\
 		.function = (_function),			\
 		.expires = (_expires),				\
 		.data = (_data),				\
-		.base = NULL,					\
+		.base = &__init_timer_base,			\
 		.magic = TIMER_MAGIC,				\
-		.lock = SPIN_LOCK_UNLOCKED,			\
 	}
 
-/***
- * init_timer - initialize a timer.
- * @timer: the timer to be initialized
- *
- * init_timer() must be done to a timer prior calling *any* of the
- * other timer functions.
- */
-static inline void init_timer(struct timer_list * timer)
-{
-	timer->base = NULL;
-	timer->magic = TIMER_MAGIC;
-	spin_lock_init(&timer->lock);
-}
+void fastcall init_timer(struct timer_list * timer);
 
 /***
  * timer_pending - is a timer pending?
@@ -58,7 +46,7 @@ static inline void init_timer(struct timer_list * timer)
  */
 static inline int timer_pending(const struct timer_list * timer)
 {
-	return timer->base != NULL;
+	return timer->entry.next != NULL;
 }
 
 extern void add_timer_on(struct timer_list *timer, int cpu);
@@ -89,12 +77,12 @@ static inline void add_timer(struct timer_list * timer)
 
 #ifdef CONFIG_SMP
   extern int del_timer_sync(struct timer_list *timer);
-  extern int del_singleshot_timer_sync(struct timer_list *timer);
 #else
 # define del_timer_sync(t) del_timer(t)
-# define del_singleshot_timer_sync(t) del_timer(t)
 #endif
 
+#define del_singleshot_timer_sync(t) del_timer_sync(t)
+
 extern void init_timers(void);
 extern void run_local_timers(void);
 extern void it_real_fn(unsigned long);
diff --git a/kernel/timer.c b/kernel/timer.c
index 207aa4f0aa10..8aadc62efd65 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -57,6 +57,11 @@ static void time_interpolator_update(long delta_nsec);
 #define TVN_MASK (TVN_SIZE - 1)
 #define TVR_MASK (TVR_SIZE - 1)
 
+struct timer_base_s {
+	spinlock_t lock;
+	struct timer_list *running_timer;
+};
+
 typedef struct tvec_s {
 	struct list_head vec[TVN_SIZE];
 } tvec_t;
@@ -66,9 +71,8 @@ typedef struct tvec_root_s {
 } tvec_root_t;
 
 struct tvec_t_base_s {
-	spinlock_t lock;
+	struct timer_base_s t_base;
 	unsigned long timer_jiffies;
-	struct timer_list *running_timer;
 	tvec_root_t tv1;
 	tvec_t tv2;
 	tvec_t tv3;
@@ -77,18 +81,16 @@ struct tvec_t_base_s {
 } ____cacheline_aligned_in_smp;
 
 typedef struct tvec_t_base_s tvec_base_t;
+static DEFINE_PER_CPU(tvec_base_t, tvec_bases);
 
 static inline void set_running_timer(tvec_base_t *base,
 					struct timer_list *timer)
 {
 #ifdef CONFIG_SMP
-	base->running_timer = timer;
+	base->t_base.running_timer = timer;
 #endif
 }
 
-/* Fake initialization */
-static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED };
-
 static void check_timer_failed(struct timer_list *timer)
 {
 	static int whine_count;
@@ -103,7 +105,6 @@ static void check_timer_failed(struct timer_list *timer)
 	/*
 	 * Now fix it up
 	 */
-	spin_lock_init(&timer->lock);
 	timer->magic = TIMER_MAGIC;
 }
 
@@ -156,65 +157,113 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
 	list_add_tail(&timer->entry, vec);
 }
 
+typedef struct timer_base_s timer_base_t;
+/*
+ * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
+ * at compile time, and we need timer->base to lock the timer.
+ */
+timer_base_t __init_timer_base
+	____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
+EXPORT_SYMBOL(__init_timer_base);
+
+/***
+ * init_timer - initialize a timer.
+ * @timer: the timer to be initialized
+ *
+ * init_timer() must be done to a timer prior calling *any* of the
+ * other timer functions.
+ */
+void fastcall init_timer(struct timer_list *timer)
+{
+	timer->entry.next = NULL;
+	timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
+	timer->magic = TIMER_MAGIC;
+}
+EXPORT_SYMBOL(init_timer);
+
+static inline void detach_timer(struct timer_list *timer,
+					int clear_pending)
+{
+	struct list_head *entry = &timer->entry;
+
+	__list_del(entry->prev, entry->next);
+	if (clear_pending)
+		entry->next = NULL;
+	entry->prev = LIST_POISON2;
+}
+
+/*
+ * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock
+ * means that all timers which are tied to this base via timer->base are
+ * locked, and the base itself is locked too.
+ *
+ * So __run_timers/migrate_timers can safely modify all timers which could
+ * be found on ->tvX lists.
+ *
+ * When the timer's base is locked, and the timer removed from list, it is
+ * possible to set timer->base = NULL and drop the lock: the timer remains
+ * locked.
+ */
+static timer_base_t *lock_timer_base(struct timer_list *timer,
+					unsigned long *flags)
+{
+	timer_base_t *base;
+
+	for (;;) {
+		base = timer->base;
+		if (likely(base != NULL)) {
+			spin_lock_irqsave(&base->lock, *flags);
+			if (likely(base == timer->base))
+				return base;
+			/* The timer has migrated to another CPU */
+			spin_unlock_irqrestore(&base->lock, *flags);
+		}
+		cpu_relax();
+	}
+}
+
 int __mod_timer(struct timer_list *timer, unsigned long expires)
 {
-	tvec_base_t *old_base, *new_base;
+	timer_base_t *base;
+	tvec_base_t *new_base;
 	unsigned long flags;
 	int ret = 0;
 
 	BUG_ON(!timer->function);
-
 	check_timer(timer);
 
-	spin_lock_irqsave(&timer->lock, flags);
+	base = lock_timer_base(timer, &flags);
+
+	if (timer_pending(timer)) {
+		detach_timer(timer, 0);
+		ret = 1;
+	}
+
 	new_base = &__get_cpu_var(tvec_bases);
-repeat:
-	old_base = timer->base;
 
-	/*
-	 * Prevent deadlocks via ordering by old_base < new_base.
-	 */
-	if (old_base && (new_base != old_base)) {
-		if (old_base < new_base) {
-			spin_lock(&new_base->lock);
-			spin_lock(&old_base->lock);
-		} else {
-			spin_lock(&old_base->lock);
-			spin_lock(&new_base->lock);
-		}
+	if (base != &new_base->t_base) {
 		/*
-		 * The timer base might have been cancelled while we were
-		 * trying to take the lock(s):
+		 * We are trying to schedule the timer on the local CPU.
+		 * However we can't change timer's base while it is running,
+		 * otherwise del_timer_sync() can't detect that the timer's
+		 * handler yet has not finished. This also guarantees that
+		 * the timer is serialized wrt itself.
 		 */
-		if (timer->base != old_base) {
-			spin_unlock(&new_base->lock);
-			spin_unlock(&old_base->lock);
-			goto repeat;
-		}
-	} else {
-		spin_lock(&new_base->lock);
-		if (timer->base != old_base) {
-			spin_unlock(&new_base->lock);
-			goto repeat;
+		if (unlikely(base->running_timer == timer)) {
+			/* The timer remains on a former base */
+			new_base = container_of(base, tvec_base_t, t_base);
+		} else {
+			/* See the comment in lock_timer_base() */
+			timer->base = NULL;
+			spin_unlock(&base->lock);
+			spin_lock(&new_base->t_base.lock);
+			timer->base = &new_base->t_base;
 		}
 	}
 
-	/*
-	 * Delete the previous timeout (if there was any), and install
-	 * the new one:
-	 */
-	if (old_base) {
-		list_del(&timer->entry);
-		ret = 1;
-	}
 	timer->expires = expires;
 	internal_add_timer(new_base, timer);
-	timer->base = new_base;
-
-	if (old_base && (new_base != old_base))
-		spin_unlock(&old_base->lock);
-	spin_unlock(&new_base->lock);
-	spin_unlock_irqrestore(&timer->lock, flags);
+	spin_unlock_irqrestore(&new_base->t_base.lock, flags);
 
 	return ret;
 }
@@ -232,15 +281,15 @@ void add_timer_on(struct timer_list *timer, int cpu)
 {
 	tvec_base_t *base = &per_cpu(tvec_bases, cpu);
   	unsigned long flags;
-  
+
   	BUG_ON(timer_pending(timer) || !timer->function);
 
 	check_timer(timer);
 
-	spin_lock_irqsave(&base->lock, flags);
+	spin_lock_irqsave(&base->t_base.lock, flags);
+	timer->base = &base->t_base;
 	internal_add_timer(base, timer);
-	timer->base = base;
-	spin_unlock_irqrestore(&base->lock, flags);
+	spin_unlock_irqrestore(&base->t_base.lock, flags);
 }
 
 
@@ -295,27 +344,22 @@ EXPORT_SYMBOL(mod_timer);
  */
 int del_timer(struct timer_list *timer)
 {
+	timer_base_t *base;
 	unsigned long flags;
-	tvec_base_t *base;
+	int ret = 0;
 
 	check_timer(timer);
 
-repeat:
- 	base = timer->base;
-	if (!base)
-		return 0;
-	spin_lock_irqsave(&base->lock, flags);
-	if (base != timer->base) {
+	if (timer_pending(timer)) {
+		base = lock_timer_base(timer, &flags);
+		if (timer_pending(timer)) {
+			detach_timer(timer, 1);
+			ret = 1;
+		}
 		spin_unlock_irqrestore(&base->lock, flags);
-		goto repeat;
 	}
-	list_del(&timer->entry);
-	/* Need to make sure that anybody who sees a NULL base also sees the list ops */
-	smp_wmb();
-	timer->base = NULL;
-	spin_unlock_irqrestore(&base->lock, flags);
 
-	return 1;
+	return ret;
 }
 
 EXPORT_SYMBOL(del_timer);
@@ -332,72 +376,39 @@ EXPORT_SYMBOL(del_timer);
  * Synchronization rules: callers must prevent restarting of the timer,
  * otherwise this function is meaningless. It must not be called from
  * interrupt contexts. The caller must not hold locks which would prevent
- * completion of the timer's handler.  Upon exit the timer is not queued and
- * the handler is not running on any CPU.
+ * completion of the timer's handler. The timer's handler must not call
+ * add_timer_on(). Upon exit the timer is not queued and the handler is
+ * not running on any CPU.
  *
  * The function returns whether it has deactivated a pending timer or not.
- *
- * del_timer_sync() is slow and complicated because it copes with timer
- * handlers which re-arm the timer (periodic timers).  If the timer handler
- * is known to not do this (a single shot timer) then use
- * del_singleshot_timer_sync() instead.
  */
 int del_timer_sync(struct timer_list *timer)
 {
-	tvec_base_t *base;
-	int i, ret = 0;
+	timer_base_t *base;
+	unsigned long flags;
+	int ret = -1;
 
 	check_timer(timer);
 
-del_again:
-	ret += del_timer(timer);
+	do {
+		base = lock_timer_base(timer, &flags);
 
-	for_each_online_cpu(i) {
-		base = &per_cpu(tvec_bases, i);
-		if (base->running_timer == timer) {
-			while (base->running_timer == timer) {
-				cpu_relax();
-				preempt_check_resched();
-			}
-			break;
+		if (base->running_timer == timer)
+			goto unlock;
+
+		ret = 0;
+		if (timer_pending(timer)) {
+			detach_timer(timer, 1);
+			ret = 1;
 		}
-	}
-	smp_rmb();
-	if (timer_pending(timer))
-		goto del_again;
+unlock:
+		spin_unlock_irqrestore(&base->lock, flags);
+	} while (ret < 0);
 
 	return ret;
 }
-EXPORT_SYMBOL(del_timer_sync);
 
-/***
- * del_singleshot_timer_sync - deactivate a non-recursive timer
- * @timer: the timer to be deactivated
- *
- * This function is an optimization of del_timer_sync for the case where the
- * caller can guarantee the timer does not reschedule itself in its timer
- * function.
- *
- * Synchronization rules: callers must prevent restarting of the timer,
- * otherwise this function is meaningless. It must not be called from
- * interrupt contexts. The caller must not hold locks which wold prevent
- * completion of the timer's handler.  Upon exit the timer is not queued and
- * the handler is not running on any CPU.
- *
- * The function returns whether it has deactivated a pending timer or not.
- */
-int del_singleshot_timer_sync(struct timer_list *timer)
-{
-	int ret = del_timer(timer);
-
-	if (!ret) {
-		ret = del_timer_sync(timer);
-		BUG_ON(ret);
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL(del_singleshot_timer_sync);
+EXPORT_SYMBOL(del_timer_sync);
 #endif
 
 static int cascade(tvec_base_t *base, tvec_t *tv, int index)
@@ -415,7 +426,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
 		struct timer_list *tmp;
 
 		tmp = list_entry(curr, struct timer_list, entry);
-		BUG_ON(tmp->base != base);
+		BUG_ON(tmp->base != &base->t_base);
 		curr = curr->next;
 		internal_add_timer(base, tmp);
 	}
@@ -437,7 +448,7 @@ static inline void __run_timers(tvec_base_t *base)
 {
 	struct timer_list *timer;
 
-	spin_lock_irq(&base->lock);
+	spin_lock_irq(&base->t_base.lock);
 	while (time_after_eq(jiffies, base->timer_jiffies)) {
 		struct list_head work_list = LIST_HEAD_INIT(work_list);
 		struct list_head *head = &work_list;
@@ -453,8 +464,7 @@ static inline void __run_timers(tvec_base_t *base)
 			cascade(base, &base->tv5, INDEX(3));
 		++base->timer_jiffies; 
 		list_splice_init(base->tv1.vec + index, &work_list);
-repeat:
-		if (!list_empty(head)) {
+		while (!list_empty(head)) {
 			void (*fn)(unsigned long);
 			unsigned long data;
 
@@ -462,11 +472,9 @@ repeat:
  			fn = timer->function;
  			data = timer->data;
 
-			list_del(&timer->entry);
 			set_running_timer(base, timer);
-			smp_wmb();
-			timer->base = NULL;
-			spin_unlock_irq(&base->lock);
+			detach_timer(timer, 1);
+			spin_unlock_irq(&base->t_base.lock);
 			{
 				u32 preempt_count = preempt_count();
 				fn(data);
@@ -475,12 +483,11 @@ repeat:
 					BUG();
 				}
 			}
-			spin_lock_irq(&base->lock);
-			goto repeat;
+			spin_lock_irq(&base->t_base.lock);
 		}
 	}
 	set_running_timer(base, NULL);
-	spin_unlock_irq(&base->lock);
+	spin_unlock_irq(&base->t_base.lock);
 }
 
 #ifdef CONFIG_NO_IDLE_HZ
@@ -499,7 +506,7 @@ unsigned long next_timer_interrupt(void)
 	int i, j;
 
 	base = &__get_cpu_var(tvec_bases);
-	spin_lock(&base->lock);
+	spin_lock(&base->t_base.lock);
 	expires = base->timer_jiffies + (LONG_MAX >> 1);
 	list = 0;
 
@@ -547,7 +554,7 @@ found:
 				expires = nte->expires;
 		}
 	}
-	spin_unlock(&base->lock);
+	spin_unlock(&base->t_base.lock);
 	return expires;
 }
 #endif
@@ -1286,9 +1293,9 @@ static void __devinit init_timers_cpu(int cpu)
 {
 	int j;
 	tvec_base_t *base;
-       
+
 	base = &per_cpu(tvec_bases, cpu);
-	spin_lock_init(&base->lock);
+	spin_lock_init(&base->t_base.lock);
 	for (j = 0; j < TVN_SIZE; j++) {
 		INIT_LIST_HEAD(base->tv5.vec + j);
 		INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1302,22 +1309,16 @@ static void __devinit init_timers_cpu(int cpu)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
+static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
 {
 	struct timer_list *timer;
 
 	while (!list_empty(head)) {
 		timer = list_entry(head->next, struct timer_list, entry);
-		/* We're locking backwards from __mod_timer order here,
-		   beware deadlock. */
-		if (!spin_trylock(&timer->lock))
-			return 0;
-		list_del(&timer->entry);
+		detach_timer(timer, 0);
+		timer->base = &new_base->t_base;
 		internal_add_timer(new_base, timer);
-		timer->base = new_base;
-		spin_unlock(&timer->lock);
 	}
-	return 1;
 }
 
 static void __devinit migrate_timers(int cpu)
@@ -1331,39 +1332,24 @@ static void __devinit migrate_timers(int cpu)
 	new_base = &get_cpu_var(tvec_bases);
 
 	local_irq_disable();
-again:
-	/* Prevent deadlocks via ordering by old_base < new_base. */
-	if (old_base < new_base) {
-		spin_lock(&new_base->lock);
-		spin_lock(&old_base->lock);
-	} else {
-		spin_lock(&old_base->lock);
-		spin_lock(&new_base->lock);
-	}
+	spin_lock(&new_base->t_base.lock);
+	spin_lock(&old_base->t_base.lock);
 
-	if (old_base->running_timer)
+	if (old_base->t_base.running_timer)
 		BUG();
 	for (i = 0; i < TVR_SIZE; i++)
-		if (!migrate_timer_list(new_base, old_base->tv1.vec + i))
-			goto unlock_again;
-	for (i = 0; i < TVN_SIZE; i++)
-		if (!migrate_timer_list(new_base, old_base->tv2.vec + i)
-		    || !migrate_timer_list(new_base, old_base->tv3.vec + i)
-		    || !migrate_timer_list(new_base, old_base->tv4.vec + i)
-		    || !migrate_timer_list(new_base, old_base->tv5.vec + i))
-			goto unlock_again;
-	spin_unlock(&old_base->lock);
-	spin_unlock(&new_base->lock);
+		migrate_timer_list(new_base, old_base->tv1.vec + i);
+	for (i = 0; i < TVN_SIZE; i++) {
+		migrate_timer_list(new_base, old_base->tv2.vec + i);
+		migrate_timer_list(new_base, old_base->tv3.vec + i);
+		migrate_timer_list(new_base, old_base->tv4.vec + i);
+		migrate_timer_list(new_base, old_base->tv5.vec + i);
+	}
+
+	spin_unlock(&old_base->t_base.lock);
+	spin_unlock(&new_base->t_base.lock);
 	local_irq_enable();
 	put_cpu_var(tvec_bases);
-	return;
-
-unlock_again:
-	/* Avoid deadlock with __mod_timer, by backing off. */
-	spin_unlock(&old_base->lock);
-	spin_unlock(&new_base->lock);
-	cpu_relax();
-	goto again;
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-- 
cgit v1.2.3-59-g8ed1b


From fd450b7318b75343fd76b3d95416853e34e72c95 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Thu, 23 Jun 2005 00:08:59 -0700
Subject: [PATCH] timers: introduce try_to_del_timer_sync()

This patch splits del_timer_sync() into 2 functions.  The new one,
try_to_del_timer_sync(), returns -1 when it hits an executing timer.

It can be used in interrupt context, or when the caller holds locks which
can prevent completion of the timer's handler.

NOTE.  Currently it can't be used in interrupt context in the UP case,
because ->running_timer is used only with CONFIG_SMP.

Should the need arise, it is possible to kill the #ifdef CONFIG_SMP in
set_running_timer(); it is cheap.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timer.h |  4 +++-
 kernel/timer.c        | 53 ++++++++++++++++++++++++++++++++-------------------
 2 files changed, 36 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 2e78fedfc069..221f81ac2002 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -76,9 +76,11 @@ static inline void add_timer(struct timer_list * timer)
 }
 
 #ifdef CONFIG_SMP
+  extern int try_to_del_timer_sync(struct timer_list *timer);
   extern int del_timer_sync(struct timer_list *timer);
 #else
-# define del_timer_sync(t) del_timer(t)
+# define try_to_del_timer_sync(t)	del_timer(t)
+# define del_timer_sync(t)		del_timer(t)
 #endif
 
 #define del_singleshot_timer_sync(t) del_timer_sync(t)
diff --git a/kernel/timer.c b/kernel/timer.c
index 8aadc62efd65..1f986c16d89f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -365,6 +365,34 @@ int del_timer(struct timer_list *timer)
 EXPORT_SYMBOL(del_timer);
 
 #ifdef CONFIG_SMP
+/*
+ * This function tries to deactivate a timer. Upon successful (ret >= 0)
+ * exit the timer is not queued and the handler is not running on any CPU.
+ *
+ * It must not be called from interrupt contexts.
+ */
+int try_to_del_timer_sync(struct timer_list *timer)
+{
+	timer_base_t *base;
+	unsigned long flags;
+	int ret = -1;
+
+	base = lock_timer_base(timer, &flags);
+
+	if (base->running_timer == timer)
+		goto out;
+
+	ret = 0;
+	if (timer_pending(timer)) {
+		detach_timer(timer, 1);
+		ret = 1;
+	}
+out:
+	spin_unlock_irqrestore(&base->lock, flags);
+
+	return ret;
+}
+
 /***
  * del_timer_sync - deactivate a timer and wait for the handler to finish.
  * @timer: the timer to be deactivated
@@ -384,28 +412,13 @@ EXPORT_SYMBOL(del_timer);
  */
 int del_timer_sync(struct timer_list *timer)
 {
-	timer_base_t *base;
-	unsigned long flags;
-	int ret = -1;
-
 	check_timer(timer);
 
-	do {
-		base = lock_timer_base(timer, &flags);
-
-		if (base->running_timer == timer)
-			goto unlock;
-
-		ret = 0;
-		if (timer_pending(timer)) {
-			detach_timer(timer, 1);
-			ret = 1;
-		}
-unlock:
-		spin_unlock_irqrestore(&base->lock, flags);
-	} while (ret < 0);
-
-	return ret;
+	for (;;) {
+		int ret = try_to_del_timer_sync(timer);
+		if (ret >= 0)
+			return ret;
+	}
 }
 
 EXPORT_SYMBOL(del_timer_sync);
-- 
cgit v1.2.3-59-g8ed1b


From 991114c6fa6a21d1fa4d544abe78592352860c82 Mon Sep 17 00:00:00 2001
From: Alexander Viro <aviro@redhat.com>
Date: Thu, 23 Jun 2005 00:09:01 -0700
Subject: [PATCH] fix for prune_icache()/forced final iput() races

Based on analysis and a patch from Russ Weight <rweight@us.ibm.com>

There is a race condition that can occur if an inode is allocated and then
released (using iput) during the ->fill_super functions.  The race
condition is between kswapd and mount.

For most filesystems this can only happen in an error path when kswapd is
running concurrently.  For isofs, however, the error can occur in a more
common code path (which is how the bug was found).

The logic here is "we want final iput() to free inode *now* instead of
letting it sit in cache if fs is going down or had not quite come up".  The
problem is with kswapd seeing such inodes in the middle of being killed and
happily taking over.

The clean solution would be to tell kswapd to leave those inodes alone and
let our final iput deal with them.  I.e.  add a new flag
(I_FORCED_FREEING; it is named I_WILL_FREE in the patch below), set it
before write_inode_now() there and make prune_icache() leave those alone.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inode.c         | 16 ++++++++++------
 include/linux/fs.h |  1 +
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 801fe7f36280..1f9a3a2b89bc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -500,7 +500,7 @@ repeat:
 			continue;
 		if (!test(inode, data))
 			continue;
-		if (inode->i_state & (I_FREEING|I_CLEAR)) {
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode);
 			goto repeat;
 		}
@@ -525,7 +525,7 @@ repeat:
 			continue;
 		if (inode->i_sb != sb)
 			continue;
-		if (inode->i_state & (I_FREEING|I_CLEAR)) {
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE)) {
 			__wait_on_freeing_inode(inode);
 			goto repeat;
 		}
@@ -727,7 +727,7 @@ EXPORT_SYMBOL(iunique);
 struct inode *igrab(struct inode *inode)
 {
 	spin_lock(&inode_lock);
-	if (!(inode->i_state & I_FREEING))
+	if (!(inode->i_state & (I_FREEING|I_WILL_FREE)))
 		__iget(inode);
 	else
 		/*
@@ -1024,17 +1024,21 @@ static void generic_forget_inode(struct inode *inode)
 		if (!(inode->i_state & (I_DIRTY|I_LOCK)))
 			list_move(&inode->i_list, &inode_unused);
 		inodes_stat.nr_unused++;
-		spin_unlock(&inode_lock);
-		if (!sb || (sb->s_flags & MS_ACTIVE))
+		if (!sb || (sb->s_flags & MS_ACTIVE)) {
+			spin_unlock(&inode_lock);
 			return;
+		}
+		inode->i_state |= I_WILL_FREE;
+		spin_unlock(&inode_lock);
 		write_inode_now(inode, 1);
 		spin_lock(&inode_lock);
+		inode->i_state &= ~I_WILL_FREE;
 		inodes_stat.nr_unused--;
 		hlist_del_init(&inode->i_hash);
 	}
 	list_del_init(&inode->i_list);
 	list_del_init(&inode->i_sb_list);
-	inode->i_state|=I_FREEING;
+	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
 	if (inode->i_data.nrpages)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e5a8db00df29..3622e952e98c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1025,6 +1025,7 @@ struct super_operations {
 #define I_FREEING		16
 #define I_CLEAR			32
 #define I_NEW			64
+#define I_WILL_FREE		128
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 
-- 
cgit v1.2.3-59-g8ed1b


From 543537bd922692bc978e2e356fcd8bfc9c2ee7d5 Mon Sep 17 00:00:00 2001
From: Paulo Marques <pmarques@grupopie.com>
Date: Thu, 23 Jun 2005 00:09:02 -0700
Subject: [PATCH] create a kstrdup library function

This patch creates a new kstrdup library function and changes the "local"
implementations in several places to use this function.

Most of the changes come from the sound and net subsystems.  The sound part
had already been acknowledged by Takashi Iwai and the net part by David S.
Miller.

I left UML alone for now because I would need more time to read the code
carefully before making changes there.

Signed-off-by: Paulo Marques <pmarques@grupopie.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm-ioctl.c         | 14 +++-----------
 drivers/parport/probe.c       | 18 +++++-------------
 include/linux/netdevice.h     |  4 ----
 include/linux/string.h        |  2 ++
 include/sound/core.h          |  3 ++-
 mm/slab.c                     | 24 ++++++++++++++++++++++++
 net/core/neighbour.c          |  3 ++-
 net/core/sysctl_net_core.c    | 15 ---------------
 net/ipv4/devinet.c            |  2 +-
 net/ipv6/addrconf.c           |  3 ++-
 net/sunrpc/svcauth_unix.c     | 11 ++---------
 sound/core/info.c             |  3 ++-
 sound/core/info_oss.c         |  3 ++-
 sound/core/memory.c           | 41 ++++++++++++++---------------------------
 sound/core/oss/mixer_oss.c    |  3 ++-
 sound/core/oss/pcm_oss.c      |  3 ++-
 sound/core/sound.c            |  2 +-
 sound/core/timer.c            |  3 ++-
 sound/isa/gus/gus_mem.c       |  7 ++++---
 sound/pci/hda/patch_realtek.c |  2 +-
 sound/synth/emux/emux.c       |  3 ++-
 21 files changed, 75 insertions(+), 94 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index ee3c869d9701..200a0688f717 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -122,14 +122,6 @@ static struct hash_cell *__get_uuid_cell(const char *str)
 /*-----------------------------------------------------------------
  * Inserting, removing and renaming a device.
  *---------------------------------------------------------------*/
-static inline char *kstrdup(const char *str)
-{
-	char *r = kmalloc(strlen(str) + 1, GFP_KERNEL);
-	if (r)
-		strcpy(r, str);
-	return r;
-}
-
 static struct hash_cell *alloc_cell(const char *name, const char *uuid,
 				    struct mapped_device *md)
 {
@@ -139,7 +131,7 @@ static struct hash_cell *alloc_cell(const char *name, const char *uuid,
 	if (!hc)
 		return NULL;
 
-	hc->name = kstrdup(name);
+	hc->name = kstrdup(name, GFP_KERNEL);
 	if (!hc->name) {
 		kfree(hc);
 		return NULL;
@@ -149,7 +141,7 @@ static struct hash_cell *alloc_cell(const char *name, const char *uuid,
 		hc->uuid = NULL;
 
 	else {
-		hc->uuid = kstrdup(uuid);
+		hc->uuid = kstrdup(uuid, GFP_KERNEL);
 		if (!hc->uuid) {
 			kfree(hc->name);
 			kfree(hc);
@@ -273,7 +265,7 @@ static int dm_hash_rename(const char *old, const char *new)
 	/*
 	 * duplicate new.
 	 */
-	new_name = kstrdup(new);
+	new_name = kstrdup(new, GFP_KERNEL);
 	if (!new_name)
 		return -ENOMEM;
 
diff --git a/drivers/parport/probe.c b/drivers/parport/probe.c
index c94963145e17..6e6f42d01e64 100644
--- a/drivers/parport/probe.c
+++ b/drivers/parport/probe.c
@@ -48,14 +48,6 @@ static void pretty_print(struct parport *port, int device)
 	printk("\n");
 }
 
-static char *strdup(char *str)
-{
-	int n = strlen(str)+1;
-	char *s = kmalloc(n, GFP_KERNEL);
-	if (!s) return NULL;
-	return strcpy(s, str);
-}
-
 static void parse_data(struct parport *port, int device, char *str)
 {
 	char *txt = kmalloc(strlen(str)+1, GFP_KERNEL);
@@ -88,16 +80,16 @@ static void parse_data(struct parport *port, int device, char *str)
 			if (!strcmp(p, "MFG") || !strcmp(p, "MANUFACTURER")) {
 				if (info->mfr)
 					kfree (info->mfr);
-				info->mfr = strdup(sep);
+				info->mfr = kstrdup(sep, GFP_KERNEL);
 			} else if (!strcmp(p, "MDL") || !strcmp(p, "MODEL")) {
 				if (info->model)
 					kfree (info->model);
-				info->model = strdup(sep);
+				info->model = kstrdup(sep, GFP_KERNEL);
 			} else if (!strcmp(p, "CLS") || !strcmp(p, "CLASS")) {
 				int i;
 				if (info->class_name)
 					kfree (info->class_name);
-				info->class_name = strdup(sep);
+				info->class_name = kstrdup(sep, GFP_KERNEL);
 				for (u = sep; *u; u++)
 					*u = toupper(*u);
 				for (i = 0; classes[i].token; i++) {
@@ -112,7 +104,7 @@ static void parse_data(struct parport *port, int device, char *str)
 				   !strcmp(p, "COMMAND SET")) {
 				if (info->cmdset)
 					kfree (info->cmdset);
-				info->cmdset = strdup(sep);
+				info->cmdset = kstrdup(sep, GFP_KERNEL);
 				/* if it speaks printer language, it's
 				   probably a printer */
 				if (strstr(sep, "PJL") || strstr(sep, "PCL"))
@@ -120,7 +112,7 @@ static void parse_data(struct parport *port, int device, char *str)
 			} else if (!strcmp(p, "DES") || !strcmp(p, "DESCRIPTION")) {
 				if (info->description)
 					kfree (info->description);
-				info->description = strdup(sep);
+				info->description = kstrdup(sep, GFP_KERNEL);
 			}
 		}
 	rock_on:
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d6afd440cf7b..d89816ad642f 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -925,10 +925,6 @@ extern int skb_checksum_help(struct sk_buff *skb, int inward);
 extern void		net_enable_timestamp(void);
 extern void		net_disable_timestamp(void);
 
-#ifdef CONFIG_SYSCTL
-extern char *net_sysctl_strdup(const char *s);
-#endif
-
 #endif /* __KERNEL__ */
 
 #endif	/* _LINUX_DEV_H */
diff --git a/include/linux/string.h b/include/linux/string.h
index b9fc59469956..93994c613095 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -88,6 +88,8 @@ extern int memcmp(const void *,const void *,__kernel_size_t);
 extern void * memchr(const void *,int,__kernel_size_t);
 #endif
 
+extern char *kstrdup(const char *s, int gfp);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/sound/core.h b/include/sound/core.h
index 9117c23e3a01..f8c4ef0aa352 100644
--- a/include/sound/core.h
+++ b/include/sound/core.h
@@ -292,6 +292,7 @@ void *snd_hidden_kcalloc(size_t n, size_t size, int flags);
 void snd_hidden_kfree(const void *obj);
 void *snd_hidden_vmalloc(unsigned long size);
 void snd_hidden_vfree(void *obj);
+char *snd_hidden_kstrdup(const char *s, int flags);
 #define kmalloc(size, flags) snd_hidden_kmalloc(size, flags)
 #define kcalloc(n, size, flags) snd_hidden_kcalloc(n, size, flags)
 #define kfree(obj) snd_hidden_kfree(obj)
@@ -301,6 +302,7 @@ void snd_hidden_vfree(void *obj);
 #define vmalloc_nocheck(size) snd_wrapper_vmalloc(size)
 #define kfree_nocheck(obj) snd_wrapper_kfree(obj)
 #define vfree_nocheck(obj) snd_wrapper_vfree(obj)
+#define kstrdup(s, flags)  snd_hidden_kstrdup(s, flags)
 #else
 #define snd_memory_init() /*NOP*/
 #define snd_memory_done() /*NOP*/
@@ -311,7 +313,6 @@ void snd_hidden_vfree(void *obj);
 #define kfree_nocheck(obj) kfree(obj)
 #define vfree_nocheck(obj) vfree(obj)
 #endif
-char *snd_kmalloc_strdup(const char *string, int flags);
 int copy_to_user_fromio(void __user *dst, const volatile void __iomem *src, size_t count);
 int copy_from_user_toio(volatile void __iomem *dst, const void __user *src, size_t count);
 
diff --git a/mm/slab.c b/mm/slab.c
index 93cbbbb39f42..122d031baab2 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -92,6 +92,7 @@
 #include	<linux/sysctl.h>
 #include	<linux/module.h>
 #include	<linux/rcupdate.h>
+#include	<linux/string.h>
 
 #include	<asm/uaccess.h>
 #include	<asm/cacheflush.h>
@@ -3082,3 +3083,26 @@ unsigned int ksize(const void *objp)
 
 	return size;
 }
+
+
+/*
+ * kstrdup - allocate space for and copy an existing string
+ *
+ * @s: the string to duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ */
+char *kstrdup(const char *s, int gfp)
+{
+	size_t len;
+	char *buf;
+
+	if (!s)
+		return NULL;
+
+	len = strlen(s) + 1;
+	buf = kmalloc(len, gfp);
+	if (buf)
+		memcpy(buf, s, len);
+	return buf;
+}
+EXPORT_SYMBOL(kstrdup);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index f6bdcad47da6..851eb927ed97 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -32,6 +32,7 @@
 #include <net/sock.h>
 #include <linux/rtnetlink.h>
 #include <linux/random.h>
+#include <linux/string.h>
 
 #define NEIGH_DEBUG 1
 
@@ -2592,7 +2593,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
 		t->neigh_vars[17].extra1 = dev;
 	}
 
-	dev_name = net_sysctl_strdup(dev_name_source);
+	dev_name = kstrdup(dev_name_source, GFP_KERNEL);
 	if (!dev_name) {
 		err = -ENOBUFS;
 		goto free;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index c8be646cb191..880a88815211 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -35,19 +35,6 @@ extern int sysctl_somaxconn;
 extern char sysctl_divert_version[];
 #endif /* CONFIG_NET_DIVERT */
 
-/*
- * This strdup() is used for creating copies of network 
- * device names to be handed over to sysctl.
- */
- 
-char *net_sysctl_strdup(const char *s)
-{
-	char *rv = kmalloc(strlen(s)+1, GFP_KERNEL);
-	if (rv)
-		strcpy(rv, s);
-	return rv;
-}
-
 ctl_table core_table[] = {
 #ifdef CONFIG_NET
 	{
@@ -177,6 +164,4 @@ ctl_table core_table[] = {
 	{ .ctl_name = 0 }
 };
 
-EXPORT_SYMBOL(net_sysctl_strdup);
-
 #endif
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 650dcb12d9a1..d8a10e3dd77d 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1471,7 +1471,7 @@ static void devinet_sysctl_register(struct in_device *in_dev,
 	 * by sysctl and we wouldn't want anyone to change it under our feet
 	 * (see SIOCSIFNAME).
 	 */	
-	dev_name = net_sysctl_strdup(dev_name);
+	dev_name = kstrdup(dev_name, GFP_KERNEL);
 	if (!dev_name)
 	    goto free;
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 14f5c53235fe..a54d4ef3fd35 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -57,6 +57,7 @@
 #endif
 #include <linux/delay.h>
 #include <linux/notifier.h>
+#include <linux/string.h>
 
 #include <net/sock.h>
 #include <net/snmp.h>
@@ -3437,7 +3438,7 @@ static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf
 	 * by sysctl and we wouldn't want anyone to change it under our feet
 	 * (see SIOCSIFNAME).
 	 */	
-	dev_name = net_sysctl_strdup(dev_name);
+	dev_name = kstrdup(dev_name, GFP_KERNEL);
 	if (!dev_name)
 	    goto free;
 
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 2b99b4028d31..d6baf6fdf8a9 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -8,6 +8,7 @@
 #include <linux/err.h>
 #include <linux/seq_file.h>
 #include <linux/hash.h>
+#include <linux/string.h>
 
 #define RPCDBG_FACILITY	RPCDBG_AUTH
 
@@ -20,14 +21,6 @@
  */
 
 
-static char *strdup(char *s)
-{
-	char *rv = kmalloc(strlen(s)+1, GFP_KERNEL);
-	if (rv)
-		strcpy(rv, s);
-	return rv;
-}
-
 struct unix_domain {
 	struct auth_domain	h;
 	int	addr_changes;
@@ -55,7 +48,7 @@ struct auth_domain *unix_domain_find(char *name)
 	if (new == NULL)
 		return NULL;
 	cache_init(&new->h.h);
-	new->h.name = strdup(name);
+	new->h.name = kstrdup(name, GFP_KERNEL);
 	new->h.flavour = RPC_AUTH_UNIX;
 	new->addr_changes = 0;
 	new->h.h.expiry_time = NEVER;
diff --git a/sound/core/info.c b/sound/core/info.c
index 31faffe01cb0..5e122bbe7c92 100644
--- a/sound/core/info.c
+++ b/sound/core/info.c
@@ -24,6 +24,7 @@
 #include <linux/vmalloc.h>
 #include <linux/time.h>
 #include <linux/smp_lock.h>
+#include <linux/string.h>
 #include <sound/core.h>
 #include <sound/minors.h>
 #include <sound/info.h>
@@ -754,7 +755,7 @@ static snd_info_entry_t *snd_info_create_entry(const char *name)
 	entry = kcalloc(1, sizeof(*entry), GFP_KERNEL);
 	if (entry == NULL)
 		return NULL;
-	entry->name = snd_kmalloc_strdup(name, GFP_KERNEL);
+	entry->name = kstrdup(name, GFP_KERNEL);
 	if (entry->name == NULL) {
 		kfree(entry);
 		return NULL;
diff --git a/sound/core/info_oss.c b/sound/core/info_oss.c
index f9e4ce443454..12107968d402 100644
--- a/sound/core/info_oss.c
+++ b/sound/core/info_oss.c
@@ -22,6 +22,7 @@
 #include <sound/driver.h>
 #include <linux/slab.h>
 #include <linux/time.h>
+#include <linux/string.h>
 #include <sound/core.h>
 #include <sound/minors.h>
 #include <sound/info.h>
@@ -51,7 +52,7 @@ int snd_oss_info_register(int dev, int num, char *string)
 			x = NULL;
 		}
 	} else {
-		x = snd_kmalloc_strdup(string, GFP_KERNEL);
+		x = kstrdup(string, GFP_KERNEL);
 		if (x == NULL) {
 			up(&strings);
 			return -ENOMEM;
diff --git a/sound/core/memory.c b/sound/core/memory.c
index 20860fec9364..c1fb28e84330 100644
--- a/sound/core/memory.c
+++ b/sound/core/memory.c
@@ -184,6 +184,20 @@ void snd_hidden_vfree(void *obj)
 	snd_wrapper_vfree(obj);
 }
 
+char *snd_hidden_kstrdup(const char *s, int flags)
+{
+	int len;
+	char *buf;
+
+	if (!s) return NULL;
+
+	len = strlen(s) + 1;
+	buf = _snd_kmalloc(len, flags);
+	if (buf)
+		memcpy(buf, s, len);
+	return buf;
+}
+
 static void snd_memory_info_read(snd_info_entry_t *entry, snd_info_buffer_t * buffer)
 {
 	snd_iprintf(buffer, "kmalloc: %li bytes\n", snd_alloc_kmalloc);
@@ -214,35 +228,8 @@ int __exit snd_memory_info_done(void)
 	return 0;
 }
 
-#else
-
-#define _snd_kmalloc kmalloc
-
 #endif /* CONFIG_SND_DEBUG_MEMORY */
 
-/**
- * snd_kmalloc_strdup - copy the string
- * @string: the original string
- * @flags: allocation conditions, GFP_XXX
- *
- * Allocates a memory chunk via kmalloc() and copies the string to it.
- *
- * Returns the pointer, or NULL if no enoguh memory.
- */
-char *snd_kmalloc_strdup(const char *string, int flags)
-{
-	size_t len;
-	char *ptr;
-
-	if (!string)
-		return NULL;
-	len = strlen(string) + 1;
-	ptr = _snd_kmalloc(len, flags);
-	if (ptr)
-		memcpy(ptr, string, len);
-	return ptr;
-}
-
 /**
  * copy_to_user_fromio - copy data from mmio-space to user-space
  * @dst: the destination pointer on user-space
diff --git a/sound/core/oss/mixer_oss.c b/sound/core/oss/mixer_oss.c
index 98ed9a9f0da6..98fc0766f885 100644
--- a/sound/core/oss/mixer_oss.c
+++ b/sound/core/oss/mixer_oss.c
@@ -24,6 +24,7 @@
 #include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include <linux/time.h>
+#include <linux/string.h>
 #include <sound/core.h>
 #include <sound/minors.h>
 #include <sound/control.h>
@@ -1137,7 +1138,7 @@ static void snd_mixer_oss_proc_write(snd_info_entry_t *entry,
 			goto __unlock;
 		}
 		tbl->oss_id = ch;
-		tbl->name = snd_kmalloc_strdup(str, GFP_KERNEL);
+		tbl->name = kstrdup(str, GFP_KERNEL);
 		if (! tbl->name) {
 			kfree(tbl);
 			goto __unlock;
diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c
index cab30977e7c0..de7444c586f9 100644
--- a/sound/core/oss/pcm_oss.c
+++ b/sound/core/oss/pcm_oss.c
@@ -33,6 +33,7 @@
 #include <linux/time.h>
 #include <linux/vmalloc.h>
 #include <linux/moduleparam.h>
+#include <linux/string.h>
 #include <sound/core.h>
 #include <sound/minors.h>
 #include <sound/pcm.h>
@@ -2360,7 +2361,7 @@ static void snd_pcm_oss_proc_write(snd_info_entry_t *entry,
 					for (setup1 = pstr->oss.setup_list; setup1->next; setup1 = setup1->next);
 					setup1->next = setup;
 				}
-				template.task_name = snd_kmalloc_strdup(task_name, GFP_KERNEL);
+				template.task_name = kstrdup(task_name, GFP_KERNEL);
 			} else {
 				buffer->error = -ENOMEM;
 			}
diff --git a/sound/core/sound.c b/sound/core/sound.c
index 0815fadeb3ec..7612884f530b 100644
--- a/sound/core/sound.c
+++ b/sound/core/sound.c
@@ -399,8 +399,8 @@ EXPORT_SYMBOL(snd_hidden_kcalloc);
 EXPORT_SYMBOL(snd_hidden_kfree);
 EXPORT_SYMBOL(snd_hidden_vmalloc);
 EXPORT_SYMBOL(snd_hidden_vfree);
+EXPORT_SYMBOL(snd_hidden_kstrdup);
 #endif
-EXPORT_SYMBOL(snd_kmalloc_strdup);
 EXPORT_SYMBOL(copy_to_user_fromio);
 EXPORT_SYMBOL(copy_from_user_toio);
   /* init.c */
diff --git a/sound/core/timer.c b/sound/core/timer.c
index b498e5482d77..cfaccd415b3b 100644
--- a/sound/core/timer.c
+++ b/sound/core/timer.c
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/moduleparam.h>
+#include <linux/string.h>
 #include <sound/core.h>
 #include <sound/timer.h>
 #include <sound/control.h>
@@ -100,7 +101,7 @@ static snd_timer_instance_t *snd_timer_instance_new(char *owner, snd_timer_t *ti
 	timeri = kcalloc(1, sizeof(*timeri), GFP_KERNEL);
 	if (timeri == NULL)
 		return NULL;
-	timeri->owner = snd_kmalloc_strdup(owner, GFP_KERNEL);
+	timeri->owner = kstrdup(owner, GFP_KERNEL);
 	if (! timeri->owner) {
 		kfree(timeri);
 		return NULL;
diff --git a/sound/isa/gus/gus_mem.c b/sound/isa/gus/gus_mem.c
index 609838e8ef67..5eb766dd564b 100644
--- a/sound/isa/gus/gus_mem.c
+++ b/sound/isa/gus/gus_mem.c
@@ -21,6 +21,7 @@
 
 #include <sound/driver.h>
 #include <linux/slab.h>
+#include <linux/string.h>
 #include <sound/core.h>
 #include <sound/gus.h>
 #include <sound/info.h>
@@ -213,7 +214,7 @@ snd_gf1_mem_block_t *snd_gf1_mem_alloc(snd_gf1_mem_t * alloc, int owner,
 	if (share_id != NULL)
 		memcpy(&block.share_id, share_id, sizeof(block.share_id));
 	block.owner = owner;
-	block.name = snd_kmalloc_strdup(name, GFP_KERNEL);
+	block.name = kstrdup(name, GFP_KERNEL);
 	nblock = snd_gf1_mem_xalloc(alloc, &block);
 	snd_gf1_mem_lock(alloc, 1);
 	return nblock;
@@ -253,13 +254,13 @@ int snd_gf1_mem_init(snd_gus_card_t * gus)
 	if (gus->gf1.enh_mode) {
 		block.ptr = 0;
 		block.size = 1024;
-		block.name = snd_kmalloc_strdup("InterWave LFOs", GFP_KERNEL);
+		block.name = kstrdup("InterWave LFOs", GFP_KERNEL);
 		if (snd_gf1_mem_xalloc(alloc, &block) == NULL)
 			return -ENOMEM;
 	}
 	block.ptr = gus->gf1.default_voice_address;
 	block.size = 4;
-	block.name = snd_kmalloc_strdup("Voice default (NULL's)", GFP_KERNEL);
+	block.name = kstrdup("Voice default (NULL's)", GFP_KERNEL);
 	if (snd_gf1_mem_xalloc(alloc, &block) == NULL)
 		return -ENOMEM;
 #ifdef CONFIG_SND_DEBUG
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 9edd558d6bd3..bab89843d850 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -1781,7 +1781,7 @@ static int add_control(struct alc_spec *spec, int type, const char *name, unsign
 
 	knew = &spec->kctl_alloc[spec->num_kctl_used];
 	*knew = alc880_control_templates[type];
-	knew->name = snd_kmalloc_strdup(name, GFP_KERNEL);
+	knew->name = kstrdup(name, GFP_KERNEL);
 	if (! knew->name)
 		return -ENOMEM;
 	knew->private_value = val;
diff --git a/sound/synth/emux/emux.c b/sound/synth/emux/emux.c
index 16f3b461627a..60d0b2c66698 100644
--- a/sound/synth/emux/emux.c
+++ b/sound/synth/emux/emux.c
@@ -22,6 +22,7 @@
 #include <linux/wait.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/string.h>
 #include <sound/core.h>
 #include <sound/emux_synth.h>
 #include <linux/init.h>
@@ -76,7 +77,7 @@ int snd_emux_register(snd_emux_t *emu, snd_card_t *card, int index, char *name)
 	snd_assert(name != NULL, return -EINVAL);
 
 	emu->card = card;
-	emu->name = snd_kmalloc_strdup(name, GFP_KERNEL);
+	emu->name = kstrdup(name, GFP_KERNEL);
 	emu->voices = kcalloc(emu->max_voices, sizeof(snd_emux_voice_t), GFP_KERNEL);
 	if (emu->voices == NULL)
 		return -ENOMEM;
-- 
cgit v1.2.3-59-g8ed1b


From 35a82d1a53e1a9ad54efafcc940f9335beaed5c3 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Thu, 23 Jun 2005 00:09:06 -0700
Subject: [PATCH] optimise loop driver a bit

Looks like locking can be optimised quite a lot.  Increase lock widths
slightly so lo_lock is taken fewer times per request.  Also it was quite
trivial to cover lo_pending with that lock, and remove the atomic
requirement.  This also makes memory ordering explicitly correct, which is
nice (not that I particularly saw any mem ordering bugs).

The test read four 250MB files in parallel on an ext2-on-tmpfs filesystem
(1K block size, 4K page size).  The system is a 2-socket Xeon with HT
(4 threads).

intel:/home/npiggin# umount /dev/loop0 ; mount /dev/loop0 /mnt/loop ; /usr/bin/time ./mtloop.sh

Before:
0.24user 5.51system 0:02.84elapsed 202%CPU (0avgtext+0avgdata 0maxresident)k
0.19user 5.52system 0:02.88elapsed 198%CPU (0avgtext+0avgdata 0maxresident)k
0.19user 5.57system 0:02.89elapsed 198%CPU (0avgtext+0avgdata 0maxresident)k
0.22user 5.51system 0:02.90elapsed 197%CPU (0avgtext+0avgdata 0maxresident)k
0.19user 5.44system 0:02.91elapsed 193%CPU (0avgtext+0avgdata 0maxresident)k

After:
0.07user 2.34system 0:01.68elapsed 143%CPU (0avgtext+0avgdata 0maxresident)k
0.06user 2.37system 0:01.68elapsed 144%CPU (0avgtext+0avgdata 0maxresident)k
0.06user 2.39system 0:01.68elapsed 145%CPU (0avgtext+0avgdata 0maxresident)k
0.06user 2.36system 0:01.68elapsed 144%CPU (0avgtext+0avgdata 0maxresident)k
0.06user 2.42system 0:01.68elapsed 147%CPU (0avgtext+0avgdata 0maxresident)k

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/loop.c | 81 ++++++++++++++++++++++++----------------------------
 include/linux/loop.h |  2 +-
 2 files changed, 39 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6f011d0d8e97..b35e08876dd4 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -472,17 +472,11 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
  */
 static void loop_add_bio(struct loop_device *lo, struct bio *bio)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&lo->lo_lock, flags);
 	if (lo->lo_biotail) {
 		lo->lo_biotail->bi_next = bio;
 		lo->lo_biotail = bio;
 	} else
 		lo->lo_bio = lo->lo_biotail = bio;
-	spin_unlock_irqrestore(&lo->lo_lock, flags);
-
-	up(&lo->lo_bh_mutex);
 }
 
 /*
@@ -492,14 +486,12 @@ static struct bio *loop_get_bio(struct loop_device *lo)
 {
 	struct bio *bio;
 
-	spin_lock_irq(&lo->lo_lock);
 	if ((bio = lo->lo_bio)) {
 		if (bio == lo->lo_biotail)
 			lo->lo_biotail = NULL;
 		lo->lo_bio = bio->bi_next;
 		bio->bi_next = NULL;
 	}
-	spin_unlock_irq(&lo->lo_lock);
 
 	return bio;
 }
@@ -509,35 +501,28 @@ static int loop_make_request(request_queue_t *q, struct bio *old_bio)
 	struct loop_device *lo = q->queuedata;
 	int rw = bio_rw(old_bio);
 
-	if (!lo)
-		goto out;
+	if (rw == READA)
+		rw = READ;
+
+	BUG_ON(!lo || (rw != READ && rw != WRITE));
 
 	spin_lock_irq(&lo->lo_lock);
 	if (lo->lo_state != Lo_bound)
-		goto inactive;
-	atomic_inc(&lo->lo_pending);
-	spin_unlock_irq(&lo->lo_lock);
-
-	if (rw == WRITE) {
-		if (lo->lo_flags & LO_FLAGS_READ_ONLY)
-			goto err;
-	} else if (rw == READA) {
-		rw = READ;
-	} else if (rw != READ) {
-		printk(KERN_ERR "loop: unknown command (%x)\n", rw);
-		goto err;
-	}
+		goto out;
+	if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
+		goto out;
+	lo->lo_pending++;
 	loop_add_bio(lo, old_bio);
+	spin_unlock_irq(&lo->lo_lock);
+	up(&lo->lo_bh_mutex);
 	return 0;
-err:
-	if (atomic_dec_and_test(&lo->lo_pending))
-		up(&lo->lo_bh_mutex);
+
 out:
+	if (lo->lo_pending == 0)
+		up(&lo->lo_bh_mutex);
+	spin_unlock_irq(&lo->lo_lock);
 	bio_io_error(old_bio, old_bio->bi_size);
 	return 0;
-inactive:
-	spin_unlock_irq(&lo->lo_lock);
-	goto out;
 }
 
 /*
@@ -560,13 +545,11 @@ static void do_loop_switch(struct loop_device *, struct switch_request *);
 
 static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
 {
-	int ret;
-
 	if (unlikely(!bio->bi_bdev)) {
 		do_loop_switch(lo, bio->bi_private);
 		bio_put(bio);
 	} else {
-		ret = do_bio_filebacked(lo, bio);
+		int ret = do_bio_filebacked(lo, bio);
 		bio_endio(bio, bio->bi_size, ret);
 	}
 }
@@ -594,7 +577,7 @@ static int loop_thread(void *data)
 	set_user_nice(current, -20);
 
 	lo->lo_state = Lo_bound;
-	atomic_inc(&lo->lo_pending);
+	lo->lo_pending = 1;
 
 	/*
 	 * up sem, we are running
@@ -602,26 +585,37 @@ static int loop_thread(void *data)
 	up(&lo->lo_sem);
 
 	for (;;) {
-		down_interruptible(&lo->lo_bh_mutex);
+		int pending;
+
 		/*
-		 * could be upped because of tear-down, not because of
-		 * pending work
+		 * interruptible just to not contribute to load avg
 		 */
-		if (!atomic_read(&lo->lo_pending))
+		if (down_interruptible(&lo->lo_bh_mutex))
+			continue;
+
+		spin_lock_irq(&lo->lo_lock);
+
+		/*
+		 * could be upped because of tear-down, not pending work
+		 */
+		if (unlikely(!lo->lo_pending)) {
+			spin_unlock_irq(&lo->lo_lock);
 			break;
+		}
 
 		bio = loop_get_bio(lo);
-		if (!bio) {
-			printk("loop: missing bio\n");
-			continue;
-		}
+		lo->lo_pending--;
+		pending = lo->lo_pending;
+		spin_unlock_irq(&lo->lo_lock);
+
+		BUG_ON(!bio);
 		loop_handle_bio(lo, bio);
 
 		/*
 		 * upped both for pending work and tear-down, lo_pending
 		 * will hit zero then
 		 */
-		if (atomic_dec_and_test(&lo->lo_pending))
+		if (unlikely(!pending))
 			break;
 	}
 
@@ -900,7 +894,8 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 
 	spin_lock_irq(&lo->lo_lock);
 	lo->lo_state = Lo_rundown;
-	if (atomic_dec_and_test(&lo->lo_pending))
+	lo->lo_pending--;
+	if (!lo->lo_pending)
 		up(&lo->lo_bh_mutex);
 	spin_unlock_irq(&lo->lo_lock);
 
diff --git a/include/linux/loop.h b/include/linux/loop.h
index 8220d9c9da00..53fa51595443 100644
--- a/include/linux/loop.h
+++ b/include/linux/loop.h
@@ -61,7 +61,7 @@ struct loop_device {
 	struct semaphore	lo_sem;
 	struct semaphore	lo_ctl_mutex;
 	struct semaphore	lo_bh_mutex;
-	atomic_t		lo_pending;
+	int			lo_pending;
 
 	request_queue_t		*lo_queue;
 };
-- 
cgit v1.2.3-59-g8ed1b


From ac20427ef6aa63da663bdc88b71d16f7394f5e23 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@redhat.com>
Date: Thu, 23 Jun 2005 00:09:11 -0700
Subject: [PATCH] add check to /proc/devices read routines

Patch to add a check to get_chrdev_list and get_blkdev_list to prevent reads
of /proc/devices from spilling over the provided page if more than 4096
bytes of string data are generated from all the registered character and
block devices in a system.

Signed-off-by: Neil Horman <nhorman@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: <viro@parcelfarce.linux.theplanet.co.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/genhd.c | 12 ++++++++++--
 fs/char_dev.c         | 13 ++++++++++++-
 fs/proc/proc_misc.c   |  2 +-
 include/linux/genhd.h |  2 +-
 4 files changed, 24 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/genhd.c b/drivers/block/genhd.c
index 43805e4d31e9..47fd3659a061 100644
--- a/drivers/block/genhd.c
+++ b/drivers/block/genhd.c
@@ -40,7 +40,7 @@ static inline int major_to_index(int major)
 
 #ifdef CONFIG_PROC_FS
 /* get block device names in somewhat random order */
-int get_blkdev_list(char *p)
+int get_blkdev_list(char *p, int used)
 {
 	struct blk_major_name *n;
 	int i, len;
@@ -49,10 +49,18 @@ int get_blkdev_list(char *p)
 
 	down(&block_subsys_sem);
 	for (i = 0; i < ARRAY_SIZE(major_names); i++) {
-		for (n = major_names[i]; n; n = n->next)
+		for (n = major_names[i]; n; n = n->next) {
+			/*
+			 * If the curent string plus the 5 extra characters
+			 * in the line would run us off the page, then we're done
+			 */
+			if ((len + used + strlen(n->name) + 5) >= PAGE_SIZE)
+				goto page_full;
 			len += sprintf(p+len, "%3d %s\n",
 				       n->major, n->name);
+		}
 	}
+page_full:
 	up(&block_subsys_sem);
 
 	return len;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index c1e3537909fc..e82aac9cc2f5 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -56,10 +56,21 @@ int get_chrdev_list(char *page)
 
 	down(&chrdevs_lock);
 	for (i = 0; i < ARRAY_SIZE(chrdevs) ; i++) {
-		for (cd = chrdevs[i]; cd; cd = cd->next)
+		for (cd = chrdevs[i]; cd; cd = cd->next) {
+			/*
+			 * if the current name, plus the 5 extra characters
+			 * in the device line for this entry
+			 * would run us off the page, we're done
+			 */
+			if ((len+strlen(cd->name) + 5) >= PAGE_SIZE)
+				goto page_full;
+
+
 			len += sprintf(page+len, "%3d %s\n",
 				       cd->major, cd->name);
+		}
 	}
+page_full:
 	up(&chrdevs_lock);
 
 	return len;
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 63a9fbf1ac51..94b570ad037d 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -451,7 +451,7 @@ static int devices_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
 {
 	int len = get_chrdev_list(page);
-	len += get_blkdev_list(page+len);
+	len += get_blkdev_list(page+len, len);
 	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index af26dc718ef6..01796c41c951 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -224,7 +224,7 @@ static inline void free_disk_stats(struct gendisk *disk)
 extern void disk_round_stats(struct gendisk *disk);
 
 /* drivers/block/genhd.c */
-extern int get_blkdev_list(char *);
+extern int get_blkdev_list(char *, int);
 extern void add_disk(struct gendisk *disk);
 extern void del_gendisk(struct gendisk *gp);
 extern void unlink_gendisk(struct gendisk *gp);
-- 
cgit v1.2.3-59-g8ed1b


From 84de856ed30c568c2bb7b9ac0679772bd2737d9b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 23 Jun 2005 00:09:16 -0700
Subject: [PATCH] quota: consolidate code surrounding vfs_quota_on_mount

Move some code duplicated in both callers into vfs_quota_on_mount

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Jan Kara <jack@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/dquot.c               | 23 +++++++++++++++++++----
 fs/ext3/super.c          | 18 ++----------------
 fs/reiserfs/super.c      | 21 +++------------------
 include/linux/quotaops.h |  3 ++-
 4 files changed, 26 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dquot.c b/fs/dquot.c
index 3995ce7907cc..343c03655619 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -1519,14 +1519,29 @@ out_path:
  * This function is used when filesystem needs to initialize quotas
  * during mount time.
  */
-int vfs_quota_on_mount(int type, int format_id, struct dentry *dentry)
+int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
+		int format_id, int type)
 {
+	struct qstr name = {.name = qf_name, .len = 0, .len = strlen(qf_name)};
+	struct dentry *dentry;
 	int error;
 
+	dentry = lookup_hash(&name, sb->s_root);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
 	error = security_quota_on(dentry);
-	if (error)
-		return error;
-	return vfs_quota_on_inode(dentry->d_inode, type, format_id);
+	if (!error)
+		error = vfs_quota_on_inode(dentry->d_inode, type, format_id);
+
+	/*
+	 * Now invalidate and put the dentry - quota got its own reference
+	 * to inode and dentry has at least wrong hash so we had better
+	 * throw it away.
+	 */
+	d_invalidate(dentry);
+	dput(dentry);
+	return error;
 }
 
 /* Generic routine for getting common part of quota structure */
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 981ccb233ef5..9630fbfdc24a 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2348,22 +2348,8 @@ static int ext3_write_info(struct super_block *sb, int type)
  */
 static int ext3_quota_on_mount(struct super_block *sb, int type)
 {
-	int err;
-	struct dentry *dentry;
-	struct qstr name = { .name = EXT3_SB(sb)->s_qf_names[type],
-			     .hash = 0,
-			     .len = strlen(EXT3_SB(sb)->s_qf_names[type])};
-
-	dentry = lookup_hash(&name, sb->s_root);
-	if (IS_ERR(dentry))
-		return PTR_ERR(dentry);
-	err = vfs_quota_on_mount(type, EXT3_SB(sb)->s_jquota_fmt, dentry);
-	/* Now invalidate and put the dentry - quota got its own reference
-	 * to inode and dentry has at least wrong hash so we had better
-	 * throw it away */
-	d_invalidate(dentry);
-	dput(dentry);
-	return err;
+	return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
+			EXT3_SB(sb)->s_jquota_fmt, type);
 }
 
 /*
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index b35b87744983..aae0779ed5b4 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1932,27 +1932,12 @@ static int reiserfs_write_info(struct super_block *sb, int type)
 }
 
 /*
- * Turn on quotas during mount time - we need to find
- * the quota file and such...
+ * Turn on quotas during mount time - we need to find the quota file and such...
  */
 static int reiserfs_quota_on_mount(struct super_block *sb, int type)
 {
-    int err;
-    struct dentry *dentry;
-    struct qstr name = { .name = REISERFS_SB(sb)->s_qf_names[type],
-                         .hash = 0,
-                         .len = strlen(REISERFS_SB(sb)->s_qf_names[type])};
-
-    dentry = lookup_hash(&name, sb->s_root);
-    if (IS_ERR(dentry))
-            return PTR_ERR(dentry);
-    err = vfs_quota_on_mount(type, REISERFS_SB(sb)->s_jquota_fmt, dentry);
-    /* Now invalidate and put the dentry - quota got its own reference
-     * to inode and dentry has at least wrong hash so we had better
-     * throw it away */
-    d_invalidate(dentry);
-    dput(dentry);
-    return err;
+	return vfs_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
+			REISERFS_SB(sb)->s_jquota_fmt, type);
 }
 
 /*
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index e57baa85e744..d211507ab246 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -39,7 +39,8 @@ extern int dquot_commit_info(struct super_block *sb, int type);
 extern int dquot_mark_dquot_dirty(struct dquot *dquot);
 
 extern int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path);
-extern int vfs_quota_on_mount(int type, int format_id, struct dentry *dentry);
+extern int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
+		int format_id, int type);
 extern int vfs_quota_off(struct super_block *sb, int type);
 #define vfs_quota_off_mount(sb, type) vfs_quota_off(sb, type)
 extern int vfs_quota_sync(struct super_block *sb, int type);
-- 
cgit v1.2.3-59-g8ed1b


From b94cce926b2b902b79380ccba370d6f9f2980de0 Mon Sep 17 00:00:00 2001
From: Hien Nguyen <hien@us.ibm.com>
Date: Thu, 23 Jun 2005 00:09:19 -0700
Subject: [PATCH] kprobes: function-return probes

This patch adds function-return probes to kprobes for the i386
architecture.  This enables you to establish a handler to be run when a
function returns.

1. API

Two new functions are added to kprobes:

	int register_kretprobe(struct kretprobe *rp);
	void unregister_kretprobe(struct kretprobe *rp);

2. Registration and unregistration

2.1 Register

  To register a function-return probe, the user populates the following
  fields in a kretprobe object and calls register_kretprobe() with the
  kretprobe address as an argument:

  kp.addr - the function's address

  handler - this function is run after the ret instruction executes, but
  before control returns to the return address in the caller.

  maxactive - The maximum number of instances of the probed function that
  can be active concurrently.  For example, if the function is non-
  recursive and is called with a spinlock or mutex held, maxactive = 1
  should be enough.  If the function is non-recursive and can never
  relinquish the CPU (e.g., via a semaphore or preemption), NR_CPUS should
  be enough.  maxactive is used to determine how many kretprobe_instance
  objects to allocate for this particular probed function.  If maxactive <=
  0, it is set to a default value (if CONFIG_PREEMPT maxactive=max(10, 2 *
  NR_CPUS) else maxactive=NR_CPUS)

  For example:

    struct kretprobe rp;
    rp.kp.addr = /* entrypoint address */;
    rp.handler = /* return probe handler */;
    rp.maxactive = /* e.g., 1 or NR_CPUS or 0, see the above explanation */;
    register_kretprobe(&rp);

  The following field may also be of interest:

  nmissed - Initialized to zero when the function-return probe is
  registered, and incremented every time the probed function is entered but
  there is no kretprobe_instance object available for establishing the
  function-return probe (i.e., because maxactive was set too low).

2.2 Unregister

  To unregister a function-return probe, the user calls
  unregister_kretprobe() with the same kretprobe object as registered
  previously.  If a probed function is running when the return probe is
  unregistered, the function will return as expected, but the handler won't
  be run.

3. Limitations

3.1 This patch supports only the i386 architecture, but patches for
    x86_64 and ppc64 are anticipated soon.

3.2 Return probes operate by replacing the return address in the stack
    (or in a known register, such as the lr register for ppc).  This may
    cause __builtin_return_address(0), when invoked from the return-probed
    function, to return the address of the return-probes trampoline.

3.3 This implementation uses the "Multiprobes at an address" feature in
    2.6.12-rc3-mm3.

3.4 Due to a limitation in multi-probes, you cannot currently establish
    a return probe and a jprobe on the same function.  A patch to remove
    this limitation is being tested.

This feature is required by SystemTap (http://sourceware.org/systemtap),
and reflects ideas contributed by several SystemTap developers, including
Will Cohen and Ananth Mavinakayanahalli.

Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Frederik Deweerdt <frederik.deweerdt@laposte.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/kprobes.c | 102 +++++++++++++++++++++-
 arch/i386/kernel/process.c |  15 ++++
 include/asm-i386/kprobes.h |   3 +
 include/linux/kprobes.h    |  90 ++++++++++++++++++-
 kernel/kprobes.c           | 213 +++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 415 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index 59ff9b455069..048f754bbe23 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -23,6 +23,9 @@
  *		Rusty Russell).
  * 2004-July	Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
  *		interface to access function arguments.
+ * 2005-May	Hien Nguyen <hien@us.ibm.com>, Jim Keniston
+ *		<jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ *		<prasanna@in.ibm.com> added function-return probes.
  */
 
 #include <linux/config.h>
@@ -91,6 +94,53 @@ static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 		regs->eip = (unsigned long)&p->ainsn.insn;
 }
 
+struct task_struct  *arch_get_kprobe_task(void *ptr)
+{
+	return ((struct thread_info *) (((unsigned long) ptr) &
+					(~(THREAD_SIZE -1))))->task;
+}
+
+void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs)
+{
+	unsigned long *sara = (unsigned long *)&regs->esp;
+	struct kretprobe_instance *ri;
+	static void *orig_ret_addr;
+
+	/*
+	 * Save the return address when the return probe hits
+	 * the first time, and use it to populate the (krprobe
+	 * instance)->ret_addr for subsequent return probes at
+	 * the same addrress since stack address would have
+	 * the kretprobe_trampoline by then.
+	 */
+	if (((void*) *sara) != kretprobe_trampoline)
+		orig_ret_addr = (void*) *sara;
+
+	if ((ri = get_free_rp_inst(rp)) != NULL) {
+		ri->rp = rp;
+		ri->stack_addr = sara;
+		ri->ret_addr = orig_ret_addr;
+		add_rp_inst(ri);
+		/* Replace the return addr with trampoline addr */
+		*sara = (unsigned long) &kretprobe_trampoline;
+	} else {
+		rp->nmissed++;
+	}
+}
+
+void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock)
+{
+	unsigned long flags = 0;
+	struct kretprobe_instance *ri;
+	spin_lock_irqsave(kp_lock, flags);
+	while ((ri = get_rp_inst_tsk(tk)) != NULL) {
+		*((unsigned long *)(ri->stack_addr)) =
+					(unsigned long) ri->ret_addr;
+		recycle_rp_inst(ri);
+	}
+	spin_unlock_irqrestore(kp_lock, flags);
+}
+
 /*
  * Interrupts are disabled on entry as trap3 is an interrupt gate and they
  * remain disabled thorough out this function.
@@ -183,6 +233,55 @@ no_kprobe:
 	return ret;
 }
 
+/*
+ * For function-return probes, init_kprobes() establishes a probepoint
+ * here. When a retprobed function returns, this probe is hit and
+ * trampoline_probe_handler() runs, calling the kretprobe's handler.
+ */
+ void kretprobe_trampoline_holder(void)
+ {
+ 	asm volatile (  ".global kretprobe_trampoline\n"
+ 			"kretprobe_trampoline: \n"
+ 			"nop\n");
+ }
+
+/*
+ * Called when we hit the probe point at kretprobe_trampoline
+ */
+int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct task_struct *tsk;
+	struct kretprobe_instance *ri;
+	struct hlist_head *head;
+	struct hlist_node *node;
+	unsigned long *sara = ((unsigned long *) &regs->esp) - 1;
+
+	tsk = arch_get_kprobe_task(sara);
+	head = kretprobe_inst_table_head(tsk);
+
+	hlist_for_each_entry(ri, node, head, hlist) {
+		if (ri->stack_addr == sara && ri->rp) {
+			if (ri->rp->handler)
+				ri->rp->handler(ri, regs);
+		}
+	}
+	return 0;
+}
+
+void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs,
+						unsigned long flags)
+{
+	struct kretprobe_instance *ri;
+	/* RA already popped */
+	unsigned long *sara = ((unsigned long *)&regs->esp) - 1;
+
+	while ((ri = get_rp_inst(sara))) {
+		regs->eip = (unsigned long)ri->ret_addr;
+		recycle_rp_inst(ri);
+	}
+	regs->eflags &= ~TF_MASK;
+}
+
 /*
  * Called after single-stepping.  p->addr is the address of the
  * instruction whose first byte has been replaced by the "int 3"
@@ -266,7 +365,8 @@ static inline int post_kprobe_handler(struct pt_regs *regs)
 	if (current_kprobe->post_handler)
 		current_kprobe->post_handler(current_kprobe, regs, 0);
 
-	resume_execution(current_kprobe, regs);
+	if (current_kprobe->post_handler != trampoline_post_handler)
+		resume_execution(current_kprobe, regs);
 	regs->eflags |= kprobe_saved_eflags;
 
 	unlock_kprobes();
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index be3efba7caf7..aea2ce1145df 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -37,6 +37,7 @@
 #include <linux/kallsyms.h>
 #include <linux/ptrace.h>
 #include <linux/random.h>
+#include <linux/kprobes.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -339,6 +340,13 @@ void exit_thread(void)
 	struct task_struct *tsk = current;
 	struct thread_struct *t = &tsk->thread;
 
+	/*
+	 * Remove function-return probe instances associated with this task
+	 * and put them back on the free list. Do not insert an exit probe for
+	 * this function, it will be disabled by kprobe_flush_task if you do.
+	 */
+	kprobe_flush_task(tsk);
+
 	/* The process may have allocated an io port bitmap... nuke it. */
 	if (unlikely(NULL != t->io_bitmap_ptr)) {
 		int cpu = get_cpu();
@@ -362,6 +370,13 @@ void flush_thread(void)
 {
 	struct task_struct *tsk = current;
 
+	/*
+	 * Remove function-return probe instances associated with this task
+	 * and put them back on the free list. Do not insert an exit probe for
+	 * this function, it will be disabled by kprobe_flush_task if you do.
+	 */
+	kprobe_flush_task(tsk);
+
 	memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));	
 	/*
diff --git a/include/asm-i386/kprobes.h b/include/asm-i386/kprobes.h
index 4092f68d123a..8b6d3a90cd78 100644
--- a/include/asm-i386/kprobes.h
+++ b/include/asm-i386/kprobes.h
@@ -39,6 +39,9 @@ typedef u8 kprobe_opcode_t;
 	: (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR)))
 
 #define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)pentry
+#define ARCH_SUPPORTS_KRETPROBES
+
+void kretprobe_trampoline(void);
 
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 99ddba5a4e00..fba39f87efec 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -25,21 +25,31 @@
  *		Rusty Russell).
  * 2004-July	Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
  *		interface to access function arguments.
+ * 2005-May	Hien Nguyen <hien@us.ibm.com> and Jim Keniston
+ *		<jkenisto@us.ibm.com>  and Prasanna S Panchamukhi
+ *		<prasanna@in.ibm.com> added function-return probes.
  */
 #include <linux/config.h>
 #include <linux/list.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
+#include <linux/spinlock.h>
+
 #include <asm/kprobes.h>
 
 struct kprobe;
 struct pt_regs;
+struct kretprobe;
+struct kretprobe_instance;
 typedef int (*kprobe_pre_handler_t) (struct kprobe *, struct pt_regs *);
 typedef int (*kprobe_break_handler_t) (struct kprobe *, struct pt_regs *);
 typedef void (*kprobe_post_handler_t) (struct kprobe *, struct pt_regs *,
 				       unsigned long flags);
 typedef int (*kprobe_fault_handler_t) (struct kprobe *, struct pt_regs *,
 				       int trapnr);
+typedef int (*kretprobe_handler_t) (struct kretprobe_instance *,
+				    struct pt_regs *);
+
 struct kprobe {
 	struct hlist_node hlist;
 
@@ -85,6 +95,62 @@ struct jprobe {
 	kprobe_opcode_t *entry;	/* probe handling code to jump to */
 };
 
+#ifdef ARCH_SUPPORTS_KRETPROBES
+extern int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs);
+extern void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs,
+							unsigned long flags);
+extern struct task_struct *arch_get_kprobe_task(void *ptr);
+extern void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs);
+extern void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock);
+#else /* ARCH_SUPPORTS_KRETPROBES */
+static inline void kretprobe_trampoline(void)
+{
+}
+static inline int trampoline_probe_handler(struct kprobe *p,
+						struct pt_regs *regs)
+{
+	return 0;
+}
+static inline void trampoline_post_handler(struct kprobe *p,
+				struct pt_regs *regs, unsigned long flags)
+{
+}
+static inline void arch_prepare_kretprobe(struct kretprobe *rp,
+					struct pt_regs *regs)
+{
+}
+static inline void arch_kprobe_flush_task(struct task_struct *tk)
+{
+}
+#define arch_get_kprobe_task(ptr) ((struct task_struct *)NULL)
+#endif /* ARCH_SUPPORTS_KRETPROBES */
+/*
+ * Function-return probe -
+ * Note:
+ * User needs to provide a handler function, and initialize maxactive.
+ * maxactive - The maximum number of instances of the probed function that
+ * can be active concurrently.
+ * nmissed - tracks the number of times the probed function's return was
+ * ignored, due to maxactive being too low.
+ *
+ */
+struct kretprobe {
+	struct kprobe kp;
+	kretprobe_handler_t handler;
+	int maxactive;
+	int nmissed;
+	struct hlist_head free_instances;
+	struct hlist_head used_instances;
+};
+
+struct kretprobe_instance {
+	struct hlist_node uflist; /* either on free list or used list */
+	struct hlist_node hlist;
+	struct kretprobe *rp;
+	void *ret_addr;
+	void *stack_addr;
+};
+
 #ifdef CONFIG_KPROBES
 /* Locks kprobe: irq must be disabled */
 void lock_kprobes(void);
@@ -104,6 +170,7 @@ extern void show_registers(struct pt_regs *regs);
 
 /* Get the kprobe at this addr (if any).  Must have called lock_kprobes */
 struct kprobe *get_kprobe(void *addr);
+struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk);
 
 int register_kprobe(struct kprobe *p);
 void unregister_kprobe(struct kprobe *p);
@@ -113,7 +180,16 @@ int register_jprobe(struct jprobe *p);
 void unregister_jprobe(struct jprobe *p);
 void jprobe_return(void);
 
-#else
+int register_kretprobe(struct kretprobe *rp);
+void unregister_kretprobe(struct kretprobe *rp);
+
+struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp);
+struct kretprobe_instance *get_rp_inst(void *sara);
+struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk);
+void add_rp_inst(struct kretprobe_instance *ri);
+void kprobe_flush_task(struct task_struct *tk);
+void recycle_rp_inst(struct kretprobe_instance *ri);
+#else /* CONFIG_KPROBES */
 static inline int kprobe_running(void)
 {
 	return 0;
@@ -135,5 +211,15 @@ static inline void unregister_jprobe(struct jprobe *p)
 static inline void jprobe_return(void)
 {
 }
-#endif
+static inline int register_kretprobe(struct kretprobe *rp)
+{
+	return -ENOSYS;
+}
+static inline void unregister_kretprobe(struct kretprobe *rp)
+{
+}
+static inline void kprobe_flush_task(struct task_struct *tk)
+{
+}
+#endif				/* CONFIG_KPROBES */
 #endif				/* _LINUX_KPROBES_H */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 037142b72a49..692fbf75ab49 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -27,6 +27,9 @@
  *		interface to access function arguments.
  * 2004-Sep	Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
  *		exceptions notifier to be first on the priority list.
+ * 2005-May	Hien Nguyen <hien@us.ibm.com>, Jim Keniston
+ *		<jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ *		<prasanna@in.ibm.com> added function-return probes.
  */
 #include <linux/kprobes.h>
 #include <linux/spinlock.h>
@@ -41,6 +44,7 @@
 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
 
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
+static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 
 unsigned int kprobe_cpu = NR_CPUS;
 static DEFINE_SPINLOCK(kprobe_lock);
@@ -78,7 +82,7 @@ struct kprobe *get_kprobe(void *addr)
  * Aggregate handlers for multiple kprobes support - these handlers
  * take care of invoking the individual kprobe handlers on p->list
  */
-int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
+static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kprobe *kp;
 
@@ -92,8 +96,8 @@ int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	return 0;
 }
 
-void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
-		unsigned long flags)
+static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
+			      unsigned long flags)
 {
 	struct kprobe *kp;
 
@@ -107,7 +111,8 @@ void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
 	return;
 }
 
-int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr)
+static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
+			      int trapnr)
 {
 	/*
 	 * if we faulted "during" the execution of a user specified
@@ -120,6 +125,135 @@ int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr)
 	return 0;
 }
 
+struct kprobe trampoline_p = {
+		.addr = (kprobe_opcode_t *) &kretprobe_trampoline,
+		.pre_handler = trampoline_probe_handler,
+		.post_handler = trampoline_post_handler
+};
+
+struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
+{
+	struct hlist_node *node;
+	struct kretprobe_instance *ri;
+	hlist_for_each_entry(ri, node, &rp->free_instances, uflist)
+		return ri;
+	return NULL;
+}
+
+static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
+{
+	struct hlist_node *node;
+	struct kretprobe_instance *ri;
+	hlist_for_each_entry(ri, node, &rp->used_instances, uflist)
+		return ri;
+	return NULL;
+}
+
+struct kretprobe_instance *get_rp_inst(void *sara)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct task_struct *tsk;
+	struct kretprobe_instance *ri;
+
+	tsk = arch_get_kprobe_task(sara);
+	head = &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
+	hlist_for_each_entry(ri, node, head, hlist) {
+		if (ri->stack_addr == sara)
+			return ri;
+	}
+	return NULL;
+}
+
+void add_rp_inst(struct kretprobe_instance *ri)
+{
+	struct task_struct *tsk;
+	/*
+	 * Remove rp inst off the free list -
+	 * Add it back when probed function returns
+	 */
+	hlist_del(&ri->uflist);
+	tsk = arch_get_kprobe_task(ri->stack_addr);
+	/* Add rp inst onto table */
+	INIT_HLIST_NODE(&ri->hlist);
+	hlist_add_head(&ri->hlist,
+			&kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]);
+
+	/* Also add this rp inst to the used list. */
+	INIT_HLIST_NODE(&ri->uflist);
+	hlist_add_head(&ri->uflist, &ri->rp->used_instances);
+}
+
+void recycle_rp_inst(struct kretprobe_instance *ri)
+{
+	/* remove rp inst off the rprobe_inst_table */
+	hlist_del(&ri->hlist);
+	if (ri->rp) {
+		/* remove rp inst off the used list */
+		hlist_del(&ri->uflist);
+		/* put rp inst back onto the free list */
+		INIT_HLIST_NODE(&ri->uflist);
+		hlist_add_head(&ri->uflist, &ri->rp->free_instances);
+	} else
+		/* Unregistering */
+		kfree(ri);
+}
+
+struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
+{
+	return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
+}
+
+struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk)
+{
+	struct task_struct *tsk;
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct kretprobe_instance *ri;
+
+	head = &kretprobe_inst_table[hash_ptr(tk, KPROBE_HASH_BITS)];
+
+	hlist_for_each_entry(ri, node, head, hlist) {
+		tsk = arch_get_kprobe_task(ri->stack_addr);
+		if (tsk == tk)
+			return ri;
+	}
+	return NULL;
+}
+
+/*
+ * This function is called from do_exit or do_execv when task tk's stack is
+ * about to be recycled. Recycle any function-return probe instances
+ * associated with this task. These represent probed functions that have
+ * been called but may never return.
+ */
+void kprobe_flush_task(struct task_struct *tk)
+{
+	arch_kprobe_flush_task(tk, &kprobe_lock);
+}
+
+/*
+ * This kprobe pre_handler is registered with every kretprobe. When probe
+ * hits it will set up the return probe.
+ */
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
+{
+	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+
+	/*TODO: consider to only swap the RA after the last pre_handler fired */
+	arch_prepare_kretprobe(rp, regs);
+	return 0;
+}
+
+static inline void free_rp_inst(struct kretprobe *rp)
+{
+	struct kretprobe_instance *ri;
+	while ((ri = get_free_rp_inst(rp)) != NULL) {
+		hlist_del(&ri->uflist);
+		kfree(ri);
+	}
+}
+
 /*
  * Fill in the required fields of the "manager kprobe". Replace the
  * earlier kprobe in the hlist with the manager kprobe
@@ -257,16 +391,82 @@ void unregister_jprobe(struct jprobe *jp)
 	unregister_kprobe(&jp->kp);
 }
 
+#ifdef ARCH_SUPPORTS_KRETPROBES
+
+int register_kretprobe(struct kretprobe *rp)
+{
+	int ret = 0;
+	struct kretprobe_instance *inst;
+	int i;
+
+	rp->kp.pre_handler = pre_handler_kretprobe;
+
+	/* Pre-allocate memory for max kretprobe instances */
+	if (rp->maxactive <= 0) {
+#ifdef CONFIG_PREEMPT
+		rp->maxactive = max(10, 2 * NR_CPUS);
+#else
+		rp->maxactive = NR_CPUS;
+#endif
+	}
+	INIT_HLIST_HEAD(&rp->used_instances);
+	INIT_HLIST_HEAD(&rp->free_instances);
+	for (i = 0; i < rp->maxactive; i++) {
+		inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL);
+		if (inst == NULL) {
+			free_rp_inst(rp);
+			return -ENOMEM;
+		}
+		INIT_HLIST_NODE(&inst->uflist);
+		hlist_add_head(&inst->uflist, &rp->free_instances);
+	}
+
+	rp->nmissed = 0;
+	/* Establish function entry probe point */
+	if ((ret = register_kprobe(&rp->kp)) != 0)
+		free_rp_inst(rp);
+	return ret;
+}
+
+#else /* ARCH_SUPPORTS_KRETPROBES */
+
+int register_kretprobe(struct kretprobe *rp)
+{
+	return -ENOSYS;
+}
+
+#endif /* ARCH_SUPPORTS_KRETPROBES */
+
+void unregister_kretprobe(struct kretprobe *rp)
+{
+	unsigned long flags;
+	struct kretprobe_instance *ri;
+
+	unregister_kprobe(&rp->kp);
+	/* No race here */
+	spin_lock_irqsave(&kprobe_lock, flags);
+	free_rp_inst(rp);
+	while ((ri = get_used_rp_inst(rp)) != NULL) {
+		ri->rp = NULL;
+		hlist_del(&ri->uflist);
+	}
+	spin_unlock_irqrestore(&kprobe_lock, flags);
+}
+
 static int __init init_kprobes(void)
 {
 	int i, err = 0;
 
 	/* FIXME allocate the probe table, currently defined statically */
 	/* initialize all list heads */
-	for (i = 0; i < KPROBE_TABLE_SIZE; i++)
+	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		INIT_HLIST_HEAD(&kprobe_table[i]);
+		INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
+	}
 
 	err = register_die_notifier(&kprobe_exceptions_nb);
+	/* Register the trampoline probe for return probe */
+	register_kprobe(&trampoline_p);
 	return err;
 }
 
@@ -277,3 +477,6 @@ EXPORT_SYMBOL_GPL(unregister_kprobe);
 EXPORT_SYMBOL_GPL(register_jprobe);
 EXPORT_SYMBOL_GPL(unregister_jprobe);
 EXPORT_SYMBOL_GPL(jprobe_return);
+EXPORT_SYMBOL_GPL(register_kretprobe);
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
+
-- 
cgit v1.2.3-59-g8ed1b


From 7e1048b11c5afe79aac46a42e3ccec86b8365c6d Mon Sep 17 00:00:00 2001
From: Rusty Lynch <rusty.lynch@intel.com>
Date: Thu, 23 Jun 2005 00:09:25 -0700
Subject: [PATCH] Move kprobe [dis]arming into arch specific code

The architecture independent code of the current kprobes implementation is
arming and disarming kprobes at registration time.  The problem is that the
code is assuming that arming and disarming is just done by a simple write
of some magic value to an address.  This is problematic for ia64 where our
instructions look more like structures, and we can not insert break points
by just doing something like:

*p->addr = BREAKPOINT_INSTRUCTION;

The following patch to 2.6.12-rc4-mm2 adds two new architecture dependent
functions:

     * void arch_arm_kprobe(struct kprobe *p)
     * void arch_disarm_kprobe(struct kprobe *p)

and then adds the new functions for each of the architectures that already
implement kprobes (sparc64/ppc64/i386/x86_64).

I thought arch_[dis]arm_kprobe was the most descriptive of what was really
happening, but each of the architectures already had a disarm_kprobe()
function that was really a "disarm and do some other clean-up items as
needed when you stumble across a recursive kprobe." So...  I took the
liberty of changing the code that was calling disarm_kprobe() to call
arch_disarm_kprobe(), and then do the cleanup in the block of code dealing
with the recursive kprobe case.

So far this patch has been tested on i386, x86_64, and ppc64, but still
needs to be tested on sparc64.

Signed-off-by: Rusty Lynch <rusty.lynch@intel.com>
Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/kprobes.c    | 19 +++++++++++++++----
 arch/ppc64/kernel/kprobes.c   | 19 +++++++++++++++----
 arch/sparc64/kernel/kprobes.c | 31 ++++++++++++++++++-------------
 arch/x86_64/kernel/kprobes.c  | 26 ++++++++++++++++++--------
 include/linux/kprobes.h       |  2 ++
 kernel/kprobes.c              | 12 ++++--------
 6 files changed, 72 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index 048f754bbe23..2314d8d306fd 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -33,6 +33,7 @@
 #include <linux/ptrace.h>
 #include <linux/spinlock.h>
 #include <linux/preempt.h>
+#include <asm/cacheflush.h>
 #include <asm/kdebug.h>
 #include <asm/desc.h>
 
@@ -71,16 +72,25 @@ int arch_prepare_kprobe(struct kprobe *p)
 void arch_copy_kprobe(struct kprobe *p)
 {
 	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+	p->opcode = *p->addr;
 }
 
-void arch_remove_kprobe(struct kprobe *p)
+void arch_arm_kprobe(struct kprobe *p)
 {
+	*p->addr = BREAKPOINT_INSTRUCTION;
+	flush_icache_range((unsigned long) p->addr,
+			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
 
-static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
+void arch_disarm_kprobe(struct kprobe *p)
 {
 	*p->addr = p->opcode;
-	regs->eip = (unsigned long)p->addr;
+	flush_icache_range((unsigned long) p->addr,
+			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+}
+
+void arch_remove_kprobe(struct kprobe *p)
+{
 }
 
 static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
@@ -177,7 +187,8 @@ static int kprobe_handler(struct pt_regs *regs)
 				unlock_kprobes();
 				goto no_kprobe;
 			}
-			disarm_kprobe(p, regs);
+			arch_disarm_kprobe(p);
+			regs->eip = (unsigned long)p->addr;
 			ret = 1;
 		} else {
 			p = current_kprobe;
diff --git a/arch/ppc64/kernel/kprobes.c b/arch/ppc64/kernel/kprobes.c
index e950a2058a19..8c0920a6d03e 100644
--- a/arch/ppc64/kernel/kprobes.c
+++ b/arch/ppc64/kernel/kprobes.c
@@ -32,6 +32,7 @@
 #include <linux/ptrace.h>
 #include <linux/spinlock.h>
 #include <linux/preempt.h>
+#include <asm/cacheflush.h>
 #include <asm/kdebug.h>
 #include <asm/sstep.h>
 
@@ -61,16 +62,25 @@ int arch_prepare_kprobe(struct kprobe *p)
 void arch_copy_kprobe(struct kprobe *p)
 {
 	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+	p->opcode = *p->addr;
 }
 
-void arch_remove_kprobe(struct kprobe *p)
+void arch_arm_kprobe(struct kprobe *p)
 {
+	*p->addr = BREAKPOINT_INSTRUCTION;
+	flush_icache_range((unsigned long) p->addr,
+			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
 
-static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
+void arch_disarm_kprobe(struct kprobe *p)
 {
 	*p->addr = p->opcode;
-	regs->nip = (unsigned long)p->addr;
+	flush_icache_range((unsigned long) p->addr,
+			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+}
+
+void arch_remove_kprobe(struct kprobe *p)
+{
 }
 
 static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
@@ -101,7 +111,8 @@ static inline int kprobe_handler(struct pt_regs *regs)
 				unlock_kprobes();
 				goto no_kprobe;
 			}
-			disarm_kprobe(p, regs);
+			arch_disarm_kprobe(p);
+			regs->nip = (unsigned long)p->addr;
 			ret = 1;
 		} else {
 			p = current_kprobe;
diff --git a/arch/sparc64/kernel/kprobes.c b/arch/sparc64/kernel/kprobes.c
index 7066d7ba667a..d67195ba3fa2 100644
--- a/arch/sparc64/kernel/kprobes.c
+++ b/arch/sparc64/kernel/kprobes.c
@@ -6,7 +6,6 @@
 #include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/kprobes.h>
-
 #include <asm/kdebug.h>
 #include <asm/signal.h>
 
@@ -47,6 +46,19 @@ void arch_copy_kprobe(struct kprobe *p)
 {
 	p->ainsn.insn[0] = *p->addr;
 	p->ainsn.insn[1] = BREAKPOINT_INSTRUCTION_2;
+	p->opcode = *p->addr;
+}
+
+void arch_arm_kprobe(struct kprobe *p)
+{
+	*p->addr = BREAKPOINT_INSTRUCTION;
+	flushi(p->addr);
+}
+
+void arch_disarm_kprobe(struct kprobe *p)
+{
+	*p->addr = p->opcode;
+	flushi(p->addr);
 }
 
 void arch_remove_kprobe(struct kprobe *p)
@@ -78,17 +90,6 @@ static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 	}
 }
 
-static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
-{
-	*p->addr = p->opcode;
-	flushi(p->addr);
-
-	regs->tpc = (unsigned long) p->addr;
-	regs->tnpc = current_kprobe_orig_tnpc;
-	regs->tstate = ((regs->tstate & ~TSTATE_PIL) |
-			current_kprobe_orig_tstate_pil);
-}
-
 static int kprobe_handler(struct pt_regs *regs)
 {
 	struct kprobe *p;
@@ -109,7 +110,11 @@ static int kprobe_handler(struct pt_regs *regs)
 				unlock_kprobes();
 				goto no_kprobe;
 			}
-			disarm_kprobe(p, regs);
+			arch_disarm_kprobe(p);
+			regs->tpc = (unsigned long) p->addr;
+			regs->tnpc = current_kprobe_orig_tnpc;
+			regs->tstate = ((regs->tstate & ~TSTATE_PIL) |
+					current_kprobe_orig_tstate_pil);
 			ret = 1;
 		} else {
 			p = current_kprobe;
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index 203672ca7401..324bf57925a9 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -39,7 +39,7 @@
 #include <linux/slab.h>
 #include <linux/preempt.h>
 #include <linux/moduleloader.h>
-
+#include <asm/cacheflush.h>
 #include <asm/pgtable.h>
 #include <asm/kdebug.h>
 
@@ -216,19 +216,28 @@ void arch_copy_kprobe(struct kprobe *p)
 		BUG_ON((s64) (s32) disp != disp); /* Sanity check.  */
 		*ripdisp = disp;
 	}
+	p->opcode = *p->addr;
 }
 
-void arch_remove_kprobe(struct kprobe *p)
+void arch_arm_kprobe(struct kprobe *p)
 {
-	up(&kprobe_mutex);
-	free_insn_slot(p->ainsn.insn);
-	down(&kprobe_mutex);
+	*p->addr = BREAKPOINT_INSTRUCTION;
+	flush_icache_range((unsigned long) p->addr,
+			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
 
-static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs)
+void arch_disarm_kprobe(struct kprobe *p)
 {
 	*p->addr = p->opcode;
-	regs->rip = (unsigned long)p->addr;
+	flush_icache_range((unsigned long) p->addr,
+			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+}
+
+void arch_remove_kprobe(struct kprobe *p)
+{
+	up(&kprobe_mutex);
+	free_insn_slot(p->ainsn.insn);
+	down(&kprobe_mutex);
 }
 
 static void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
@@ -311,7 +320,8 @@ int kprobe_handler(struct pt_regs *regs)
 				unlock_kprobes();
 				goto no_kprobe;
 			}
-			disarm_kprobe(p, regs);
+			arch_disarm_kprobe(p);
+			regs->rip = (unsigned long)p->addr;
 			ret = 1;
 		} else {
 			p = current_kprobe;
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index fba39f87efec..0f90466fb8b0 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -165,6 +165,8 @@ static inline int kprobe_running(void)
 
 extern int arch_prepare_kprobe(struct kprobe *p);
 extern void arch_copy_kprobe(struct kprobe *p);
+extern void arch_arm_kprobe(struct kprobe *p);
+extern void arch_disarm_kprobe(struct kprobe *p);
 extern void arch_remove_kprobe(struct kprobe *p);
 extern void show_registers(struct pt_regs *regs);
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 692fbf75ab49..e8e0ae8a6e14 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -261,7 +261,7 @@ static inline void free_rp_inst(struct kretprobe *rp)
 static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
 {
 	ap->addr = p->addr;
-	ap->opcode = p->opcode;
+	memcpy(&ap->opcode, &p->opcode, sizeof(kprobe_opcode_t));
 	memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn));
 
 	ap->pre_handler = aggr_pre_handler;
@@ -304,10 +304,8 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
 /* kprobe removal house-keeping routines */
 static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
 {
-	*p->addr = p->opcode;
+	arch_disarm_kprobe(p);
 	hlist_del(&p->hlist);
-	flush_icache_range((unsigned long) p->addr,
-		   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 	spin_unlock_irqrestore(&kprobe_lock, flags);
 	arch_remove_kprobe(p);
 }
@@ -344,10 +342,8 @@ int register_kprobe(struct kprobe *p)
 	hlist_add_head(&p->hlist,
 		       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
 
-	p->opcode = *p->addr;
-	*p->addr = BREAKPOINT_INSTRUCTION;
-	flush_icache_range((unsigned long) p->addr,
-			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+  	arch_arm_kprobe(p);
+
 out:
 	spin_unlock_irqrestore(&kprobe_lock, flags);
 rm_kprobe:
-- 
cgit v1.2.3-59-g8ed1b


From 0aa55e4d7db822059fe8132fe9f2b7773c48216c Mon Sep 17 00:00:00 2001
From: Hien Nguyen <hien@us.ibm.com>
Date: Thu, 23 Jun 2005 00:09:26 -0700
Subject: [PATCH] kprobes: moves lock-unlock to non-arch kprobe_flush_task

This patch moves the lock/unlock of the arch specific kprobe_flush_task()
to the non-arch specific kprobe_flush_task().

Signed-off-by: Hien Nguyen <hien@us.ibm.com>
Acked-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/kprobes.c | 5 +----
 include/linux/kprobes.h    | 3 +--
 kernel/kprobes.c           | 5 ++++-
 3 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index 2314d8d306fd..b8e2bae0ab4f 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -138,17 +138,14 @@ void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs)
 	}
 }
 
-void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock)
+void arch_kprobe_flush_task(struct task_struct *tk)
 {
-	unsigned long flags = 0;
 	struct kretprobe_instance *ri;
-	spin_lock_irqsave(kp_lock, flags);
 	while ((ri = get_rp_inst_tsk(tk)) != NULL) {
 		*((unsigned long *)(ri->stack_addr)) =
 					(unsigned long) ri->ret_addr;
 		recycle_rp_inst(ri);
 	}
-	spin_unlock_irqrestore(kp_lock, flags);
 }
 
 /*
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 0f90466fb8b0..461391decc46 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -33,7 +33,6 @@
 #include <linux/list.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
-#include <linux/spinlock.h>
 
 #include <asm/kprobes.h>
 
@@ -101,7 +100,7 @@ extern void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs,
 							unsigned long flags);
 extern struct task_struct *arch_get_kprobe_task(void *ptr);
 extern void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs);
-extern void arch_kprobe_flush_task(struct task_struct *tk, spinlock_t *kp_lock);
+extern void arch_kprobe_flush_task(struct task_struct *tk);
 #else /* ARCH_SUPPORTS_KRETPROBES */
 static inline void kretprobe_trampoline(void)
 {
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e8e0ae8a6e14..dd42e717dd35 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -229,7 +229,10 @@ struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk)
  */
 void kprobe_flush_task(struct task_struct *tk)
 {
-	arch_kprobe_flush_task(tk, &kprobe_lock);
+	unsigned long flags = 0;
+	spin_lock_irqsave(&kprobe_lock, flags);
+	arch_kprobe_flush_task(tk);
+	spin_unlock_irqrestore(&kprobe_lock, flags);
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From ea32c65cc2d2294c04e9f81d0578a6f51febfdbf Mon Sep 17 00:00:00 2001
From: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Date: Thu, 23 Jun 2005 00:09:36 -0700
Subject: [PATCH] kprobes: Temporary disarming of reentrant probe

In situations where a kprobes handler calls a routine which has a probe on it,
then kprobes_handler() disarms the new probe forever.  This patch removes the
above limitation by temporarily disarming the new probe.  When another
probe hits while handling the old probe, kprobes_handler() saves the previous
kprobes state and handles the new probe without calling the new kprobes
registered handlers.  kprobe_post_handler() restores back the previous kprobes
state and the normal execution continues.

However, on the x86_64 architecture, reentrancy is provided only through
pre_handler().  If a routine having a probe is referenced through
post_handler(), then the probes on that routine are disarmed forever, since
the exception stack gets changed after the processor single-steps the
instruction of the new probe.

This patch includes generic changes to support temporary disarming on
reentrancy of probes.

Signed-of-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kprobes.h | 9 +++++++++
 kernel/kprobes.c        | 1 +
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 461391decc46..5e1a7b0d7b3f 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -36,6 +36,12 @@
 
 #include <asm/kprobes.h>
 
+/* kprobe_status settings */
+#define KPROBE_HIT_ACTIVE	0x00000001
+#define KPROBE_HIT_SS		0x00000002
+#define KPROBE_REENTER		0x00000004
+#define KPROBE_HIT_SSDONE	0x00000008
+
 struct kprobe;
 struct pt_regs;
 struct kretprobe;
@@ -55,6 +61,9 @@ struct kprobe {
 	/* list of kprobes for multi-handler support */
 	struct list_head list;
 
+	/*count the number of times this probe was temporarily disarmed */
+	unsigned long nmissed;
+
 	/* location of the probe point */
 	kprobe_opcode_t *addr;
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index dd42e717dd35..456ecedff2d4 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -335,6 +335,7 @@ int register_kprobe(struct kprobe *p)
 	}
 	spin_lock_irqsave(&kprobe_lock, flags);
 	old_p = get_kprobe(p->addr);
+	p->nmissed = 0;
 	if (old_p) {
 		ret = register_aggr_kprobe(old_p, p);
 		goto out;
-- 
cgit v1.2.3-59-g8ed1b


From d6e711448137ca3301512cec41a2c2ce852b3d0a Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Thu, 23 Jun 2005 00:09:43 -0700
Subject: [PATCH] setuid core dump

Add a new `suid_dumpable' sysctl:

This value can be used to query and set the core dump mode for setuid
or otherwise protected/tainted binaries. The modes are

0 - (default) - traditional behaviour.  Any process which has changed
    privilege levels or is execute only will not be dumped

1 - (debug) - all processes dump core when possible.  The core dump is
    owned by the current user and no security is applied.  This is intended
    for system debugging situations only.  Ptrace is unchecked.

2 - (suidsafe) - any binary which normally would not be dumped is dumped
    readable by root only.  This allows the end user to remove such a dump but
    not access it directly.  For security reasons core dumps in this mode will
    not overwrite one another or other files.  This mode is appropriate when
    administrators are attempting to debug problems in a normal environment.

(akpm:

> > +EXPORT_SYMBOL(suid_dumpable);
>
> EXPORT_SYMBOL_GPL?

No problem to me.

> >  	if (current->euid == current->uid && current->egid == current->gid)
> >  		current->mm->dumpable = 1;
>
> Should this be SUID_DUMP_USER?

Actually the feedback I had from last time was that the SUID_ defines
should go because it's clearer to follow the numbers. They can go
everywhere (and there are lots of places where dumpable is tested/used
as a bool in untouched code)

> Maybe this should be renamed to `dump_policy' or something.  Doing that
> would help us catch any code which isn't using the #defines, too.

Fair comment. The patch was designed to be easy to maintain for Red Hat
rather than for merging. Changing that field would create a gigantic
diff because it is used all over the place.

)

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/sysctl/kernel.txt | 20 ++++++++++++++++++++
 fs/exec.c                       | 23 +++++++++++++++++++++--
 fs/proc/base.c                  |  6 ++++--
 include/linux/binfmts.h         |  5 +++++
 include/linux/sched.h           |  2 +-
 include/linux/sysctl.h          |  1 +
 kernel/sys.c                    | 22 +++++++++++-----------
 kernel/sysctl.c                 |  9 +++++++++
 security/commoncap.c            |  2 +-
 security/dummy.c                |  2 +-
 10 files changed, 74 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 35159176997b..9f11d36a8c10 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -49,6 +49,7 @@ show up in /proc/sys/kernel:
 - shmmax                      [ sysv ipc ]
 - shmmni
 - stop-a                      [ SPARC only ]
+- suid_dumpable
 - sysrq                       ==> Documentation/sysrq.txt
 - tainted
 - threads-max
@@ -300,6 +301,25 @@ kernel.  This value defaults to SHMMAX.
 
 ==============================================================
 
+suid_dumpable:
+
+This value can be used to query and set the core dump mode for setuid
+or otherwise protected/tainted binaries. The modes are
+
+0 - (default) - traditional behaviour. Any process which has changed
+	privilege levels or is execute only will not be dumped
+1 - (debug) - all processes dump core when possible. The core dump is
+	owned by the current user and no security is applied. This is
+	intended for system debugging situations only. Ptrace is unchecked.
+2 - (suidsafe) - any binary which normally would not be dumped is dumped
+	readable by root only. This allows the end user to remove
+	such a dump but not access it directly. For security reasons
+	core dumps in this mode will not overwrite one another or
+	other files. This mode is appropriate when adminstrators are
+	attempting to debug problems in a normal environment.
+
+==============================================================
+
 tainted: 
 
 Non-zero if the kernel has been tainted.  Numeric values, which
diff --git a/fs/exec.c b/fs/exec.c
index 3a4b35a14c0d..48871917d363 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -58,6 +58,9 @@
 
 int core_uses_pid;
 char core_pattern[65] = "core";
+int suid_dumpable = 0;
+
+EXPORT_SYMBOL(suid_dumpable);
 /* The maximal length of core_pattern is also specified in sysctl.c */
 
 static struct linux_binfmt *formats;
@@ -864,6 +867,9 @@ int flush_old_exec(struct linux_binprm * bprm)
 
 	if (current->euid == current->uid && current->egid == current->gid)
 		current->mm->dumpable = 1;
+	else
+		current->mm->dumpable = suid_dumpable;
+
 	name = bprm->filename;
 
 	/* Copies the binary name from after last slash */
@@ -884,7 +890,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 	    permission(bprm->file->f_dentry->d_inode,MAY_READ, NULL) ||
 	    (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) {
 		suid_keys(current);
-		current->mm->dumpable = 0;
+		current->mm->dumpable = suid_dumpable;
 	}
 
 	/* An exec changes our domain. We are no longer part of the thread
@@ -1432,6 +1438,8 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 	struct inode * inode;
 	struct file * file;
 	int retval = 0;
+	int fsuid = current->fsuid;
+	int flag = 0;
 
 	binfmt = current->binfmt;
 	if (!binfmt || !binfmt->core_dump)
@@ -1441,6 +1449,16 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 		up_write(&mm->mmap_sem);
 		goto fail;
 	}
+
+	/*
+	 *	We cannot trust fsuid as being the "true" uid of the
+	 *	process nor do we know its entire history. We only know it
+	 *	was tainted so we dump it as root in mode 2.
+	 */
+	if (mm->dumpable == 2) {	/* Setuid core dump mode */
+		flag = O_EXCL;		/* Stop rewrite attacks */
+		current->fsuid = 0;	/* Dump root private */
+	}
 	mm->dumpable = 0;
 	init_completion(&mm->core_done);
 	spin_lock_irq(&current->sighand->siglock);
@@ -1466,7 +1484,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
  	lock_kernel();
 	format_corename(corename, core_pattern, signr);
 	unlock_kernel();
-	file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE, 0600);
+	file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600);
 	if (IS_ERR(file))
 		goto fail_unlock;
 	inode = file->f_dentry->d_inode;
@@ -1491,6 +1509,7 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
 close_fail:
 	filp_close(file, NULL);
 fail_unlock:
+	current->fsuid = fsuid;
 	complete_all(&mm->core_done);
 fail:
 	return retval;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e31903aadd96..ace151fa4878 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -314,7 +314,7 @@ static int may_ptrace_attach(struct task_struct *task)
 	     (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
 		goto out;
 	rmb();
-	if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
+	if (task->mm->dumpable != 1 && !capable(CAP_SYS_PTRACE))
 		goto out;
 	if (security_ptrace(current, task))
 		goto out;
@@ -1113,7 +1113,9 @@ static int task_dumpable(struct task_struct *task)
 	if (mm)
 		dumpable = mm->dumpable;
 	task_unlock(task);
-	return dumpable;
+	if(dumpable == 1)
+		return 1;
+	return 0;
 }
 
 
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 7e736e201c46..c1e82c514443 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -69,6 +69,11 @@ extern void remove_arg_zero(struct linux_binprm *);
 extern int search_binary_handler(struct linux_binprm *,struct pt_regs *);
 extern int flush_old_exec(struct linux_binprm * bprm);
 
+extern int suid_dumpable;
+#define SUID_DUMP_DISABLE	0	/* No setuid dumping */
+#define SUID_DUMP_USER		1	/* Dump as user of process */
+#define SUID_DUMP_ROOT		2	/* Dump as root */
+
 /* Stack area protections */
 #define EXSTACK_DEFAULT   0	/* Whatever the arch defaults to */
 #define EXSTACK_DISABLE_X 1	/* Disable executable stacks */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b58afd97a180..901742f92389 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -246,7 +246,7 @@ struct mm_struct {
 
 	unsigned long saved_auxv[42]; /* for /proc/PID/auxv */
 
-	unsigned dumpable:1;
+	unsigned dumpable:2;
 	cpumask_t cpu_vm_mask;
 
 	/* Architecture-specific MM context */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index a17745c80a91..614e939c78a4 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -136,6 +136,7 @@ enum
 	KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */
 	KERN_BOOTLOADER_TYPE=67, /* int: boot loader type */
 	KERN_RANDOMIZE=68, /* int: randomize virtual address space */
+	KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */
 };
 
 
diff --git a/kernel/sys.c b/kernel/sys.c
index f006632c2ba7..0a2c8cda9638 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -525,7 +525,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 	}
 	if (new_egid != old_egid)
 	{
-		current->mm->dumpable = 0;
+		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
 	if (rgid != (gid_t) -1 ||
@@ -556,7 +556,7 @@ asmlinkage long sys_setgid(gid_t gid)
 	{
 		if(old_egid != gid)
 		{
-			current->mm->dumpable=0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->gid = current->egid = current->sgid = current->fsgid = gid;
@@ -565,7 +565,7 @@ asmlinkage long sys_setgid(gid_t gid)
 	{
 		if(old_egid != gid)
 		{
-			current->mm->dumpable=0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->egid = current->fsgid = gid;
@@ -596,7 +596,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
 
 	if(dumpclear)
 	{
-		current->mm->dumpable = 0;
+		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
 	current->uid = new_ruid;
@@ -653,7 +653,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 
 	if (new_euid != old_euid)
 	{
-		current->mm->dumpable=0;
+		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
 	current->fsuid = current->euid = new_euid;
@@ -703,7 +703,7 @@ asmlinkage long sys_setuid(uid_t uid)
 
 	if (old_euid != uid)
 	{
-		current->mm->dumpable = 0;
+		current->mm->dumpable = suid_dumpable;
 		smp_wmb();
 	}
 	current->fsuid = current->euid = uid;
@@ -748,7 +748,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 	if (euid != (uid_t) -1) {
 		if (euid != current->euid)
 		{
-			current->mm->dumpable = 0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->euid = euid;
@@ -798,7 +798,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 	if (egid != (gid_t) -1) {
 		if (egid != current->egid)
 		{
-			current->mm->dumpable = 0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->egid = egid;
@@ -845,7 +845,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
 	{
 		if (uid != old_fsuid)
 		{
-			current->mm->dumpable = 0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->fsuid = uid;
@@ -875,7 +875,7 @@ asmlinkage long sys_setfsgid(gid_t gid)
 	{
 		if (gid != old_fsgid)
 		{
-			current->mm->dumpable = 0;
+			current->mm->dumpable = suid_dumpable;
 			smp_wmb();
 		}
 		current->fsgid = gid;
@@ -1652,7 +1652,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 				error = 1;
 			break;
 		case PR_SET_DUMPABLE:
-			if (arg2 != 0 && arg2 != 1) {
+			if (arg2 < 0 || arg2 > 2) {
 				error = -EINVAL;
 				break;
 			}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 701d12c63068..24a4d12d5aa9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -58,6 +58,7 @@ extern int sysctl_overcommit_ratio;
 extern int max_threads;
 extern int sysrq_enabled;
 extern int core_uses_pid;
+extern int suid_dumpable;
 extern char core_pattern[];
 extern int cad_pid;
 extern int pid_max;
@@ -950,6 +951,14 @@ static ctl_table fs_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+	{
+		.ctl_name	= KERN_SETUID_DUMPABLE,
+		.procname	= "suid_dumpable",
+		.data		= &suid_dumpable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/security/commoncap.c b/security/commoncap.c
index 849b8c338ee8..04c12f58d656 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -149,7 +149,7 @@ void cap_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 
 	if (bprm->e_uid != current->uid || bprm->e_gid != current->gid ||
 	    !cap_issubset (new_permitted, current->cap_permitted)) {
-		current->mm->dumpable = 0;
+		current->mm->dumpable = suid_dumpable;
 
 		if (unsafe & ~LSM_UNSAFE_PTRACE_CAP) {
 			if (!capable(CAP_SETUID)) {
diff --git a/security/dummy.c b/security/dummy.c
index b32eff146547..6ff887586479 100644
--- a/security/dummy.c
+++ b/security/dummy.c
@@ -130,7 +130,7 @@ static void dummy_bprm_free_security (struct linux_binprm *bprm)
 static void dummy_bprm_apply_creds (struct linux_binprm *bprm, int unsafe)
 {
 	if (bprm->e_uid != current->uid || bprm->e_gid != current->gid) {
-		current->mm->dumpable = 0;
+		current->mm->dumpable = suid_dumpable;
 
 		if ((unsafe & ~LSM_UNSAFE_PTRACE_CAP) && !capable(CAP_SETUID)) {
 			bprm->e_uid = current->uid;
-- 
cgit v1.2.3-59-g8ed1b


From ef3daeda7b58f046f94b26637d500354038d39f4 Mon Sep 17 00:00:00 2001
From: Yoav Zach <yoav_zach@yahoo.com>
Date: Thu, 23 Jun 2005 00:09:58 -0700
Subject: [PATCH] Don't force O_LARGEFILE for 32 bit processes on ia64

In ia64 kernel, the O_LARGEFILE flag is forced when opening a file.  This
is problematic for execution of 32 bit processes, which are not largefile
aware, either by SW emulation or by HW execution.

For such processes, the problem is two-fold:

1) When trying to open a file that is larger than 4G
   the operation should fail, but it does not
2) Writing to an offset larger than 4G should fail, but
   it does not

The proposed patch takes advantage of the way 32 bit processes are
identified in ia64 systems.  Such processes have PER_LINUX32 for their
personality.  With the patch, the ia64 kernel will not enforce the
O_LARGEFILE flag if the current process has PER_LINUX32 set.  The behavior
for all other architectures remains unchanged.

Signed-off-by: Yoav Zach <yoav.zach@intel.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/open.c                | 7 ++++---
 include/asm-ia64/fcntl.h | 2 ++
 include/linux/fcntl.h    | 4 ++++
 3 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/open.c b/fs/open.c
index 963bd81a44c8..2ebb72c1a876 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -21,6 +21,7 @@
 #include <linux/vfs.h>
 #include <asm/uaccess.h>
 #include <linux/fs.h>
+#include <linux/personality.h>
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
 
@@ -935,9 +936,9 @@ asmlinkage long sys_open(const char __user * filename, int flags, int mode)
 	char * tmp;
 	int fd, error;
 
-#if BITS_PER_LONG != 32
-	flags |= O_LARGEFILE;
-#endif
+	if (force_o_largefile())
+		flags |= O_LARGEFILE;
+
 	tmp = getname(filename);
 	fd = PTR_ERR(tmp);
 	if (!IS_ERR(tmp)) {
diff --git a/include/asm-ia64/fcntl.h b/include/asm-ia64/fcntl.h
index d193981bb1d8..c9f8d835d0cc 100644
--- a/include/asm-ia64/fcntl.h
+++ b/include/asm-ia64/fcntl.h
@@ -81,4 +81,6 @@ struct flock {
 
 #define F_LINUX_SPECIFIC_BASE	1024
 
+#define force_o_largefile() ( ! (current->personality & PER_LINUX32) )
+
 #endif /* _ASM_IA64_FCNTL_H */
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index 704fb76b6334..8a7c82151de9 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -25,6 +25,10 @@
 
 #ifdef __KERNEL__
 
+#ifndef force_o_largefile
+#define force_o_largefile() (BITS_PER_LONG != 32)
+#endif
+
 #if BITS_PER_LONG == 32
 #define IS_GETLK32(cmd)		((cmd) == F_GETLK)
 #define IS_SETLK32(cmd)		((cmd) == F_SETLK)
-- 
cgit v1.2.3-59-g8ed1b


From 46c271bedd2c8444b1d05bc44928beec0c07debc Mon Sep 17 00:00:00 2001
From: Peter Osterlund <petero2@telia.com>
Date: Thu, 23 Jun 2005 00:10:02 -0700
Subject: [PATCH] Improve CD/DVD packet driver write performance

This patch improves write performance for the CD/DVD packet writing driver.
The logic for switching between reading and writing has been changed so
that streaming writes are no longer interrupted by read requests.

Signed-off-by: Peter Osterlund <petero2@telia.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/pktcdvd.c | 36 ++++++++++++++++++++----------------
 include/linux/pktcdvd.h |  2 +-
 2 files changed, 21 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index bc56770bcc90..7f3d78de265c 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -467,14 +467,12 @@ static int pkt_set_speed(struct pktcdvd_device *pd, unsigned write_speed, unsign
  * Queue a bio for processing by the low-level CD device. Must be called
  * from process context.
  */
-static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio, int high_prio_read)
+static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio)
 {
 	spin_lock(&pd->iosched.lock);
 	if (bio_data_dir(bio) == READ) {
 		pkt_add_list_last(bio, &pd->iosched.read_queue,
 				  &pd->iosched.read_queue_tail);
-		if (high_prio_read)
-			pd->iosched.high_prio_read = 1;
 	} else {
 		pkt_add_list_last(bio, &pd->iosched.write_queue,
 				  &pd->iosched.write_queue_tail);
@@ -490,15 +488,16 @@ static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio, int high_p
  * requirements for CDRW drives:
  * - A cache flush command must be inserted before a read request if the
  *   previous request was a write.
- * - Switching between reading and writing is slow, so don't it more often
+ * - Switching between reading and writing is slow, so don't do it more often
  *   than necessary.
+ * - Optimize for throughput at the expense of latency. This means that streaming
+ *   writes will never be interrupted by a read, but if the drive has to seek
+ *   before the next write, switch to reading instead if there are any pending
+ *   read requests.
  * - Set the read speed according to current usage pattern. When only reading
  *   from the device, it's best to use the highest possible read speed, but
  *   when switching often between reading and writing, it's better to have the
  *   same read and write speeds.
- * - Reads originating from user space should have higher priority than reads
- *   originating from pkt_gather_data, because some process is usually waiting
- *   on reads of the first kind.
  */
 static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
 {
@@ -512,21 +511,24 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
 
 	for (;;) {
 		struct bio *bio;
-		int reads_queued, writes_queued, high_prio_read;
+		int reads_queued, writes_queued;
 
 		spin_lock(&pd->iosched.lock);
 		reads_queued = (pd->iosched.read_queue != NULL);
 		writes_queued = (pd->iosched.write_queue != NULL);
-		if (!reads_queued)
-			pd->iosched.high_prio_read = 0;
-		high_prio_read = pd->iosched.high_prio_read;
 		spin_unlock(&pd->iosched.lock);
 
 		if (!reads_queued && !writes_queued)
 			break;
 
 		if (pd->iosched.writing) {
-			if (high_prio_read || (!writes_queued && reads_queued)) {
+			int need_write_seek = 1;
+			spin_lock(&pd->iosched.lock);
+			bio = pd->iosched.write_queue;
+			spin_unlock(&pd->iosched.lock);
+			if (bio && (bio->bi_sector == pd->iosched.last_write))
+				need_write_seek = 0;
+			if (need_write_seek && reads_queued) {
 				if (atomic_read(&pd->cdrw.pending_bios) > 0) {
 					VPRINTK("pktcdvd: write, waiting\n");
 					break;
@@ -559,8 +561,10 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
 
 		if (bio_data_dir(bio) == READ)
 			pd->iosched.successive_reads += bio->bi_size >> 10;
-		else
+		else {
 			pd->iosched.successive_reads = 0;
+			pd->iosched.last_write = bio->bi_sector + bio_sectors(bio);
+		}
 		if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) {
 			if (pd->read_speed == pd->write_speed) {
 				pd->read_speed = MAX_SPEED;
@@ -765,7 +769,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
 
 		atomic_inc(&pkt->io_wait);
 		bio->bi_rw = READ;
-		pkt_queue_bio(pd, bio, 0);
+		pkt_queue_bio(pd, bio);
 		frames_read++;
 	}
 
@@ -1062,7 +1066,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
 
 	atomic_set(&pkt->io_wait, 1);
 	pkt->w_bio->bi_rw = WRITE;
-	pkt_queue_bio(pd, pkt->w_bio, 0);
+	pkt_queue_bio(pd, pkt->w_bio);
 }
 
 static void pkt_finish_packet(struct packet_data *pkt, int uptodate)
@@ -2120,7 +2124,7 @@ static int pkt_make_request(request_queue_t *q, struct bio *bio)
 		cloned_bio->bi_private = psd;
 		cloned_bio->bi_end_io = pkt_end_io_read_cloned;
 		pd->stats.secs_r += bio->bi_size >> 9;
-		pkt_queue_bio(pd, cloned_bio, 1);
+		pkt_queue_bio(pd, cloned_bio);
 		return 0;
 	}
 
diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h
index 4e2d2a942ecb..4b32bce9a289 100644
--- a/include/linux/pktcdvd.h
+++ b/include/linux/pktcdvd.h
@@ -159,7 +159,7 @@ struct packet_iosched
 	struct bio		*read_queue_tail;
 	struct bio		*write_queue;
 	struct bio		*write_queue_tail;
-	int			high_prio_read;	/* An important read request has been queued */
+	sector_t		last_write;	/* The sector where the last write ended */
 	int			successive_reads;
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From bb93e3a52f8db7210258a1a2134cced0b78a46e1 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 23 Jun 2005 00:10:15 -0700
Subject: [PATCH] block: add unlocked_ioctl support for block devices

This patch allows block device drivers to convert their ioctl functions to
unlocked_ioctl() like character devices and other subsystems.  All
functions that were called with the BKL held before are still used that
way, but I would not be surprised if it could be removed from the ioctl
functions in drivers/block/ioctl.c themselves.

As a side note, I found that compat_blkdev_ioctl() acquires the BKL as
well, which looks like a bug.  I have checked that every user of
disk->fops->compat_ioctl() in the current git tree gets the BKL itself, so
it could easily be removed from compat_blkdev_ioctl().

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/ioctl.c | 74 +++++++++++++++++++++++++++++++++++++--------------
 fs/block_dev.c        |  5 ++--
 include/linux/fs.h    |  1 +
 3 files changed, 57 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/ioctl.c b/drivers/block/ioctl.c
index 6d7bcc9da9e7..6e278474f9a8 100644
--- a/drivers/block/ioctl.c
+++ b/drivers/block/ioctl.c
@@ -133,11 +133,9 @@ static int put_u64(unsigned long arg, u64 val)
 	return put_user(val, (u64 __user *)arg);
 }
 
-int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
-			unsigned long arg)
+static int blkdev_locked_ioctl(struct file *file, struct block_device *bdev,
+				unsigned cmd, unsigned long arg)
 {
-	struct block_device *bdev = inode->i_bdev;
-	struct gendisk *disk = bdev->bd_disk;
 	struct backing_dev_info *bdi;
 	int ret, n;
 
@@ -190,36 +188,72 @@ int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
 		return put_ulong(arg, bdev->bd_inode->i_size >> 9);
 	case BLKGETSIZE64:
 		return put_u64(arg, bdev->bd_inode->i_size);
+	}
+	return -ENOIOCTLCMD;
+}
+
+static int blkdev_driver_ioctl(struct inode *inode, struct file *file,
+		struct gendisk *disk, unsigned cmd, unsigned long arg)
+{
+	int ret;
+	if (disk->fops->unlocked_ioctl)
+		return disk->fops->unlocked_ioctl(file, cmd, arg);
+
+	if (disk->fops->ioctl) {
+		lock_kernel();
+		ret = disk->fops->ioctl(inode, file, cmd, arg);
+		unlock_kernel();
+		return ret;
+	}
+
+	return -ENOTTY;
+}
+
+int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
+			unsigned long arg)
+{
+	struct block_device *bdev = inode->i_bdev;
+	struct gendisk *disk = bdev->bd_disk;
+	int ret, n;
+
+	switch(cmd) {
 	case BLKFLSBUF:
 		if (!capable(CAP_SYS_ADMIN))
 			return -EACCES;
-		if (disk->fops->ioctl) {
-			ret = disk->fops->ioctl(inode, file, cmd, arg);
-			/* -EINVAL to handle old uncorrected drivers */
-			if (ret != -EINVAL && ret != -ENOTTY)
-				return ret;
-		}
+
+		ret = blkdev_driver_ioctl(inode, file, disk, cmd, arg);
+		/* -EINVAL to handle old uncorrected drivers */
+		if (ret != -EINVAL && ret != -ENOTTY)
+			return ret;
+
+		lock_kernel();
 		fsync_bdev(bdev);
 		invalidate_bdev(bdev, 0);
+		unlock_kernel();
 		return 0;
+
 	case BLKROSET:
-		if (disk->fops->ioctl) {
-			ret = disk->fops->ioctl(inode, file, cmd, arg);
-			/* -EINVAL to handle old uncorrected drivers */
-			if (ret != -EINVAL && ret != -ENOTTY)
-				return ret;
-		}
+		ret = blkdev_driver_ioctl(inode, file, disk, cmd, arg);
+		/* -EINVAL to handle old uncorrected drivers */
+		if (ret != -EINVAL && ret != -ENOTTY)
+			return ret;
 		if (!capable(CAP_SYS_ADMIN))
 			return -EACCES;
 		if (get_user(n, (int __user *)(arg)))
 			return -EFAULT;
+		lock_kernel();
 		set_device_ro(bdev, n);
+		unlock_kernel();
 		return 0;
-	default:
-		if (disk->fops->ioctl)
-			return disk->fops->ioctl(inode, file, cmd, arg);
 	}
-	return -ENOTTY;
+
+	lock_kernel();
+	ret = blkdev_locked_ioctl(file, bdev, cmd, arg);
+	unlock_kernel();
+	if (ret != -ENOIOCTLCMD)
+		return ret;
+
+	return blkdev_driver_ioctl(inode, file, disk, cmd, arg);
 }
 
 /* Most of the generic ioctls are handled in the normal fallback path.
diff --git a/fs/block_dev.c b/fs/block_dev.c
index c0cbd1bc1a02..e0df94c37b7e 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -777,8 +777,7 @@ static ssize_t blkdev_file_aio_write(struct kiocb *iocb, const char __user *buf,
 	return generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
 }
 
-static int block_ioctl(struct inode *inode, struct file *file, unsigned cmd,
-			unsigned long arg)
+static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
 	return blkdev_ioctl(file->f_mapping->host, file, cmd, arg);
 }
@@ -803,7 +802,7 @@ struct file_operations def_blk_fops = {
   	.aio_write	= blkdev_file_aio_write, 
 	.mmap		= generic_file_mmap,
 	.fsync		= block_fsync,
-	.ioctl		= block_ioctl,
+	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= compat_blkdev_ioctl,
 #endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3622e952e98c..9b1278e21279 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -884,6 +884,7 @@ struct block_device_operations {
 	int (*open) (struct inode *, struct file *);
 	int (*release) (struct inode *, struct file *);
 	int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
+	long (*unlocked_ioctl) (struct file *, unsigned, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned, unsigned long);
 	int (*media_changed) (struct gendisk *);
 	int (*revalidate_disk) (struct gendisk *);
-- 
cgit v1.2.3-59-g8ed1b


From 45778ca819accab1a4a3378b3566cab0f189164f Mon Sep 17 00:00:00 2001
From: Christoph Lameter <christoph@graphe.net>
Date: Thu, 23 Jun 2005 00:10:17 -0700
Subject: [PATCH] Remove f_error field from struct file

The following patch removes the f_error field and all checks of f_error.

Trond said:

  f_error was introduced for NFS, and made sense when we were guaranteed
  always to have a file pointer around when write errors occurred.  Since
  then, we have (for various reasons) had to introduce the nfs_open_context in
  order to track the file read/write state, and it made sense to move our
  f_error tracking there too.

Signed-off-by: Christoph Lameter <christoph@lameter.com>
Acked-by: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfs/direct.c    |  5 -----
 fs/open.c          | 16 ++++------------
 include/linux/fs.h |  1 -
 mm/filemap.c       |  6 ------
 4 files changed, 4 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index d6a30c844de3..6537f2c4ae44 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -751,11 +751,6 @@ nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count,
 	retval = -EFAULT;
 	if (!access_ok(VERIFY_READ, iov.iov_base, iov.iov_len))
 		goto out;
-        if (file->f_error) {
-                retval = file->f_error;
-                file->f_error = 0;
-                goto out;
-        }
 	retval = -EFBIG;
 	if (limit != RLIM_INFINITY) {
 		if (pos >= limit) {
diff --git a/fs/open.c b/fs/open.c
index 2ebb72c1a876..5dd411b084bf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -981,23 +981,15 @@ asmlinkage long sys_creat(const char __user * pathname, int mode)
  */
 int filp_close(struct file *filp, fl_owner_t id)
 {
-	int retval;
-
-	/* Report and clear outstanding errors */
-	retval = filp->f_error;
-	if (retval)
-		filp->f_error = 0;
+	int retval = 0;
 
 	if (!file_count(filp)) {
 		printk(KERN_ERR "VFS: Close: file count is 0\n");
-		return retval;
+		return 0;
 	}
 
-	if (filp->f_op && filp->f_op->flush) {
-		int err = filp->f_op->flush(filp);
-		if (!retval)
-			retval = err;
-	}
+	if (filp->f_op && filp->f_op->flush)
+		retval = filp->f_op->flush(filp);
 
 	dnotify_flush(filp, id);
 	locks_remove_posix(filp, id);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9b1278e21279..517bf4966bf5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -581,7 +581,6 @@ struct file {
 	atomic_t		f_count;
 	unsigned int 		f_flags;
 	mode_t			f_mode;
-	int			f_error;
 	loff_t			f_pos;
 	struct fown_struct	f_owner;
 	unsigned int		f_uid, f_gid;
diff --git a/mm/filemap.c b/mm/filemap.c
index 4a2fee2cb62b..a3598b542a31 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1827,12 +1827,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
         if (unlikely(*pos < 0))
                 return -EINVAL;
 
-        if (unlikely(file->f_error)) {
-                int err = file->f_error;
-                file->f_error = 0;
-                return err;
-        }
-
 	if (!isblk) {
 		/* FIXME: this is for backwards compatibility with 2.4 */
 		if (file->f_flags & O_APPEND)
-- 
cgit v1.2.3-59-g8ed1b


From f9fd27a253d5e0b23531d12ce7ad15b6535d4486 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 23 Jun 2005 00:10:19 -0700
Subject: [PATCH] acl endianness annotations

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/posix_acl_xattr.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h
index 5efd0a6dad94..fe271c1947b2 100644
--- a/include/linux/posix_acl_xattr.h
+++ b/include/linux/posix_acl_xattr.h
@@ -23,13 +23,13 @@
 #define ACL_UNDEFINED_ID	(-1)
 
 typedef struct {
-	__u16			e_tag;
-	__u16			e_perm;
-	__u32			e_id;
+	__le16			e_tag;
+	__le16			e_perm;
+	__le32			e_id;
 } posix_acl_xattr_entry;
 
 typedef struct {
-	__u32			a_version;
+	__le32			a_version;
 	posix_acl_xattr_entry	a_entries[0];
 } posix_acl_xattr_header;
 
-- 
cgit v1.2.3-59-g8ed1b


From 9a59f452abe11f569e13ec16c51e6d61c54b9838 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 23 Jun 2005 00:10:19 -0700
Subject: [PATCH] remove <linux/xattr_acl.h>

This file duplicates <linux/posix_acl_xattr.h>, using slightly different
names.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc/boot/simple/misc.c          |  2 +-
 arch/ppc/boot/simple/mpc10x_memory.c |  2 +-
 fs/ext2/acl.c                        | 12 ++++++------
 fs/ext2/acl.h                        |  2 +-
 fs/ext3/acl.c                        | 12 ++++++------
 fs/ext3/acl.h                        |  2 +-
 fs/jfs/acl.c                         | 11 ++++++-----
 fs/jfs/jfs_acl.h                     |  2 --
 fs/jfs/super.c                       |  1 +
 fs/jfs/xattr.c                       |  7 ++++---
 fs/nfsd/vfs.c                        |  9 ++++-----
 fs/reiserfs/xattr_acl.c              | 26 +++++++++++++-------------
 include/linux/posix_acl_xattr.h      |  3 +++
 include/linux/reiserfs_acl.h         |  1 -
 14 files changed, 47 insertions(+), 45 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ppc/boot/simple/misc.c b/arch/ppc/boot/simple/misc.c
index ab0f9902cb67..e02de5b467a4 100644
--- a/arch/ppc/boot/simple/misc.c
+++ b/arch/ppc/boot/simple/misc.c
@@ -222,7 +222,7 @@ decompress_kernel(unsigned long load_addr, int num_words, unsigned long cksum)
 	puts("\n");
 
 	puts("Uncompressing Linux...");
-	gunzip(0x0, 0x400000, zimage_start, &zimage_size);
+	gunzip(NULL, 0x400000, zimage_start, &zimage_size);
 	puts("done.\n");
 
 	/* get the bi_rec address */
diff --git a/arch/ppc/boot/simple/mpc10x_memory.c b/arch/ppc/boot/simple/mpc10x_memory.c
index 977daedc14c0..20d92a34ceb8 100644
--- a/arch/ppc/boot/simple/mpc10x_memory.c
+++ b/arch/ppc/boot/simple/mpc10x_memory.c
@@ -33,7 +33,7 @@
 
 #define MPC10X_PCI_OP(rw, size, type, op, mask)			 	\
 static void								\
-mpc10x_##rw##_config_##size(unsigned int *cfg_addr, 			\
+mpc10x_##rw##_config_##size(unsigned int __iomem *cfg_addr, 			\
 		unsigned int *cfg_data, int devfn, int offset,		\
 		type val)						\
 {									\
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c
index 25f4a64fd6bc..213148c36ebe 100644
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -396,12 +396,12 @@ static size_t
 ext2_xattr_list_acl_access(struct inode *inode, char *list, size_t list_size,
 			   const char *name, size_t name_len)
 {
-	const size_t size = sizeof(XATTR_NAME_ACL_ACCESS);
+	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
 
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return 0;
 	if (list && size <= list_size)
-		memcpy(list, XATTR_NAME_ACL_ACCESS, size);
+		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
 	return size;
 }
 
@@ -409,12 +409,12 @@ static size_t
 ext2_xattr_list_acl_default(struct inode *inode, char *list, size_t list_size,
 			    const char *name, size_t name_len)
 {
-	const size_t size = sizeof(XATTR_NAME_ACL_DEFAULT);
+	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
 
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return 0;
 	if (list && size <= list_size)
-		memcpy(list, XATTR_NAME_ACL_DEFAULT, size);
+		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
 	return size;
 }
 
@@ -506,14 +506,14 @@ ext2_xattr_set_acl_default(struct inode *inode, const char *name,
 }
 
 struct xattr_handler ext2_xattr_acl_access_handler = {
-	.prefix	= XATTR_NAME_ACL_ACCESS,
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
 	.list	= ext2_xattr_list_acl_access,
 	.get	= ext2_xattr_get_acl_access,
 	.set	= ext2_xattr_set_acl_access,
 };
 
 struct xattr_handler ext2_xattr_acl_default_handler = {
-	.prefix	= XATTR_NAME_ACL_DEFAULT,
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
 	.list	= ext2_xattr_list_acl_default,
 	.get	= ext2_xattr_get_acl_default,
 	.set	= ext2_xattr_set_acl_default,
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h
index fed96ae81a7d..0bde85bafe38 100644
--- a/fs/ext2/acl.h
+++ b/fs/ext2/acl.h
@@ -4,7 +4,7 @@
   (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
 */
 
-#include <linux/xattr_acl.h>
+#include <linux/posix_acl_xattr.h>
 
 #define EXT2_ACL_VERSION	0x0001
 
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 638c13a26c03..133f5aa581bb 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -417,12 +417,12 @@ static size_t
 ext3_xattr_list_acl_access(struct inode *inode, char *list, size_t list_len,
 			   const char *name, size_t name_len)
 {
-	const size_t size = sizeof(XATTR_NAME_ACL_ACCESS);
+	const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS);
 
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return 0;
 	if (list && size <= list_len)
-		memcpy(list, XATTR_NAME_ACL_ACCESS, size);
+		memcpy(list, POSIX_ACL_XATTR_ACCESS, size);
 	return size;
 }
 
@@ -430,12 +430,12 @@ static size_t
 ext3_xattr_list_acl_default(struct inode *inode, char *list, size_t list_len,
 			    const char *name, size_t name_len)
 {
-	const size_t size = sizeof(XATTR_NAME_ACL_DEFAULT);
+	const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT);
 
 	if (!test_opt(inode->i_sb, POSIX_ACL))
 		return 0;
 	if (list && size <= list_len)
-		memcpy(list, XATTR_NAME_ACL_DEFAULT, size);
+		memcpy(list, POSIX_ACL_XATTR_DEFAULT, size);
 	return size;
 }
 
@@ -535,14 +535,14 @@ ext3_xattr_set_acl_default(struct inode *inode, const char *name,
 }
 
 struct xattr_handler ext3_xattr_acl_access_handler = {
-	.prefix	= XATTR_NAME_ACL_ACCESS,
+	.prefix	= POSIX_ACL_XATTR_ACCESS,
 	.list	= ext3_xattr_list_acl_access,
 	.get	= ext3_xattr_get_acl_access,
 	.set	= ext3_xattr_set_acl_access,
 };
 
 struct xattr_handler ext3_xattr_acl_default_handler = {
-	.prefix	= XATTR_NAME_ACL_DEFAULT,
+	.prefix	= POSIX_ACL_XATTR_DEFAULT,
 	.list	= ext3_xattr_list_acl_default,
 	.get	= ext3_xattr_get_acl_default,
 	.set	= ext3_xattr_set_acl_default,
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h
index 98af0c0d0ba9..92d50b53a933 100644
--- a/fs/ext3/acl.h
+++ b/fs/ext3/acl.h
@@ -4,7 +4,7 @@
   (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org>
 */
 
-#include <linux/xattr_acl.h>
+#include <linux/posix_acl_xattr.h>
 
 #define EXT3_ACL_VERSION	0x0001
 
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 30a2bf9eeda5..e892dab40c26 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -21,6 +21,7 @@
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/quotaops.h>
+#include <linux/posix_acl_xattr.h>
 #include "jfs_incore.h"
 #include "jfs_xattr.h"
 #include "jfs_acl.h"
@@ -36,11 +37,11 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type)
 
 	switch(type) {
 		case ACL_TYPE_ACCESS:
-			ea_name = XATTR_NAME_ACL_ACCESS;
+			ea_name = POSIX_ACL_XATTR_ACCESS;
 			p_acl = &ji->i_acl;
 			break;
 		case ACL_TYPE_DEFAULT:
-			ea_name = XATTR_NAME_ACL_DEFAULT;
+			ea_name = POSIX_ACL_XATTR_DEFAULT;
 			p_acl = &ji->i_default_acl;
 			break;
 		default:
@@ -88,11 +89,11 @@ static int jfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 
 	switch(type) {
 		case ACL_TYPE_ACCESS:
-			ea_name = XATTR_NAME_ACL_ACCESS;
+			ea_name = POSIX_ACL_XATTR_ACCESS;
 			p_acl = &ji->i_acl;
 			break;
 		case ACL_TYPE_DEFAULT:
-			ea_name = XATTR_NAME_ACL_DEFAULT;
+			ea_name = POSIX_ACL_XATTR_DEFAULT;
 			p_acl = &ji->i_default_acl;
 			if (!S_ISDIR(inode->i_mode))
 				return acl ? -EACCES : 0;
@@ -101,7 +102,7 @@ static int jfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 			return -EINVAL;
 	}
 	if (acl) {
-		size = xattr_acl_size(acl->a_count);
+		size = posix_acl_xattr_size(acl->a_count);
 		value = kmalloc(size, GFP_KERNEL);
 		if (!value)
 			return -ENOMEM;
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h
index d2ae430adecf..a3acd3eec059 100644
--- a/fs/jfs/jfs_acl.h
+++ b/fs/jfs/jfs_acl.h
@@ -20,8 +20,6 @@
 
 #ifdef CONFIG_JFS_POSIX_ACL
 
-#include <linux/xattr_acl.h>
-
 int jfs_permission(struct inode *, int, struct nameidata *);
 int jfs_init_acl(struct inode *, struct inode *);
 int jfs_setattr(struct dentry *, struct iattr *);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 810a3653d8b3..ee32211288ce 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -24,6 +24,7 @@
 #include <linux/completion.h>
 #include <linux/vfs.h>
 #include <linux/moduleparam.h>
+#include <linux/posix_acl.h>
 #include <asm/uaccess.h>
 
 #include "jfs_incore.h"
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index 6016373701a3..ee438d429d45 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -19,6 +19,7 @@
 
 #include <linux/fs.h>
 #include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
 #include <linux/quotaops.h>
 #include "jfs_incore.h"
 #include "jfs_superblock.h"
@@ -718,9 +719,9 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
 		return -EPERM;
 
 	/*
-	 * XATTR_NAME_ACL_ACCESS is tied to i_mode
+	 * POSIX_ACL_XATTR_ACCESS is tied to i_mode
 	 */
-	if (strcmp(name, XATTR_NAME_ACL_ACCESS) == 0) {
+	if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) {
 		acl = posix_acl_from_xattr(value, value_len);
 		if (IS_ERR(acl)) {
 			rc = PTR_ERR(acl);
@@ -750,7 +751,7 @@ static int can_set_system_xattr(struct inode *inode, const char *name,
 		JFS_IP(inode)->i_acl = JFS_ACL_NOT_CACHED;
 
 		return 0;
-	} else if (strcmp(name, XATTR_NAME_ACL_DEFAULT) == 0) {
+	} else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) {
 		acl = posix_acl_from_xattr(value, value_len);
 		if (IS_ERR(acl)) {
 			rc = PTR_ERR(acl);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ae3940dc85cc..de340ffd33c3 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -50,7 +50,6 @@
 #include <linux/posix_acl.h>
 #ifdef CONFIG_NFSD_V4
 #include <linux/posix_acl_xattr.h>
-#include <linux/xattr_acl.h>
 #include <linux/xattr.h>
 #include <linux/nfs4.h>
 #include <linux/nfs4_acl.h>
@@ -425,13 +424,13 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		goto out_nfserr;
 
 	if (pacl) {
-		error = set_nfsv4_acl_one(dentry, pacl, XATTR_NAME_ACL_ACCESS);
+		error = set_nfsv4_acl_one(dentry, pacl, POSIX_ACL_XATTR_ACCESS);
 		if (error < 0)
 			goto out_nfserr;
 	}
 
 	if (dpacl) {
-		error = set_nfsv4_acl_one(dentry, dpacl, XATTR_NAME_ACL_DEFAULT);
+		error = set_nfsv4_acl_one(dentry, dpacl, POSIX_ACL_XATTR_DEFAULT);
 		if (error < 0)
 			goto out_nfserr;
 	}
@@ -498,7 +497,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
 	struct posix_acl *pacl = NULL, *dpacl = NULL;
 	unsigned int flags = 0;
 
-	pacl = _get_posix_acl(dentry, XATTR_NAME_ACL_ACCESS);
+	pacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_ACCESS);
 	if (IS_ERR(pacl) && PTR_ERR(pacl) == -ENODATA)
 		pacl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
 	if (IS_ERR(pacl)) {
@@ -508,7 +507,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac
 	}
 
 	if (S_ISDIR(inode->i_mode)) {
-		dpacl = _get_posix_acl(dentry, XATTR_NAME_ACL_DEFAULT);
+		dpacl = _get_posix_acl(dentry, POSIX_ACL_XATTR_DEFAULT);
 		if (IS_ERR(dpacl) && PTR_ERR(dpacl) == -ENODATA)
 			dpacl = NULL;
 		else if (IS_ERR(dpacl)) {
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index e302071903a1..c312881c5f53 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -4,7 +4,7 @@
 #include <linux/errno.h>
 #include <linux/pagemap.h>
 #include <linux/xattr.h>
-#include <linux/xattr_acl.h>
+#include <linux/posix_acl_xattr.h>
 #include <linux/reiserfs_xattr.h>
 #include <linux/reiserfs_acl.h>
 #include <asm/uaccess.h>
@@ -192,11 +192,11 @@ reiserfs_get_acl(struct inode *inode, int type)
 
         switch (type) {
             case ACL_TYPE_ACCESS:
-                name = XATTR_NAME_ACL_ACCESS;
+                name = POSIX_ACL_XATTR_ACCESS;
                 p_acl = &reiserfs_i->i_acl_access;
                 break;
             case ACL_TYPE_DEFAULT:
-                name = XATTR_NAME_ACL_DEFAULT;
+                name = POSIX_ACL_XATTR_DEFAULT;
                 p_acl = &reiserfs_i->i_acl_default;
                 break;
             default:
@@ -260,7 +260,7 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 
         switch (type) {
             case ACL_TYPE_ACCESS:
-                name = XATTR_NAME_ACL_ACCESS;
+                name = POSIX_ACL_XATTR_ACCESS;
                 p_acl = &reiserfs_i->i_acl_access;
                 if (acl) {
                     mode_t mode = inode->i_mode;
@@ -275,7 +275,7 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
                 }
                 break;
             case ACL_TYPE_DEFAULT:
-                name = XATTR_NAME_ACL_DEFAULT;
+                name = POSIX_ACL_XATTR_DEFAULT;
                 p_acl = &reiserfs_i->i_acl_default;
                 if (!S_ISDIR (inode->i_mode))
                     return acl ? -EACCES : 0;
@@ -468,7 +468,7 @@ static int
 posix_acl_access_get(struct inode *inode, const char *name,
 			  void *buffer, size_t size)
 {
-	if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1)
+	if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS)-1)
 		return -EINVAL;
 	return xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
 }
@@ -477,7 +477,7 @@ static int
 posix_acl_access_set(struct inode *inode, const char *name,
 			  const void *value, size_t size, int flags)
 {
-	if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1)
+	if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS)-1)
 		return -EINVAL;
 	return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
 }
@@ -487,7 +487,7 @@ posix_acl_access_del (struct inode *inode, const char *name)
 {
     struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
     struct posix_acl **acl = &reiserfs_i->i_acl_access;
-    if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1)
+    if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS)-1)
 	return -EINVAL;
     if (!IS_ERR (*acl) && *acl) {
         posix_acl_release (*acl);
@@ -510,7 +510,7 @@ posix_acl_access_list (struct inode *inode, const char *name, int namelen, char
 }
 
 struct reiserfs_xattr_handler posix_acl_access_handler = {
-	.prefix = XATTR_NAME_ACL_ACCESS,
+	.prefix = POSIX_ACL_XATTR_ACCESS,
 	.get = posix_acl_access_get,
 	.set = posix_acl_access_set,
 	.del = posix_acl_access_del,
@@ -521,7 +521,7 @@ static int
 posix_acl_default_get (struct inode *inode, const char *name,
 			   void *buffer, size_t size)
 {
-	if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1)
+	if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT)-1)
 		return -EINVAL;
 	return xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
 }
@@ -530,7 +530,7 @@ static int
 posix_acl_default_set(struct inode *inode, const char *name,
 			   const void *value, size_t size, int flags)
 {
-	if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1)
+	if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT)-1)
 		return -EINVAL;
 	return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
 }
@@ -540,7 +540,7 @@ posix_acl_default_del (struct inode *inode, const char *name)
 {
     struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
     struct posix_acl **acl = &reiserfs_i->i_acl_default;
-    if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1)
+    if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT)-1)
 	return -EINVAL;
     if (!IS_ERR (*acl) && *acl) {
         posix_acl_release (*acl);
@@ -563,7 +563,7 @@ posix_acl_default_list (struct inode *inode, const char *name, int namelen, char
 }
 
 struct reiserfs_xattr_handler posix_acl_default_handler = {
-	.prefix = XATTR_NAME_ACL_DEFAULT,
+	.prefix = POSIX_ACL_XATTR_DEFAULT,
 	.get = posix_acl_default_get,
 	.set = posix_acl_default_set,
 	.del = posix_acl_default_del,
diff --git a/include/linux/posix_acl_xattr.h b/include/linux/posix_acl_xattr.h
index fe271c1947b2..6e53c34035cd 100644
--- a/include/linux/posix_acl_xattr.h
+++ b/include/linux/posix_acl_xattr.h
@@ -52,4 +52,7 @@ posix_acl_xattr_count(size_t size)
 	return size / sizeof(posix_acl_xattr_entry);
 }
 
+struct posix_acl *posix_acl_from_xattr(const void *value, size_t size);
+int posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size);
+
 #endif	/* _POSIX_ACL_XATTR_H */
diff --git a/include/linux/reiserfs_acl.h b/include/linux/reiserfs_acl.h
index 2aef9c3f5ce8..0760507a545b 100644
--- a/include/linux/reiserfs_acl.h
+++ b/include/linux/reiserfs_acl.h
@@ -1,6 +1,5 @@
 #include <linux/init.h>
 #include <linux/posix_acl.h>
-#include <linux/xattr_acl.h>
 
 #define REISERFS_ACL_VERSION	0x0001
 
-- 
cgit v1.2.3-59-g8ed1b


From c43dc2fd885b5658cfd7cedb7bcca20910c517a4 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise <bcrl@kvack.org>
Date: Thu, 23 Jun 2005 00:10:27 -0700
Subject: [PATCH] aio: make wait_queue ->task ->private

In the upcoming aio_down patch, it is useful to store a private data
pointer in the kiocb's wait_queue.  Since we provide our own wake up
function and do not require the task_struct pointer, it makes sense to
convert the task pointer into a generic private pointer.

Signed-off-by: Benjamin LaHaise <benjamin.c.lahaise@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/wait.h | 16 ++++++++--------
 kernel/sched.c       |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index c9486c3efb4a..d38c9fecdc36 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -33,7 +33,7 @@ int default_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key
 struct __wait_queue {
 	unsigned int flags;
 #define WQ_FLAG_EXCLUSIVE	0x01
-	struct task_struct * task;
+	void *private;
 	wait_queue_func_t func;
 	struct list_head task_list;
 };
@@ -60,7 +60,7 @@ typedef struct __wait_queue_head wait_queue_head_t;
  */
 
 #define __WAITQUEUE_INITIALIZER(name, tsk) {				\
-	.task		= tsk,						\
+	.private	= tsk,						\
 	.func		= default_wake_function,			\
 	.task_list	= { NULL, NULL } }
 
@@ -86,7 +86,7 @@ static inline void init_waitqueue_head(wait_queue_head_t *q)
 static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
 {
 	q->flags = 0;
-	q->task = p;
+	q->private = p;
 	q->func = default_wake_function;
 }
 
@@ -94,7 +94,7 @@ static inline void init_waitqueue_func_entry(wait_queue_t *q,
 					wait_queue_func_t func)
 {
 	q->flags = 0;
-	q->task = NULL;
+	q->private = NULL;
 	q->func = func;
 }
 
@@ -110,7 +110,7 @@ static inline int waitqueue_active(wait_queue_head_t *q)
  * aio specifies a wait queue entry with an async notification
  * callback routine, not associated with any task.
  */
-#define is_sync_wait(wait)	(!(wait) || ((wait)->task))
+#define is_sync_wait(wait)	(!(wait) || ((wait)->private))
 
 extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait));
 extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait));
@@ -384,7 +384,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 
 #define DEFINE_WAIT(name)						\
 	wait_queue_t name = {						\
-		.task		= current,				\
+		.private	= current,				\
 		.func		= autoremove_wake_function,		\
 		.task_list	= LIST_HEAD_INIT((name).task_list),	\
 	}
@@ -393,7 +393,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 	struct wait_bit_queue name = {					\
 		.key = __WAIT_BIT_KEY_INITIALIZER(word, bit),		\
 		.wait	= {						\
-			.task		= current,			\
+			.private	= current,			\
 			.func		= wake_bit_function,		\
 			.task_list	=				\
 				LIST_HEAD_INIT((name).wait.task_list),	\
@@ -402,7 +402,7 @@ int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
 
 #define init_wait(wait)							\
 	do {								\
-		(wait)->task = current;					\
+		(wait)->private = current;				\
 		(wait)->func = autoremove_wake_function;		\
 		INIT_LIST_HEAD(&(wait)->task_list);			\
 	} while (0)
diff --git a/kernel/sched.c b/kernel/sched.c
index 6ee4515d5a20..76080d142e3d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2869,7 +2869,7 @@ need_resched:
 
 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
 {
-	task_t *p = curr->task;
+	task_t *p = curr->private;
 	return try_to_wake_up(p, mode, sync);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From bfb07599da289881d3bcbb601a110e997fc7444b Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 23 Jun 2005 00:10:32 -0700
Subject: [PATCH] Introduce tty_unregister_ldisc()

It's a bit strange to see tty_register_ldisc call in modules' exit
functions.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/tty.txt |  2 +-
 drivers/char/tty_io.c | 37 ++++++++++++++++++++++++-------------
 include/linux/tty.h   |  1 +
 3 files changed, 26 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/tty.txt b/Documentation/tty.txt
index 3958cf746dde..8ff7bc2a0811 100644
--- a/Documentation/tty.txt
+++ b/Documentation/tty.txt
@@ -22,7 +22,7 @@ copy of the structure. You must not re-register over the top of the line
 discipline even with the same data or your computer again will be eaten by
 demons.
 
-In order to remove a line discipline call tty_register_ldisc passing NULL.
+In order to remove a line discipline call tty_unregister_ldisc().
 In ancient times this always worked. In modern times the function will
 return -EBUSY if the ldisc is currently in use. Since the ldisc referencing
 code manages the module counts this should not usually be a concern.
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 31831030f73f..cc4b43bad703 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -251,7 +251,7 @@ static void tty_set_termios_ldisc(struct tty_struct *tty, int num)
  
 static DEFINE_SPINLOCK(tty_ldisc_lock);
 static DECLARE_WAIT_QUEUE_HEAD(tty_ldisc_wait);
-static struct tty_ldisc tty_ldiscs[NR_LDISCS];	/* line disc dispatch table	*/
+static struct tty_ldisc tty_ldiscs[NR_LDISCS];	/* line disc dispatch table */
 
 int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc)
 {
@@ -262,24 +262,35 @@ int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc)
 		return -EINVAL;
 	
 	spin_lock_irqsave(&tty_ldisc_lock, flags);
-	if (new_ldisc) {
-		tty_ldiscs[disc] = *new_ldisc;
-		tty_ldiscs[disc].num = disc;
-		tty_ldiscs[disc].flags |= LDISC_FLAG_DEFINED;
-		tty_ldiscs[disc].refcount = 0;
-	} else {
-		if(tty_ldiscs[disc].refcount)
-			ret = -EBUSY;
-		else
-			tty_ldiscs[disc].flags &= ~LDISC_FLAG_DEFINED;
-	}
+	tty_ldiscs[disc] = *new_ldisc;
+	tty_ldiscs[disc].num = disc;
+	tty_ldiscs[disc].flags |= LDISC_FLAG_DEFINED;
+	tty_ldiscs[disc].refcount = 0;
 	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
 	
 	return ret;
 }
-
 EXPORT_SYMBOL(tty_register_ldisc);
 
+int tty_unregister_ldisc(int disc)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	if (disc < N_TTY || disc >= NR_LDISCS)
+		return -EINVAL;
+
+	spin_lock_irqsave(&tty_ldisc_lock, flags);
+	if (tty_ldiscs[disc].refcount)
+		ret = -EBUSY;
+	else
+		tty_ldiscs[disc].flags &= ~LDISC_FLAG_DEFINED;
+	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(tty_unregister_ldisc);
+
 struct tty_ldisc *tty_ldisc_get(int disc)
 {
 	unsigned long flags;
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 1b76106272d3..59ff42c629ec 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -345,6 +345,7 @@ extern int tty_check_change(struct tty_struct * tty);
 extern void stop_tty(struct tty_struct * tty);
 extern void start_tty(struct tty_struct * tty);
 extern int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc);
+extern int tty_unregister_ldisc(int disc);
 extern int tty_register_driver(struct tty_driver *driver);
 extern int tty_unregister_driver(struct tty_driver *driver);
 extern void tty_register_device(struct tty_driver *driver, unsigned index, struct device *dev);
-- 
cgit v1.2.3-59-g8ed1b


From 4749f32da939d4e4160541b2cadc22492bb507ec Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 23 Jun 2005 11:36:56 +0200
Subject: [PATCH] better USB_MON dependencies

This makes the USB_MON less confusing.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/usb/core/hcd.c   |  2 +-
 drivers/usb/core/hcd.h   |  2 +-
 drivers/usb/mon/Kconfig  | 13 ++++---------
 drivers/usb/mon/Makefile |  2 +-
 include/linux/usb.h      |  2 +-
 5 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index d041782e0c8b..0da23732e807 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -1794,7 +1794,7 @@ EXPORT_SYMBOL (usb_remove_hcd);
 
 /*-------------------------------------------------------------------------*/
 
-#if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE)
+#if defined(CONFIG_USB_MON)
 
 struct usb_mon_operations *mon_ops;
 
diff --git a/drivers/usb/core/hcd.h b/drivers/usb/core/hcd.h
index f67cf1e634fc..325a51656c3f 100644
--- a/drivers/usb/core/hcd.h
+++ b/drivers/usb/core/hcd.h
@@ -399,7 +399,7 @@ static inline void usbfs_cleanup(void) { }
 
 /*-------------------------------------------------------------------------*/
 
-#if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE)
+#if defined(CONFIG_USB_MON)
 
 struct usb_mon_operations {
 	void (*urb_submit)(struct usb_bus *bus, struct urb *urb);
diff --git a/drivers/usb/mon/Kconfig b/drivers/usb/mon/Kconfig
index 4e6152aa5f19..777642e26b9a 100644
--- a/drivers/usb/mon/Kconfig
+++ b/drivers/usb/mon/Kconfig
@@ -2,13 +2,9 @@
 # USB Monitor configuration
 #
 
-# In normal life, it makes little sense to have usbmon as a module, and in fact
-# it is harmful, because there is no way to autoload the module.
-# The 'm' option is allowed for hackers who debug the usbmon itself,
-# and for those who have usbcore as a module.
 config USB_MON
-	tristate "USB Monitor"
-	depends on USB
+	bool "USB Monitor"
+	depends on USB!=n
 	default y
 	help
 	  If you say Y here, a component which captures the USB traffic
@@ -17,6 +13,5 @@ config USB_MON
 	  Harding's USBMon.
 
 	  This is somewhat experimental at this time, but it should be safe,
-	  as long as you aren't building this as a module and then removing it.
-
-	  If unsure, say Y. Do not say M.
+	  as long as you aren't using modular USB and trying to remove this
+	  module.
diff --git a/drivers/usb/mon/Makefile b/drivers/usb/mon/Makefile
index 3cff8d444bb1..f18d10ce91f9 100644
--- a/drivers/usb/mon/Makefile
+++ b/drivers/usb/mon/Makefile
@@ -4,4 +4,4 @@
 
 usbmon-objs	:= mon_main.o mon_stat.o mon_text.o
 
-obj-$(CONFIG_USB_MON)	+= usbmon.o
+obj-$(CONFIG_USB)	+= usbmon.o
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 3d508bf08402..eb282b581546 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -290,7 +290,7 @@ struct usb_bus {
 	struct class_device *class_dev;	/* class device for this bus */
 	struct kref kref;		/* handles reference counting this bus */
 	void (*release)(struct usb_bus *bus);	/* function to destroy this bus's memory */
-#if defined(CONFIG_USB_MON) || defined(CONFIG_USB_MON_MODULE)
+#if defined(CONFIG_USB_MON)
 	struct mon_bus *mon_bus;	/* non-null when associated */
 	int monitored;			/* non-zero when monitored */
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 317a76f9a44b437d6301718f4e5d08bd93f98da7 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 12:19:55 -0700
Subject: [TCP]: Add pluggable congestion control algorithm infrastructure.

Allow TCP to have multiple pluggable congestion control algorithms.
Algorithms are defined by a set of operations and can be built in
or modules.  The legacy "new RENO" algorithm is used as a starting
point and fallback.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sysctl.h     |   9 +-
 include/linux/tcp.h        |  49 +--
 include/net/tcp.h          | 237 +++++----------
 net/ipv4/Makefile          |   3 +-
 net/ipv4/sysctl_net_ipv4.c | 114 +++----
 net/ipv4/tcp.c             |   2 +
 net/ipv4/tcp_cong.c        | 195 ++++++++++++
 net/ipv4/tcp_diag.c        |  20 +-
 net/ipv4/tcp_input.c       | 737 ++++-----------------------------------------
 net/ipv4/tcp_ipv4.c        |   3 +
 net/ipv4/tcp_minisocks.c   |   4 +-
 net/ipv4/tcp_output.c      |  23 +-
 net/ipv6/tcp_ipv6.c        |   2 +-
 13 files changed, 399 insertions(+), 999 deletions(-)
 create mode 100644 net/ipv4/tcp_cong.c

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 614e939c78a4..72965bfe6cfb 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -333,21 +333,14 @@ enum
 	NET_TCP_FRTO=92,
 	NET_TCP_LOW_LATENCY=93,
 	NET_IPV4_IPFRAG_SECRET_INTERVAL=94,
-	NET_TCP_WESTWOOD=95,
 	NET_IPV4_IGMP_MAX_MSF=96,
 	NET_TCP_NO_METRICS_SAVE=97,
-	NET_TCP_VEGAS=98,
-	NET_TCP_VEGAS_ALPHA=99,
-	NET_TCP_VEGAS_BETA=100,
-	NET_TCP_VEGAS_GAMMA=101,
- 	NET_TCP_BIC=102,
- 	NET_TCP_BIC_FAST_CONVERGENCE=103,
-	NET_TCP_BIC_LOW_WINDOW=104,
 	NET_TCP_DEFAULT_WIN_SCALE=105,
 	NET_TCP_MODERATE_RCVBUF=106,
 	NET_TCP_TSO_WIN_DIVISOR=107,
 	NET_TCP_BIC_BETA=108,
 	NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
+	NET_TCP_CONG_CONTROL=110,
 };
 
 enum {
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 97a7c9e03df5..3ea75dd6640a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -203,13 +203,6 @@ struct tcp_sack_block {
 	__u32	end_seq;
 };
 
-enum tcp_congestion_algo {
-	TCP_RENO=0,
-	TCP_VEGAS,
-	TCP_WESTWOOD,
-	TCP_BIC,
-};
-
 struct tcp_options_received {
 /*	PAWS/RTTM data	*/
 	long	ts_recent_stamp;/* Time we stored ts_recent (for aging) */
@@ -305,7 +298,7 @@ struct tcp_sock {
 	__u8	reordering;	/* Packet reordering metric.		*/
 	__u8	frto_counter;	/* Number of new acks after RTO */
 
-	__u8	adv_cong;	/* Using Vegas, Westwood, or BIC */
+	__u8	unused;
 	__u8	defer_accept;	/* User waits for some data after accept() */
 
 /* RTT measurement */
@@ -401,37 +394,10 @@ struct tcp_sock {
 		__u32	time;
 	} rcvq_space;
 
-/* TCP Westwood structure */
-        struct {
-                __u32    bw_ns_est;        /* first bandwidth estimation..not too smoothed 8) */
-                __u32    bw_est;           /* bandwidth estimate */
-                __u32    rtt_win_sx;       /* here starts a new evaluation... */
-                __u32    bk;
-                __u32    snd_una;          /* used for evaluating the number of acked bytes */
-                __u32    cumul_ack;
-                __u32    accounted;
-                __u32    rtt;
-                __u32    rtt_min;          /* minimum observed RTT */
-        } westwood;
-
-/* Vegas variables */
-	struct {
-		__u32	beg_snd_nxt;	/* right edge during last RTT */
-		__u32	beg_snd_una;	/* left edge  during last RTT */
-		__u32	beg_snd_cwnd;	/* saves the size of the cwnd */
-		__u8	doing_vegas_now;/* if true, do vegas for this RTT */
-		__u16	cntRTT;		/* # of RTTs measured within last RTT */
-		__u32	minRTT;		/* min of RTTs measured within last RTT (in usec) */
-		__u32	baseRTT;	/* the min of all Vegas RTT measurements seen (in usec) */
-	} vegas;
-
-	/* BI TCP Parameters */
-	struct {
-		__u32	cnt;		/* increase cwnd by 1 after this number of ACKs */
-		__u32 	last_max_cwnd;	/* last maximium snd_cwnd */
-		__u32	last_cwnd;	/* the last snd_cwnd */
-		__u32   last_stamp;     /* time when updated last_cwnd */
-	} bictcp;
+	/* Pluggable TCP congestion control hook */
+	struct tcp_congestion_ops *ca_ops;
+	u32	ca_priv[16];
+#define TCP_CA_PRIV_SIZE	(16*sizeof(u32))
 };
 
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
@@ -439,6 +405,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
 	return (struct tcp_sock *)sk;
 }
 
+static inline void *tcp_ca(const struct tcp_sock *tp)
+{
+	return (void *) tp->ca_priv;
+}
+
 #endif
 
 #endif	/* _LINUX_TCP_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f730935b824a..e427cf35915c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -505,25 +505,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk)
 #else
 # define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG)
 #endif
-
-#define BICTCP_BETA_SCALE    1024	/* Scale factor beta calculation
-					 * max_cwnd = snd_cwnd * beta
-					 */
-#define BICTCP_MAX_INCREMENT 32		/*
-					 * Limit on the amount of
-					 * increment allowed during
-					 * binary search.
-					 */
-#define BICTCP_FUNC_OF_MIN_INCR 11	/*
-					 * log(B/Smin)/log(B/(B-1))+1,
-					 * Smin:min increment
-					 * B:log factor
-					 */
-#define BICTCP_B		4	 /*
-					  * In binary search,
-					  * go to point (max+min)/N
-					  */
-
 /*
  *	TCP option
  */
@@ -596,16 +577,7 @@ extern int sysctl_tcp_adv_win_scale;
 extern int sysctl_tcp_tw_reuse;
 extern int sysctl_tcp_frto;
 extern int sysctl_tcp_low_latency;
-extern int sysctl_tcp_westwood;
-extern int sysctl_tcp_vegas_cong_avoid;
-extern int sysctl_tcp_vegas_alpha;
-extern int sysctl_tcp_vegas_beta;
-extern int sysctl_tcp_vegas_gamma;
 extern int sysctl_tcp_nometrics_save;
-extern int sysctl_tcp_bic;
-extern int sysctl_tcp_bic_fast_convergence;
-extern int sysctl_tcp_bic_low_window;
-extern int sysctl_tcp_bic_beta;
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
 
@@ -1136,6 +1108,80 @@ static inline void tcp_packets_out_dec(struct tcp_sock *tp,
 	tp->packets_out -= tcp_skb_pcount(skb);
 }
 
+/* Events passed to congestion control interface */
+enum tcp_ca_event {
+	CA_EVENT_TX_START,	/* first transmit when no packets in flight */
+	CA_EVENT_CWND_RESTART,	/* congestion window restart */
+	CA_EVENT_COMPLETE_CWR,	/* end of congestion recovery */
+	CA_EVENT_FRTO,		/* fast recovery timeout */
+	CA_EVENT_LOSS,		/* loss timeout */
+	CA_EVENT_FAST_ACK,	/* in sequence ack */
+	CA_EVENT_SLOW_ACK,	/* other ack */
+};
+
+/*
+ * Interface for adding new TCP congestion control handlers
+ */
+#define TCP_CA_NAME_MAX	16
+struct tcp_congestion_ops {
+	struct list_head	list;
+
+	/* initialize private data (optional) */
+	void (*init)(struct tcp_sock *tp);
+	/* cleanup private data  (optional) */
+	void (*release)(struct tcp_sock *tp);
+
+	/* return slow start threshold (required) */
+	u32 (*ssthresh)(struct tcp_sock *tp);
+	/* lower bound for congestion window (required) */
+	u32 (*min_cwnd)(struct tcp_sock *tp);
+	/* do new cwnd calculation (required) */
+	void (*cong_avoid)(struct tcp_sock *tp, u32 ack,
+			   u32 rtt, u32 in_flight, int good_ack);
+	/* round trip time sample per acked packet (optional) */
+	void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt);
+	/* call before changing ca_state (optional) */
+	void (*set_state)(struct tcp_sock *tp, u8 new_state);
+	/* call when cwnd event occurs (optional) */
+	void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev);
+	/* new value of cwnd after loss (optional) */
+	u32  (*undo_cwnd)(struct tcp_sock *tp);
+	/* hook for packet ack accounting (optional) */
+	void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked);
+	/* get info for tcp_diag (optional) */
+	void (*get_info)(struct tcp_sock *tp, u32 ext, struct sk_buff *skb);
+
+	char 		name[TCP_CA_NAME_MAX];
+	struct module 	*owner;
+};
+
+extern int tcp_register_congestion_control(struct tcp_congestion_ops *type);
+extern void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
+
+extern void tcp_init_congestion_control(struct tcp_sock *tp);
+extern void tcp_cleanup_congestion_control(struct tcp_sock *tp);
+extern int tcp_set_default_congestion_control(const char *name);
+extern void tcp_get_default_congestion_control(char *name);
+
+extern struct tcp_congestion_ops tcp_reno;
+extern u32 tcp_reno_ssthresh(struct tcp_sock *tp);
+extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack,
+				u32 rtt, u32 in_flight, int flag);
+extern u32 tcp_reno_min_cwnd(struct tcp_sock *tp);
+
+static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state)
+{
+	if (tp->ca_ops->set_state)
+		tp->ca_ops->set_state(tp, ca_state);
+	tp->ca_state = ca_state;
+}
+
+static inline void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event)
+{
+	if (tp->ca_ops->cwnd_event)
+		tp->ca_ops->cwnd_event(tp, event);
+}
+
 /* This determines how many packets are "in the network" to the best
  * of our knowledge.  In many cases it is conservative, but where
  * detailed information is available from the receiver (via SACK
@@ -1155,91 +1201,6 @@ static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp)
 	return (tp->packets_out - tp->left_out + tp->retrans_out);
 }
 
-/*
- * Which congestion algorithim is in use on the connection.
- */
-#define tcp_is_vegas(__tp)	((__tp)->adv_cong == TCP_VEGAS)
-#define tcp_is_westwood(__tp)	((__tp)->adv_cong == TCP_WESTWOOD)
-#define tcp_is_bic(__tp)	((__tp)->adv_cong == TCP_BIC)
-
-/* Recalculate snd_ssthresh, we want to set it to:
- *
- * Reno:
- * 	one half the current congestion window, but no
- *	less than two segments
- *
- * BIC:
- *	behave like Reno until low_window is reached,
- *	then increase congestion window slowly
- */
-static inline __u32 tcp_recalc_ssthresh(struct tcp_sock *tp)
-{
-	if (tcp_is_bic(tp)) {
-		if (sysctl_tcp_bic_fast_convergence &&
-		    tp->snd_cwnd < tp->bictcp.last_max_cwnd)
-			tp->bictcp.last_max_cwnd = (tp->snd_cwnd * 
-						    (BICTCP_BETA_SCALE
-						     + sysctl_tcp_bic_beta))
-				/ (2 * BICTCP_BETA_SCALE);
-		else
-			tp->bictcp.last_max_cwnd = tp->snd_cwnd;
-
-		if (tp->snd_cwnd > sysctl_tcp_bic_low_window)
-			return max((tp->snd_cwnd * sysctl_tcp_bic_beta)
-				   / BICTCP_BETA_SCALE, 2U);
-	}
-
-	return max(tp->snd_cwnd >> 1U, 2U);
-}
-
-/* Stop taking Vegas samples for now. */
-#define tcp_vegas_disable(__tp)	((__tp)->vegas.doing_vegas_now = 0)
-    
-static inline void tcp_vegas_enable(struct tcp_sock *tp)
-{
-	/* There are several situations when we must "re-start" Vegas:
-	 *
-	 *  o when a connection is established
-	 *  o after an RTO
-	 *  o after fast recovery
-	 *  o when we send a packet and there is no outstanding
-	 *    unacknowledged data (restarting an idle connection)
-	 *
-	 * In these circumstances we cannot do a Vegas calculation at the
-	 * end of the first RTT, because any calculation we do is using
-	 * stale info -- both the saved cwnd and congestion feedback are
-	 * stale.
-	 *
-	 * Instead we must wait until the completion of an RTT during
-	 * which we actually receive ACKs.
-	 */
-    
-	/* Begin taking Vegas samples next time we send something. */
-	tp->vegas.doing_vegas_now = 1;
-     
-	/* Set the beginning of the next send window. */
-	tp->vegas.beg_snd_nxt = tp->snd_nxt;
-
-	tp->vegas.cntRTT = 0;
-	tp->vegas.minRTT = 0x7fffffff;
-}
-
-/* Should we be taking Vegas samples right now? */
-#define tcp_vegas_enabled(__tp)	((__tp)->vegas.doing_vegas_now)
-
-extern void tcp_ca_init(struct tcp_sock *tp);
-
-static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state)
-{
-	if (tcp_is_vegas(tp)) {
-		if (ca_state == TCP_CA_Open) 
-			tcp_vegas_enable(tp);
-		else
-			tcp_vegas_disable(tp);
-	}
-	tp->ca_state = ca_state;
-}
-
 /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd.
  * The exception is rate halving phase, when cwnd is decreasing towards
  * ssthresh.
@@ -1288,7 +1249,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
 static inline void __tcp_enter_cwr(struct tcp_sock *tp)
 {
 	tp->undo_marker = 0;
-	tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+	tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
 	tp->snd_cwnd = min(tp->snd_cwnd,
 			   tcp_packets_in_flight(tp) + 1U);
 	tp->snd_cwnd_cnt = 0;
@@ -1876,52 +1837,4 @@ struct tcp_iter_state {
 extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo);
 extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo);
 
-/* TCP Westwood functions and constants */
-
-#define TCP_WESTWOOD_INIT_RTT  (20*HZ)           /* maybe too conservative?! */
-#define TCP_WESTWOOD_RTT_MIN   (HZ/20)           /* 50ms */
-
-static inline void tcp_westwood_update_rtt(struct tcp_sock *tp, __u32 rtt_seq)
-{
-        if (tcp_is_westwood(tp))
-                tp->westwood.rtt = rtt_seq;
-}
-
-static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
-{
-        return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) /
-		   (__u32) (tp->mss_cache_std),
-		   2U);
-}
-
-static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp)
-{
-	return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0;
-}
-
-static inline int tcp_westwood_ssthresh(struct tcp_sock *tp)
-{
-	__u32 ssthresh = 0;
-
-	if (tcp_is_westwood(tp)) {
-		ssthresh = __tcp_westwood_bw_rttmin(tp);
-		if (ssthresh)
-			tp->snd_ssthresh = ssthresh;  
-	}
-
-	return (ssthresh != 0);
-}
-
-static inline int tcp_westwood_cwnd(struct tcp_sock *tp)
-{
-	__u32 cwnd = 0;
-
-	if (tcp_is_westwood(tp)) {
-		cwnd = __tcp_westwood_bw_rttmin(tp);
-		if (cwnd)
-			tp->snd_cwnd = cwnd;
-	}
-
-	return (cwnd != 0);
-}
 #endif	/* _TCP_H */
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 65d57d8e1add..89c0b4cb470e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -5,7 +5,8 @@
 obj-y     := utils.o route.o inetpeer.o protocol.o \
 	     ip_input.o ip_fragment.o ip_forward.o ip_options.o \
 	     ip_output.o ip_sockglue.o \
-	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \
+	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
+	     tcp_minisocks.o tcp_cong.o \
 	     datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 	     sysctl_net_ipv4.o fib_frontend.o fib_semantics.o
 
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 23068bddbf0b..e32894532416 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -118,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table,
 	return 1;
 }
 
+static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp,
+				       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	char val[TCP_CA_NAME_MAX];
+	ctl_table tbl = {
+		.data = val,
+		.maxlen = TCP_CA_NAME_MAX,
+	};
+	int ret;
+
+	tcp_get_default_congestion_control(val);
+
+	ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos);
+	if (write && ret == 0)
+		ret = tcp_set_default_congestion_control(val);
+	return ret;
+}
+
+int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen,
+				  void __user *oldval, size_t __user *oldlenp,
+				  void __user *newval, size_t newlen,
+				  void **context)
+{
+	char val[TCP_CA_NAME_MAX];
+	ctl_table tbl = {
+		.data = val,
+		.maxlen = TCP_CA_NAME_MAX,
+	};
+	int ret;
+
+	tcp_get_default_congestion_control(val);
+	ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen,
+			    context);
+	if (ret == 0 && newval && newlen)
+		ret = tcp_set_default_congestion_control(val);
+	return ret;
+}
+
+
 ctl_table ipv4_table[] = {
         {
 		.ctl_name	= NET_IPV4_TCP_TIMESTAMPS,
@@ -611,70 +650,6 @@ ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-	{
-		.ctl_name	= NET_TCP_WESTWOOD, 
-		.procname	= "tcp_westwood",
-		.data		= &sysctl_tcp_westwood,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.ctl_name	= NET_TCP_VEGAS,
-		.procname	= "tcp_vegas_cong_avoid",
-		.data		= &sysctl_tcp_vegas_cong_avoid,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.ctl_name	= NET_TCP_VEGAS_ALPHA,
-		.procname	= "tcp_vegas_alpha",
-		.data		= &sysctl_tcp_vegas_alpha,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.ctl_name	= NET_TCP_VEGAS_BETA,
-		.procname	= "tcp_vegas_beta",
-		.data		= &sysctl_tcp_vegas_beta,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.ctl_name	= NET_TCP_VEGAS_GAMMA,
-		.procname	= "tcp_vegas_gamma",
-		.data		= &sysctl_tcp_vegas_gamma,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.ctl_name	= NET_TCP_BIC,
-		.procname	= "tcp_bic",
-		.data		= &sysctl_tcp_bic,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.ctl_name	= NET_TCP_BIC_FAST_CONVERGENCE,
-		.procname	= "tcp_bic_fast_convergence",
-		.data		= &sysctl_tcp_bic_fast_convergence,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
-	{
-		.ctl_name	= NET_TCP_BIC_LOW_WINDOW,
-		.procname	= "tcp_bic_low_window",
-		.data		= &sysctl_tcp_bic_low_window,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-	},
 	{
 		.ctl_name	= NET_TCP_MODERATE_RCVBUF,
 		.procname	= "tcp_moderate_rcvbuf",
@@ -692,13 +667,14 @@ ctl_table ipv4_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= NET_TCP_BIC_BETA,
-		.procname	= "tcp_bic_beta",
-		.data		= &sysctl_tcp_bic_beta,
-		.maxlen		= sizeof(int),
+		.ctl_name	= NET_TCP_CONG_CONTROL,
+		.procname	= "tcp_congestion_control",
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.maxlen		= TCP_CA_NAME_MAX,
+		.proc_handler	= &proc_tcp_congestion_control,
+		.strategy	= &sysctl_tcp_congestion_control,
 	},
+
 	{ .ctl_name = 0 }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 674bbd8cfd36..f3dbc8dc1263 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2333,6 +2333,8 @@ void __init tcp_init(void)
 	printk(KERN_INFO "TCP: Hash tables configured "
 	       "(established %d bind %d)\n",
 	       tcp_ehash_size << 1, tcp_bhash_size);
+
+	tcp_register_congestion_control(&tcp_reno);
 }
 
 EXPORT_SYMBOL(tcp_accept);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
new file mode 100644
index 000000000000..665394a63ae4
--- /dev/null
+++ b/net/ipv4/tcp_cong.c
@@ -0,0 +1,195 @@
+/*
+ * Pluggable TCP congestion control support and newReno
+ * congestion control.
+ * Based on ideas from I/O scheduler support and Web100.
+ *
+ * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <net/tcp.h>
+
+static DEFINE_SPINLOCK(tcp_cong_list_lock);
+static LIST_HEAD(tcp_cong_list);
+
+/* Simple linear search by exact name (few entries expected); NULL if absent. */
+static struct tcp_congestion_ops *tcp_ca_find(const char *name)
+{
+	struct tcp_congestion_ops *e;
+
+	list_for_each_entry(e, &tcp_cong_list, list) {
+		if (strcmp(e->name, name) == 0)
+			return e;
+	}
+
+	return NULL;
+}
+
+/*
+ * Attach new congestion control algorithm to the list of available
+ * options.  Returns 0, -EINVAL (missing op) or -EEXIST (duplicate name).
+ */
+int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+{
+	int ret = 0;
+
+	/* all algorithms must implement ssthresh, cong_avoid and min_cwnd */
+	if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) {
+		printk(KERN_ERR "TCP %s does not implement required ops\n",
+		       ca->name);
+		return -EINVAL;
+	}
+
+	spin_lock(&tcp_cong_list_lock);
+	if (tcp_ca_find(ca->name)) {
+		printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
+		ret = -EEXIST;
+	} else {
+		list_add_rcu(&ca->list, &tcp_cong_list);
+		printk(KERN_INFO "TCP %s registered\n", ca->name);
+	}
+	spin_unlock(&tcp_cong_list_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_register_congestion_control);
+
+/*
+ * Unlink a congestion control algorithm from tcp_cong_list (under
+ * tcp_cong_list_lock); meant to be called from the module's remove
+ * function.  Module ref counts are used to ensure that this can't be
+ * done till all sockets using that method are closed.
+ */
+void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
+{
+	spin_lock(&tcp_cong_list_lock);
+	list_del_rcu(&ca->list);
+	spin_unlock(&tcp_cong_list_lock);
+}
+EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
+
+/* Pin the first listed algorithm whose module ref can be taken; run its init. */
+void tcp_init_congestion_control(struct tcp_sock *tp)
+{
+	struct tcp_congestion_ops *ca;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
+		if (try_module_get(ca->owner)) {
+			tp->ca_ops = ca;
+			break;
+		}
+
+	}
+	rcu_read_unlock();
+	/* NOTE(review): if no ref was taken, tp->ca_ops keeps its prior value */
+	if (tp->ca_ops->init)
+		tp->ca_ops->init(tp);
+}
+
+/* On socket close: run the optional release hook, then drop the module ref. */
+void tcp_cleanup_congestion_control(struct tcp_sock *tp)
+{
+	if (tp->ca_ops->release)
+		tp->ca_ops->release(tp);
+	module_put(tp->ca_ops->owner);
+}
+
+/* Used by sysctl: make the named algorithm the list head; -ENOENT if unknown */
+int tcp_set_default_congestion_control(const char *name)
+{
+	struct tcp_congestion_ops *ca;
+	int ret = -ENOENT;
+
+	spin_lock(&tcp_cong_list_lock);
+	ca = tcp_ca_find(name);
+#ifdef CONFIG_KMOD
+	if (!ca) {
+		spin_unlock(&tcp_cong_list_lock);
+		/* lock dropped across request_module(); lookup is redone below */
+		request_module("tcp_%s", name);
+		spin_lock(&tcp_cong_list_lock);
+		ca = tcp_ca_find(name);
+	}
+#endif
+
+	if (ca) {
+		list_move(&ca->list, &tcp_cong_list);
+		ret = 0;
+	}
+	spin_unlock(&tcp_cong_list_lock);
+
+	return ret;
+}
+
+/* Get current default congestion control (name of the first list entry) */
+void tcp_get_default_congestion_control(char *name)
+{
+	struct tcp_congestion_ops *ca;
+	/* We will always have reno... */
+	BUG_ON(list_empty(&tcp_cong_list));
+	/* NOTE(review): strncpy() leaves name unterminated if ca->name is full-length */
+	rcu_read_lock();
+	ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list);
+	strncpy(name, ca->name, TCP_CA_NAME_MAX);
+	rcu_read_unlock();
+}
+
+/*
+ * TCP Reno congestion control
+ * This is a special case, used for fallback as well.
+ */
+/* This is Jacobson's slow start and congestion avoidance,
+ * SIGCOMM '88, p. 328.  cwnd only grows when in_flight >= snd_cwnd.
+ */
+void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight,
+			 int flag)
+{
+	if (in_flight < tp->snd_cwnd)
+		return;
+
+        if (tp->snd_cwnd <= tp->snd_ssthresh) {
+                /* In "safe" area (cwnd <= ssthresh): +1, up to snd_cwnd_clamp. */
+		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+			tp->snd_cwnd++;
+	} else {
+                /* In dangerous area, increase slowly, paced by snd_cwnd_cnt.
+		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
+		 */
+		if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
+			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+				tp->snd_cwnd++;
+			tp->snd_cwnd_cnt = 0;
+		} else
+			tp->snd_cwnd_cnt++;
+	}
+}
+EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
+
+/* New slow start threshold: half the current congestion window, at least 2 */
+u32 tcp_reno_ssthresh(struct tcp_sock *tp)
+{
+	return max(tp->snd_cwnd >> 1U, 2U);
+}
+EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
+
+/* Lower bound on congestion window: half of the slow start threshold. */
+u32 tcp_reno_min_cwnd(struct tcp_sock *tp)
+{
+	return tp->snd_ssthresh/2;
+}
+EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd);
+
+struct tcp_congestion_ops tcp_reno = {	/* registered by tcp_init() */
+	.name		= "reno",
+	.owner		= THIS_MODULE,
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= tcp_reno_cong_avoid,
+	.min_cwnd	= tcp_reno_min_cwnd,
+};
+
+EXPORT_SYMBOL_GPL(tcp_reno);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 634befc07921..867acc0f79d8 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -42,7 +42,6 @@ struct tcpdiag_entry
 
 static struct sock *tcpnl;
 
-
 #define TCPDIAG_PUT(skb, attrtype, attrlen) \
 ({ int rtalen = RTA_LENGTH(attrlen);        \
    struct rtattr *rta;                      \
@@ -61,7 +60,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
 	struct nlmsghdr  *nlh;
 	struct tcp_info  *info = NULL;
 	struct tcpdiag_meminfo  *minfo = NULL;
-	struct tcpvegas_info *vinfo = NULL;
 	unsigned char	 *b = skb->tail;
 
 	nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
@@ -73,9 +71,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
 		if (ext & (1<<(TCPDIAG_INFO-1)))
 			info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
 		
-		if ((tcp_is_westwood(tp) || tcp_is_vegas(tp))
-		    && (ext & (1<<(TCPDIAG_VEGASINFO-1))))
-			vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo));
 	}
 	r->tcpdiag_family = sk->sk_family;
 	r->tcpdiag_state = sk->sk_state;
@@ -166,19 +161,8 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
 	if (info) 
 		tcp_get_info(sk, info);
 
-	if (vinfo) {
-		if (tcp_is_vegas(tp)) {
-			vinfo->tcpv_enabled = tp->vegas.doing_vegas_now;
-			vinfo->tcpv_rttcnt = tp->vegas.cntRTT;
-			vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT);
-			vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT);
-		} else {
-			vinfo->tcpv_enabled = 0;
-			vinfo->tcpv_rttcnt = 0;
-			vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt);
-			vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min);
-		}
-	}
+	if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info)
+		tp->ca_ops->get_info(tp, ext, skb);
 
 	nlh->nlmsg_len = skb->tail - b;
 	return skb->len;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5bad504630a3..7bbbbc33eb4b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -61,7 +61,6 @@
  *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
  *					engine. Lots of bugs are found.
  *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
- *		Angelo Dell'Aera:	TCP Westwood+ support
  */
 
 #include <linux/config.h>
@@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337;
 int sysctl_tcp_max_orphans = NR_FILE;
 int sysctl_tcp_frto;
 int sysctl_tcp_nometrics_save;
-int sysctl_tcp_westwood;
-int sysctl_tcp_vegas_cong_avoid;
 
 int sysctl_tcp_moderate_rcvbuf = 1;
 
-/* Default values of the Vegas variables, in fixed-point representation
- * with V_PARAM_SHIFT bits to the right of the binary point.
- */
-#define V_PARAM_SHIFT 1
-int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
-int sysctl_tcp_vegas_beta  = 3<<V_PARAM_SHIFT;
-int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
-int sysctl_tcp_bic = 1;
-int sysctl_tcp_bic_fast_convergence = 1;
-int sysctl_tcp_bic_low_window = 14;
-int sysctl_tcp_bic_beta = 819;		/* = 819/1024 (BICTCP_BETA_SCALE) */
-
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
 #define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
@@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-static void init_bictcp(struct tcp_sock *tp)
-{
-	tp->bictcp.cnt = 0;
-
-	tp->bictcp.last_max_cwnd = 0;
-	tp->bictcp.last_cwnd = 0;
-	tp->bictcp.last_stamp = 0;
-}
-
 /* 5. Recalculate window clamp after socket hit its memory bounds. */
 static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
 {
@@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_
 		tcp_grow_window(sk, tp, skb);
 }
 
-/* When starting a new connection, pin down the current choice of 
- * congestion algorithm.
- */
-void tcp_ca_init(struct tcp_sock *tp)
-{
-	if (sysctl_tcp_westwood) 
-		tp->adv_cong = TCP_WESTWOOD;
-	else if (sysctl_tcp_bic)
-		tp->adv_cong = TCP_BIC;
-	else if (sysctl_tcp_vegas_cong_avoid) {
-		tp->adv_cong = TCP_VEGAS;
-		tp->vegas.baseRTT = 0x7fffffff;
-		tcp_vegas_enable(tp);
-	} 
-}
-
-/* Do RTT sampling needed for Vegas.
- * Basically we:
- *   o min-filter RTT samples from within an RTT to get the current
- *     propagation delay + queuing delay (we are min-filtering to try to
- *     avoid the effects of delayed ACKs)
- *   o min-filter RTT samples from a much longer window (forever for now)
- *     to find the propagation delay (baseRTT)
- */
-static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
-{
-	__u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
-
-	/* Filter to find propagation delay: */
-	if (vrtt < tp->vegas.baseRTT) 
-		tp->vegas.baseRTT = vrtt;
-
-	/* Find the min RTT during the last RTT to find
-	 * the current prop. delay + queuing delay:
-	 */
-	tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
-	tp->vegas.cntRTT++;
-}
-
 /* Called to compute a smoothed rtt estimate. The data fed to this
  * routine either comes from timestamps, or from segments that were
  * known _not_ to have been retransmitted [see Karn/Partridge
@@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
  * To save cycles in the RFC 1323 implementation it was better to break
  * it up into three procedures. -- erics
  */
-static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
+static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt)
 {
 	long m = mrtt; /* RTT */
 
-	if (tcp_vegas_enabled(tp))
-		vegas_rtt_calc(tp, mrtt);
-
 	/*	The following amusing code comes from Jacobson's
 	 *	article in SIGCOMM '88.  Note that rtt and mdev
 	 *	are scaled versions of rtt and mean deviation.
@@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
 		tp->rtt_seq = tp->snd_nxt;
 	}
 
-	tcp_westwood_update_rtt(tp, tp->srtt >> 3);
+	if (tp->ca_ops->rtt_sample)
+		tp->ca_ops->rtt_sample(tp, *usrtt);
 }
 
 /* Calculate rto without backoff.  This is the second half of Van Jacobson's
@@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk)
             tp->snd_una == tp->high_seq ||
             (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
 		tp->prior_ssthresh = tcp_current_ssthresh(tp);
-		if (!tcp_westwood_ssthresh(tp))
-			tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+		tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
+		tcp_ca_event(tp, CA_EVENT_FRTO);
 	}
 
 	/* Have to clear retransmission markers here to keep the bookkeeping
@@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk)
 	tcp_set_ca_state(tp, TCP_CA_Loss);
 	tp->high_seq = tp->frto_highmark;
 	TCP_ECN_queue_cwr(tp);
-
-	init_bictcp(tp);
 }
 
 void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how)
 	if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
 	    (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
 		tp->prior_ssthresh = tcp_current_ssthresh(tp);
-		tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+		tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
+		tcp_ca_event(tp, CA_EVENT_LOSS);
 	}
 	tp->snd_cwnd	   = 1;
 	tp->snd_cwnd_cnt   = 0;
@@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
 }
 
 /* Decrease cwnd each second ack. */
-
 static void tcp_cwnd_down(struct tcp_sock *tp)
 {
 	int decr = tp->snd_cwnd_cnt + 1;
-	__u32 limit;
-
-	/*
-	 * TCP Westwood
-	 * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
-	 * in packets we use mss_cache). If sysctl_tcp_westwood is off
-	 * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is
-	 * still used as usual. It prevents other strange cases in which
-	 * BWE*RTTmin could assume value 0. It should not happen but...
-	 */
-
-	if (!(limit = tcp_westwood_bw_rttmin(tp)))
-		limit = tp->snd_ssthresh/2;
 
 	tp->snd_cwnd_cnt = decr&1;
 	decr >>= 1;
 
-	if (decr && tp->snd_cwnd > limit)
+	if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp))
 		tp->snd_cwnd -= decr;
 
 	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
@@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
 static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
 {
 	if (tp->prior_ssthresh) {
-		if (tcp_is_bic(tp))
-			tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd);
+		if (tp->ca_ops->undo_cwnd)
+			tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp);
 		else
 			tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
 
@@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
 
 static inline void tcp_complete_cwr(struct tcp_sock *tp)
 {
-	if (tcp_westwood_cwnd(tp)) 
-		tp->snd_ssthresh = tp->snd_cwnd;
-	else
-		tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+	tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
+	tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR);
 }
 
 static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
@@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		if (tp->ca_state < TCP_CA_CWR) {
 			if (!(flag&FLAG_ECE))
 				tp->prior_ssthresh = tcp_current_ssthresh(tp);
-			tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+			tp->snd_ssthresh = tp->ca_ops->ssthresh(tp);
 			TCP_ECN_queue_cwr(tp);
 		}
 
@@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 /* Read draft-ietf-tcplw-high-performance before mucking
  * with this code. (Superceeds RFC1323)
  */
-static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
+static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag)
 {
 	__u32 seq_rtt;
 
@@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
 	 * in window is lost... Voila.	 			--ANK (010210)
 	 */
 	seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
-	tcp_rtt_estimator(tp, seq_rtt);
+	tcp_rtt_estimator(tp, seq_rtt, usrtt);
 	tcp_set_rto(tp);
 	tp->backoff = 0;
 	tcp_bound_rto(tp);
 }
 
-static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
+static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag)
 {
 	/* We don't have a timestamp. Can only use
 	 * packets that are not retransmitted to determine
@@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
 	if (flag & FLAG_RETRANS_DATA_ACKED)
 		return;
 
-	tcp_rtt_estimator(tp, seq_rtt);
+	tcp_rtt_estimator(tp, seq_rtt, usrtt);
 	tcp_set_rto(tp);
 	tp->backoff = 0;
 	tcp_bound_rto(tp);
 }
 
 static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
-				      int flag, s32 seq_rtt)
+				      int flag, s32 seq_rtt, u32 *usrtt)
 {
 	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
 	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
-		tcp_ack_saw_tstamp(tp, flag);
+		tcp_ack_saw_tstamp(tp, usrtt, flag);
 	else if (seq_rtt >= 0)
-		tcp_ack_no_tstamp(tp, seq_rtt, flag);
+		tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag);
 }
 
-/*
- * Compute congestion window to use.
- *
- * This is from the implementation of BICTCP in
- * Lison-Xu, Kahaled Harfoush, and Injog Rhee.
- *  "Binary Increase Congestion Control for Fast, Long Distance
- *  Networks" in InfoComm 2004
- * Available from:
- *  http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
- *
- * Unless BIC is enabled and congestion window is large
- * this behaves the same as the original Reno.
- */
-static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
-{
-	/* orignal Reno behaviour */
-	if (!tcp_is_bic(tp))
-		return tp->snd_cwnd;
-
-	if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
-	   (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
-		return tp->bictcp.cnt;
-
-	tp->bictcp.last_cwnd = tp->snd_cwnd;
-	tp->bictcp.last_stamp = tcp_time_stamp;
-      
-	/* start off normal */
-	if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
-		tp->bictcp.cnt = tp->snd_cwnd;
-
-	/* binary increase */
-	else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
-		__u32 	dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
-			/ BICTCP_B;
-
-		if (dist > BICTCP_MAX_INCREMENT)
-			/* linear increase */
-			tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
-		else if (dist <= 1U)
-			/* binary search increase */
-			tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
-				/ BICTCP_B;
-		else
-			/* binary search increase */
-			tp->bictcp.cnt = tp->snd_cwnd / dist;
-	} else {
-		/* slow start amd linear increase */
-		if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
-			/* slow start */
-			tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
-				/ BICTCP_B;
-		else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
-			 		+ BICTCP_MAX_INCREMENT*(BICTCP_B-1))
-			/* slow start */
-			tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
-				/ (tp->snd_cwnd-tp->bictcp.last_max_cwnd);
-		else
-			/* linear increase */
-			tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
-	}
-	return tp->bictcp.cnt;
-}
-
-/* This is Jacobson's slow start and congestion avoidance. 
- * SIGCOMM '88, p. 328.
- */
-static inline void reno_cong_avoid(struct tcp_sock *tp)
+static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt,
+				  u32 in_flight, int good)
 {
-        if (tp->snd_cwnd <= tp->snd_ssthresh) {
-                /* In "safe" area, increase. */
-		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-			tp->snd_cwnd++;
-	} else {
-                /* In dangerous area, increase slowly.
-		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
-		 */
-		if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
-			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
-				tp->snd_cwnd++;
-			tp->snd_cwnd_cnt=0;
-		} else
-			tp->snd_cwnd_cnt++;
-        }
+	tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good);
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
-/* This is based on the congestion detection/avoidance scheme described in
- *    Lawrence S. Brakmo and Larry L. Peterson.
- *    "TCP Vegas: End to end congestion avoidance on a global internet."
- *    IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
- *    October 1995. Available from:
- *	ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
- *
- * See http://www.cs.arizona.edu/xkernel/ for their implementation.
- * The main aspects that distinguish this implementation from the
- * Arizona Vegas implementation are:
- *   o We do not change the loss detection or recovery mechanisms of
- *     Linux in any way. Linux already recovers from losses quite well,
- *     using fine-grained timers, NewReno, and FACK.
- *   o To avoid the performance penalty imposed by increasing cwnd
- *     only every-other RTT during slow start, we increase during
- *     every RTT during slow start, just like Reno.
- *   o Largely to allow continuous cwnd growth during slow start,
- *     we use the rate at which ACKs come back as the "actual"
- *     rate, rather than the rate at which data is sent.
- *   o To speed convergence to the right rate, we set the cwnd
- *     to achieve the right ("actual") rate when we exit slow start.
- *   o To filter out the noise caused by delayed ACKs, we use the
- *     minimum RTT sample observed during the last RTT to calculate
- *     the actual rate.
- *   o When the sender re-starts from idle, it waits until it has
- *     received ACKs for an entire flight of new data before making
- *     a cwnd adjustment decision. The original Vegas implementation
- *     assumed senders never went idle.
- */
-static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
-{
-	/* The key players are v_beg_snd_una and v_beg_snd_nxt.
-	 *
-	 * These are so named because they represent the approximate values
-	 * of snd_una and snd_nxt at the beginning of the current RTT. More
-	 * precisely, they represent the amount of data sent during the RTT.
-	 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
-	 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
-	 * bytes of data have been ACKed during the course of the RTT, giving
-	 * an "actual" rate of:
-	 *
-	 *     (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
-	 *
-	 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
-	 * because delayed ACKs can cover more than one segment, so they
-	 * don't line up nicely with the boundaries of RTTs.
-	 *
-	 * Another unfortunate fact of life is that delayed ACKs delay the
-	 * advance of the left edge of our send window, so that the number
-	 * of bytes we send in an RTT is often less than our cwnd will allow.
-	 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
-	 */
-
-	if (after(ack, tp->vegas.beg_snd_nxt)) {
-		/* Do the Vegas once-per-RTT cwnd adjustment. */
-		u32 old_wnd, old_snd_cwnd;
-
-		
-		/* Here old_wnd is essentially the window of data that was
-		 * sent during the previous RTT, and has all
-		 * been acknowledged in the course of the RTT that ended
-		 * with the ACK we just received. Likewise, old_snd_cwnd
-		 * is the cwnd during the previous RTT.
-		 */
-		old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
-			tp->mss_cache_std;
-		old_snd_cwnd = tp->vegas.beg_snd_cwnd;
-
-		/* Save the extent of the current window so we can use this
-		 * at the end of the next RTT.
-		 */
-		tp->vegas.beg_snd_una  = tp->vegas.beg_snd_nxt;
-		tp->vegas.beg_snd_nxt  = tp->snd_nxt;
-		tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
-
-		/* Take into account the current RTT sample too, to
-		 * decrease the impact of delayed acks. This double counts
-		 * this sample since we count it for the next window as well,
-		 * but that's not too awful, since we're taking the min,
-		 * rather than averaging.
-		 */
-		vegas_rtt_calc(tp, seq_rtt);
-
-		/* We do the Vegas calculations only if we got enough RTT
-		 * samples that we can be reasonably sure that we got
-		 * at least one RTT sample that wasn't from a delayed ACK.
-		 * If we only had 2 samples total,
-		 * then that means we're getting only 1 ACK per RTT, which
-		 * means they're almost certainly delayed ACKs.
-		 * If  we have 3 samples, we should be OK.
-		 */
-
-		if (tp->vegas.cntRTT <= 2) {
-			/* We don't have enough RTT samples to do the Vegas
-			 * calculation, so we'll behave like Reno.
-			 */
-			if (tp->snd_cwnd > tp->snd_ssthresh)
-				tp->snd_cwnd++;
-		} else {
-			u32 rtt, target_cwnd, diff;
-
-			/* We have enough RTT samples, so, using the Vegas
-			 * algorithm, we determine if we should increase or
-			 * decrease cwnd, and by how much.
-			 */
-
-			/* Pluck out the RTT we are using for the Vegas
-			 * calculations. This is the min RTT seen during the
-			 * last RTT. Taking the min filters out the effects
-			 * of delayed ACKs, at the cost of noticing congestion
-			 * a bit later.
-			 */
-			rtt = tp->vegas.minRTT;
-
-			/* Calculate the cwnd we should have, if we weren't
-			 * going too fast.
-			 *
-			 * This is:
-			 *     (actual rate in segments) * baseRTT
-			 * We keep it as a fixed point number with
-			 * V_PARAM_SHIFT bits to the right of the binary point.
-			 */
-			target_cwnd = ((old_wnd * tp->vegas.baseRTT)
-				       << V_PARAM_SHIFT) / rtt;
-
-			/* Calculate the difference between the window we had,
-			 * and the window we would like to have. This quantity
-			 * is the "Diff" from the Arizona Vegas papers.
-			 *
-			 * Again, this is a fixed point number with
-			 * V_PARAM_SHIFT bits to the right of the binary
-			 * point.
-			 */
-			diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
-
-			if (tp->snd_cwnd < tp->snd_ssthresh) {
-				/* Slow start.  */
-				if (diff > sysctl_tcp_vegas_gamma) {
-					/* Going too fast. Time to slow down
-					 * and switch to congestion avoidance.
-					 */
-					tp->snd_ssthresh = 2;
-
-					/* Set cwnd to match the actual rate
-					 * exactly:
-					 *   cwnd = (actual rate) * baseRTT
-					 * Then we add 1 because the integer
-					 * truncation robs us of full link
-					 * utilization.
-					 */
-					tp->snd_cwnd = min(tp->snd_cwnd,
-							   (target_cwnd >>
-							    V_PARAM_SHIFT)+1);
-
-				}
-			} else {
-				/* Congestion avoidance. */
-				u32 next_snd_cwnd;
-
-				/* Figure out where we would like cwnd
-				 * to be.
-				 */
-				if (diff > sysctl_tcp_vegas_beta) {
-					/* The old window was too fast, so
-					 * we slow down.
-					 */
-					next_snd_cwnd = old_snd_cwnd - 1;
-				} else if (diff < sysctl_tcp_vegas_alpha) {
-					/* We don't have enough extra packets
-					 * in the network, so speed up.
-					 */
-					next_snd_cwnd = old_snd_cwnd + 1;
-				} else {
-					/* Sending just as fast as we
-					 * should be.
-					 */
-					next_snd_cwnd = old_snd_cwnd;
-				}
-
-				/* Adjust cwnd upward or downward, toward the
-				 * desired value.
-				 */
-				if (next_snd_cwnd > tp->snd_cwnd)
-					tp->snd_cwnd++;
-				else if (next_snd_cwnd < tp->snd_cwnd)
-					tp->snd_cwnd--;
-			}
-		}
-
-		/* Wipe the slate clean for the next RTT. */
-		tp->vegas.cntRTT = 0;
-		tp->vegas.minRTT = 0x7fffffff;
-	}
-
-	/* The following code is executed for every ack we receive,
-	 * except for conditions checked in should_advance_cwnd()
-	 * before the call to tcp_cong_avoid(). Mainly this means that
-	 * we only execute this code if the ack actually acked some
-	 * data.
-	 */
-
-	/* If we are in slow start, increase our cwnd in response to this ACK.
-	 * (If we are not in slow start then we are in congestion avoidance,
-	 * and adjust our congestion window only once per RTT. See the code
-	 * above.)
-	 */
-	if (tp->snd_cwnd <= tp->snd_ssthresh) 
-		tp->snd_cwnd++;
-
-	/* to keep cwnd from growing without bound */
-	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
-
-	/* Make sure that we are never so timid as to reduce our cwnd below
-	 * 2 MSS.
-	 *
-	 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
-	 */
-	tp->snd_cwnd = max(tp->snd_cwnd, 2U);
-
-	tp->snd_cwnd_stamp = tcp_time_stamp;
-}
-
-static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
-{
-	if (tcp_vegas_enabled(tp))
-		vegas_cong_avoid(tp, ack, seq_rtt);
-	else
-		reno_cong_avoid(tp);
-}
-
 /* Restart timer after forward progress on connection.
  * RFC2988 recommends to restart timer to now+rto.
  */
@@ -2415,13 +2024,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
 
 
 /* Remove acknowledged frames from the retransmission queue. */
-static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
+static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	__u32 now = tcp_time_stamp;
 	int acked = 0;
 	__s32 seq_rtt = -1;
+	struct timeval usnow;
+	u32 pkts_acked = 0;
+
+	if (seq_usrtt)
+		do_gettimeofday(&usnow);
 
 	while ((skb = skb_peek(&sk->sk_write_queue)) &&
 	       skb != sk->sk_send_head) {
@@ -2448,6 +2062,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 		 */
 		if (!(scb->flags & TCPCB_FLAG_SYN)) {
 			acked |= FLAG_DATA_ACKED;
+			++pkts_acked;
 		} else {
 			acked |= FLAG_SYN_ACKED;
 			tp->retrans_stamp = 0;
@@ -2461,6 +2076,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 				seq_rtt = -1;
 			} else if (seq_rtt < 0)
 				seq_rtt = now - scb->when;
+			if (seq_usrtt)
+				*seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000
+					+ (usnow.tv_usec - skb->stamp.tv_usec);
+
 			if (sacked & TCPCB_SACKED_ACKED)
 				tp->sacked_out -= tcp_skb_pcount(skb);
 			if (sacked & TCPCB_LOST)
@@ -2479,8 +2098,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 	}
 
 	if (acked&FLAG_ACKED) {
-		tcp_ack_update_rtt(tp, acked, seq_rtt);
+		tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt);
 		tcp_ack_packets_out(sk, tp);
+
+		if (tp->ca_ops->pkts_acked)
+			tp->ca_ops->pkts_acked(tp, pkts_acked);
 	}
 
 #if FASTRETRANS_DEBUG > 0
@@ -2624,257 +2246,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
 	tp->frto_counter = (tp->frto_counter + 1) % 3;
 }
 
-/*
- * TCP Westwood+
- */
-
-/*
- * @init_westwood
- * This function initializes fields used in TCP Westwood+. We can't
- * get no information about RTTmin at this time so we simply set it to
- * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative
- * since in this way we're sure it will be updated in a consistent
- * way as soon as possible. It will reasonably happen within the first
- * RTT period of the connection lifetime.
- */
-
-static void init_westwood(struct sock *sk)
-{
-        struct tcp_sock *tp = tcp_sk(sk);
-
-        tp->westwood.bw_ns_est = 0;
-        tp->westwood.bw_est = 0;
-        tp->westwood.accounted = 0;
-        tp->westwood.cumul_ack = 0;
-        tp->westwood.rtt_win_sx = tcp_time_stamp;
-        tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT;
-        tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT;
-        tp->westwood.snd_una = tp->snd_una;
-}
-
-/*
- * @westwood_do_filter
- * Low-pass filter. Implemented using constant coeffients.
- */
-
-static inline __u32 westwood_do_filter(__u32 a, __u32 b)
-{
-	return (((7 * a) + b) >> 3);
-}
-
-static void westwood_filter(struct sock *sk, __u32 delta)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tp->westwood.bw_ns_est =
-		westwood_do_filter(tp->westwood.bw_ns_est, 
-				   tp->westwood.bk / delta);
-	tp->westwood.bw_est =
-		westwood_do_filter(tp->westwood.bw_est,
-				   tp->westwood.bw_ns_est);
-}
-
-/* 
- * @westwood_update_rttmin
- * It is used to update RTTmin. In this case we MUST NOT use
- * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN!
- */
-
-static inline __u32 westwood_update_rttmin(const struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	__u32 rttmin = tp->westwood.rtt_min;
-
-	if (tp->westwood.rtt != 0 &&
-	    (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin))
-		rttmin = tp->westwood.rtt;
-
-	return rttmin;
-}
-
-/*
- * @westwood_acked
- * Evaluate increases for dk. 
- */
-
-static inline __u32 westwood_acked(const struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-
-	return tp->snd_una - tp->westwood.snd_una;
-}
-
-/*
- * @westwood_new_window
- * It evaluates if we are receiving data inside the same RTT window as
- * when we started.
- * Return value:
- * It returns 0 if we are still evaluating samples in the same RTT
- * window, 1 if the sample has to be considered in the next window.
- */
-
-static int westwood_new_window(const struct sock *sk)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	__u32 left_bound;
-	__u32 rtt;
-	int ret = 0;
-
-	left_bound = tp->westwood.rtt_win_sx;
-	rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN);
-
-	/*
-	 * A RTT-window has passed. Be careful since if RTT is less than
-	 * 50ms we don't filter but we continue 'building the sample'.
-	 * This minimum limit was choosen since an estimation on small
-	 * time intervals is better to avoid...
-	 * Obvioulsy on a LAN we reasonably will always have
-	 * right_bound = left_bound + WESTWOOD_RTT_MIN
-         */
-
-	if ((left_bound + rtt) < tcp_time_stamp)
-		ret = 1;
-
-	return ret;
-}
-
-/*
- * @westwood_update_window
- * It updates RTT evaluation window if it is the right moment to do
- * it. If so it calls filter for evaluating bandwidth. 
- */
-
-static void __westwood_update_window(struct sock *sk, __u32 now)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	__u32 delta = now - tp->westwood.rtt_win_sx;
-
-        if (delta) {
-		if (tp->westwood.rtt)
-			westwood_filter(sk, delta);
-
-		tp->westwood.bk = 0;
-		tp->westwood.rtt_win_sx = tcp_time_stamp;
-	}
-}
-
-
-static void westwood_update_window(struct sock *sk, __u32 now)
-{
-	if (westwood_new_window(sk)) 
-		__westwood_update_window(sk, now);
-}
-
-/*
- * @__tcp_westwood_fast_bw
- * It is called when we are in fast path. In particular it is called when
- * header prediction is successfull. In such case infact update is
- * straight forward and doesn't need any particular care.
- */
-
-static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	westwood_update_window(sk, tcp_time_stamp);
-
-	tp->westwood.bk += westwood_acked(sk);
-	tp->westwood.snd_una = tp->snd_una;
-	tp->westwood.rtt_min = westwood_update_rttmin(sk);
-}
-
-static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
-{
-        if (tcp_is_westwood(tcp_sk(sk)))
-                __tcp_westwood_fast_bw(sk, skb);
-}
-
-
-/*
- * @westwood_dupack_update
- * It updates accounted and cumul_ack when receiving a dupack.
- */
-
-static void westwood_dupack_update(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tp->westwood.accounted += tp->mss_cache_std;
-	tp->westwood.cumul_ack = tp->mss_cache_std;
-}
-
-static inline int westwood_may_change_cumul(struct tcp_sock *tp)
-{
-	return (tp->westwood.cumul_ack > tp->mss_cache_std);
-}
-
-static inline void westwood_partial_update(struct tcp_sock *tp)
-{
-	tp->westwood.accounted -= tp->westwood.cumul_ack;
-	tp->westwood.cumul_ack = tp->mss_cache_std;
-}
-
-static inline void westwood_complete_update(struct tcp_sock *tp)
-{
-	tp->westwood.cumul_ack -= tp->westwood.accounted;
-	tp->westwood.accounted = 0;
-}
-
-/*
- * @westwood_acked_count
- * This function evaluates cumul_ack for evaluating dk in case of
- * delayed or partial acks.
- */
-
-static inline __u32 westwood_acked_count(struct sock *sk)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	tp->westwood.cumul_ack = westwood_acked(sk);
-
-        /* If cumul_ack is 0 this is a dupack since it's not moving
-         * tp->snd_una.
-         */
-        if (!(tp->westwood.cumul_ack))
-                westwood_dupack_update(sk);
-
-        if (westwood_may_change_cumul(tp)) {
-		/* Partial or delayed ack */
-		if (tp->westwood.accounted >= tp->westwood.cumul_ack)
-			westwood_partial_update(tp);
-		else
-			westwood_complete_update(tp);
-	}
-
-	tp->westwood.snd_una = tp->snd_una;
-
-	return tp->westwood.cumul_ack;
-}
-
-
-/*
- * @__tcp_westwood_slow_bw
- * It is called when something is going wrong..even if there could
- * be no problems! Infact a simple delayed packet may trigger a
- * dupack. But we need to be careful in such case.
- */
-
-static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-
-	westwood_update_window(sk, tcp_time_stamp);
-
-	tp->westwood.bk += westwood_acked_count(sk);
-	tp->westwood.rtt_min = westwood_update_rttmin(sk);
-}
-
-static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
-{
-        if (tcp_is_westwood(tcp_sk(sk)))
-                __tcp_westwood_slow_bw(sk, skb);
-}
-
 /* This routine deals with incoming acks, but not outgoing ones. */
 static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 {
@@ -2884,6 +2255,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
 	u32 prior_in_flight;
 	s32 seq_rtt;
+	s32 seq_usrtt = 0;
 	int prior_packets;
 
 	/* If the ack is newer than sent or older than previous acks
@@ -2902,9 +2274,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 		 */
 		tcp_update_wl(tp, ack, ack_seq);
 		tp->snd_una = ack;
-		tcp_westwood_fast_bw(sk, skb);
 		flag |= FLAG_WIN_UPDATE;
 
+		tcp_ca_event(tp, CA_EVENT_FAST_ACK);
+
 		NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
 	} else {
 		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
@@ -2920,7 +2293,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 		if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
 			flag |= FLAG_ECE;
 
-		tcp_westwood_slow_bw(sk,skb);
+		tcp_ca_event(tp, CA_EVENT_SLOW_ACK);
 	}
 
 	/* We passed data and got it acked, remove any soft error
@@ -2935,22 +2308,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	prior_in_flight = tcp_packets_in_flight(tp);
 
 	/* See if we can take anything off of the retransmit queue. */
-	flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
+	flag |= tcp_clean_rtx_queue(sk, &seq_rtt,
+				    tp->ca_ops->rtt_sample ? &seq_usrtt : NULL);
 
 	if (tp->frto_counter)
 		tcp_process_frto(sk, prior_snd_una);
 
 	if (tcp_ack_is_dubious(tp, flag)) {
 		/* Advanve CWND, if state allows this. */
-		if ((flag & FLAG_DATA_ACKED) &&
-		    (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) &&
-		    tcp_may_raise_cwnd(tp, flag))
-			tcp_cong_avoid(tp, ack, seq_rtt);
+		if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag))
+			tcp_cong_avoid(tp, ack,  seq_rtt, prior_in_flight, 0);
 		tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
 	} else {
-		if ((flag & FLAG_DATA_ACKED) && 
-		    (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd))
-			tcp_cong_avoid(tp, ack, seq_rtt);
+		if ((flag & FLAG_DATA_ACKED))
+			tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1);
 	}
 
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
@@ -4552,6 +3923,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 
 		tcp_init_metrics(sk);
 
+		tcp_init_congestion_control(tp);
+
 		/* Prevent spurious tcp_cwnd_restart() on first data
 		 * packet.
 		 */
@@ -4708,9 +4081,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 			if(tp->af_specific->conn_request(sk, skb) < 0)
 				return 1;
 
-			init_westwood(sk);
-			init_bictcp(tp);
-
 			/* Now we have several options: In theory there is 
 			 * nothing else in the frame. KA9Q has an option to 
 			 * send data with the syn, BSD accepts data with the
@@ -4732,9 +4102,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 		goto discard;
 
 	case TCP_SYN_SENT:
-		init_westwood(sk);
-		init_bictcp(tp);
-
 		queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
 		if (queued >= 0)
 			return queued;
@@ -4816,7 +4183,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				 */
 				if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
 				    !tp->srtt)
-					tcp_ack_saw_tstamp(tp, 0);
+					tcp_ack_saw_tstamp(tp, 0, 0);
 
 				if (tp->rx_opt.tstamp_ok)
 					tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
@@ -4828,6 +4195,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 
 				tcp_init_metrics(sk);
 
+				tcp_init_congestion_control(tp);
+
 				/* Prevent spurious tcp_cwnd_restart() on
 				 * first data packet.
 				 */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2d41d5d6ad19..9122814c13ad 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2048,6 +2048,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	tp->mss_cache_std = tp->mss_cache = 536;
 
 	tp->reordering = sysctl_tcp_reordering;
+	tp->ca_ops = &tcp_reno;
 
 	sk->sk_state = TCP_CLOSE;
 
@@ -2070,6 +2071,8 @@ int tcp_v4_destroy_sock(struct sock *sk)
 
 	tcp_clear_xmit_timers(sk);
 
+	tcp_cleanup_congestion_control(tp);
+
 	/* Cleanup up the write buffer. */
   	sk_stream_writequeue_purge(sk);
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b3943e7562f3..f42a284164b7 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -774,6 +774,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->frto_counter = 0;
 		newtp->frto_highmark = 0;
 
+		newtp->ca_ops = &tcp_reno;
+
 		tcp_set_ca_state(newtp, TCP_CA_Open);
 		tcp_init_xmit_timers(newsk);
 		skb_queue_head_init(&newtp->out_of_order_queue);
@@ -842,8 +844,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		if (newtp->ecn_flags&TCP_ECN_OK)
 			sock_set_flag(newsk, SOCK_NO_LARGESEND);
 
-		tcp_ca_init(newtp);
-
 		TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
 	}
 	return newsk;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f17c6577e337..0e17c244875c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
 	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
 	u32 cwnd = tp->snd_cwnd;
 
-	if (tcp_is_vegas(tp)) 
-		tcp_vegas_enable(tp);
+	tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
 
 	tp->snd_ssthresh = tcp_current_ssthresh(tp);
 	restart_cwnd = min(restart_cwnd, cwnd);
@@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 #define SYSCTL_FLAG_WSCALE	0x2
 #define SYSCTL_FLAG_SACK	0x4
 
+		/* If congestion control is doing timestamping */
+		if (tp->ca_ops->rtt_sample)
+			do_gettimeofday(&skb->stamp);
+
 		sysctl_flags = 0;
 		if (tcb->flags & TCPCB_FLAG_SYN) {
 			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
@@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
 					    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
 		}
 		
-		/*
-		 * If the connection is idle and we are restarting,
-		 * then we don't want to do any Vegas calculations
-		 * until we get fresh RTT samples.  So when we
-		 * restart, we reset our Vegas state to a clean
-		 * slate. After we get acks for this flight of
-		 * packets, _then_ we can make Vegas calculations
-		 * again.
-		 */
-		if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
-			tcp_vegas_enable(tp);
+		if (tcp_packets_in_flight(tp) == 0)
+			tcp_ca_event(tp, CA_EVENT_TX_START);
 
 		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
 		skb->h.th = th;
@@ -521,6 +515,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
 	 * skbs, which it never sent before. --ANK
 	 */
 	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
+	buff->stamp = skb->stamp;
 
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
 		tp->lost_out -= tcp_skb_pcount(skb);
@@ -1449,7 +1444,6 @@ static inline void tcp_connect_init(struct sock *sk)
 		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
 	tp->advmss = dst_metric(dst, RTAX_ADVMSS);
 	tcp_initialize_rcv_mss(sk);
-	tcp_ca_init(tp);
 
 	tcp_select_initial_window(tcp_full_space(sk),
 				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
@@ -1503,7 +1497,6 @@ int tcp_connect(struct sock *sk)
 	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
 	tp->snd_nxt = tp->write_seq;
 	tp->pushed_seq = tp->write_seq;
-	tcp_ca_init(tp);
 
 	/* Send it off. */
 	TCP_SKB_CB(buff)->when = tcp_time_stamp;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2414937f2a83..fce56039b0e9 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2025,7 +2025,7 @@ static int tcp_v6_init_sock(struct sock *sk)
 	sk->sk_state = TCP_CLOSE;
 
 	tp->af_specific = &ipv6_specific;
-
+	tp->ca_ops = &tcp_reno;
 	sk->sk_write_space = sk_stream_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
-- 
cgit v1.2.3-59-g8ed1b


From 056ede6cface66b400cd3b8e60ed077cc5b85c18 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 12:21:28 -0700
Subject: [TCP]: Report congestion control algorithm in tcp_diag.

Enhancement to the tcp_diag interface used by the iproute2 ss command
to report the tcp congestion control being used by a socket.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp_diag.h | 4 ++--
 net/ipv4/tcp_diag.c      | 5 +++++
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp_diag.h b/include/linux/tcp_diag.h
index ceee962e1d15..7a5996743946 100644
--- a/include/linux/tcp_diag.h
+++ b/include/linux/tcp_diag.h
@@ -99,9 +99,10 @@ enum
 	TCPDIAG_MEMINFO,
 	TCPDIAG_INFO,
 	TCPDIAG_VEGASINFO,
+	TCPDIAG_CONG,
 };
 
-#define TCPDIAG_MAX TCPDIAG_VEGASINFO
+#define TCPDIAG_MAX TCPDIAG_CONG
 
 
 /* TCPDIAG_MEM */
@@ -123,5 +124,4 @@ struct tcpvegas_info {
 	__u32	tcpv_minrtt;
 };
 
-
 #endif /* _TCP_DIAG_H_ */
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index a4e512036d88..f66945cb158f 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -65,6 +65,11 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
 		if (ext & (1<<(TCPDIAG_INFO-1)))
 			info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
 		
+		if (ext & (1<<(TCPDIAG_CONG-1))) {
+			size_t len = strlen(tp->ca_ops->name);
+			strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1),
+			       tp->ca_ops->name);
+		}
 	}
 	r->tcpdiag_family = sk->sk_family;
 	r->tcpdiag_state = sk->sk_state;
-- 
cgit v1.2.3-59-g8ed1b


From c1ebcdb8c422cd73f54bcd2b9953e443a47667e5 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 20:08:59 -0700
Subject: [NET]: Remove obsolete fastroute stats.

Remove last vestiges of fastroute code that is no longer used.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h  |  5 -----
 net/core/dev.c             | 10 ++--------
 net/core/sysctl_net_core.c |  1 -
 3 files changed, 2 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d89816ad642f..c2e15e381a58 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -165,11 +165,6 @@ struct netif_rx_stats
 	unsigned dropped;
 	unsigned time_squeeze;
 	unsigned throttled;
-	unsigned fastroute_hit;
-	unsigned fastroute_success;
-	unsigned fastroute_defer;
-	unsigned fastroute_deferred_out;
-	unsigned fastroute_latency_reduction;
 	unsigned cpu_collision;
 };
 
diff --git a/net/core/dev.c b/net/core/dev.c
index ab935778ce81..4f1ae2efe872 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2056,14 +2056,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
 
 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
 		   s->total, s->dropped, s->time_squeeze, s->throttled,
-		   s->fastroute_hit, s->fastroute_success, s->fastroute_defer,
-		   s->fastroute_deferred_out,
-#if 0
-		   s->fastroute_latency_reduction
-#else
-		   s->cpu_collision
-#endif
-		  );
+		   0, 0, 0, 0, /* was fastroute */
+		   s->cpu_collision );
 	return 0;
 }
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 880a88815211..76e9987474ca 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -18,7 +18,6 @@ extern int no_cong_thresh;
 extern int no_cong;
 extern int lo_cong;
 extern int mod_cong;
-extern int netdev_fastroute;
 extern int net_msg_cost;
 extern int net_msg_burst;
 
-- 
cgit v1.2.3-59-g8ed1b


From 34008d8c631d067caffa136313260525f3ae48a2 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 20:10:00 -0700
Subject: [NET]: Remove obsolete netif_rx congestion sensing mechanism.

Remove the congestion sensing mechanism from netif_rx, and always
return either full or empty.  Almost no driver checks the return value
from netif_rx, and those that do only use it for debug messages.

The original design of netif_rx was to do flow control based on the
receive queue, but NAPI has supplanted this and no driver uses the
feedback.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h  |  2 --
 net/core/dev.c             | 88 +---------------------------------------------
 net/core/sysctl_net_core.c | 36 -------------------
 3 files changed, 1 insertion(+), 125 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c2e15e381a58..718ad579c65c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -558,8 +558,6 @@ static inline int unregister_gifconf(unsigned int family)
 struct softnet_data
 {
 	int			throttle;
-	int			cng_level;
-	int			avg_blog;
 	struct sk_buff_head	input_pkt_queue;
 	struct list_head	poll_list;
 	struct net_device	*output_queue;
diff --git a/net/core/dev.c b/net/core/dev.c
index 4f1ae2efe872..3156df699f01 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -115,18 +115,6 @@
 #endif	/* CONFIG_NET_RADIO */
 #include <asm/current.h>
 
-/* This define, if set, will randomly drop a packet when congestion
- * is more than moderate.  It helps fairness in the multi-interface
- * case when one of them is a hog, but it kills performance for the
- * single interface case so it is off now by default.
- */
-#undef RAND_LIE
-
-/* Setting this will sample the queue lengths and thus congestion
- * via a timer instead of as each packet is received.
- */
-#undef OFFLINE_SAMPLE
-
 /*
  *	The list of packet types we will receive (as opposed to discard)
  *	and the routines to invoke.
@@ -159,11 +147,6 @@ static DEFINE_SPINLOCK(ptype_lock);
 static struct list_head ptype_base[16];	/* 16 way hashed list */
 static struct list_head ptype_all;		/* Taps */
 
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy);
-static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0);
-#endif
-
 /*
  * The @dev_base list is protected by @dev_base_lock and the rtln
  * semaphore.
@@ -1365,69 +1348,10 @@ out:
 
 int netdev_max_backlog = 300;
 int weight_p = 64;            /* old backlog weight */
-/* These numbers are selected based on intuition and some
- * experimentatiom, if you have more scientific way of doing this
- * please go ahead and fix things.
- */
-int no_cong_thresh = 10;
-int no_cong = 20;
-int lo_cong = 100;
-int mod_cong = 290;
 
 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
 
-static void get_sample_stats(int cpu)
-{
-#ifdef RAND_LIE
-	unsigned long rd;
-	int rq;
-#endif
-	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
-	int blog = sd->input_pkt_queue.qlen;
-	int avg_blog = sd->avg_blog;
-
-	avg_blog = (avg_blog >> 1) + (blog >> 1);
-
-	if (avg_blog > mod_cong) {
-		/* Above moderate congestion levels. */
-		sd->cng_level = NET_RX_CN_HIGH;
-#ifdef RAND_LIE
-		rd = net_random();
-		rq = rd % netdev_max_backlog;
-		if (rq < avg_blog) /* unlucky bastard */
-			sd->cng_level = NET_RX_DROP;
-#endif
-	} else if (avg_blog > lo_cong) {
-		sd->cng_level = NET_RX_CN_MOD;
-#ifdef RAND_LIE
-		rd = net_random();
-		rq = rd % netdev_max_backlog;
-			if (rq < avg_blog) /* unlucky bastard */
-				sd->cng_level = NET_RX_CN_HIGH;
-#endif
-	} else if (avg_blog > no_cong)
-		sd->cng_level = NET_RX_CN_LOW;
-	else  /* no congestion */
-		sd->cng_level = NET_RX_SUCCESS;
-
-	sd->avg_blog = avg_blog;
-}
-
-#ifdef OFFLINE_SAMPLE
-static void sample_queue(unsigned long dummy)
-{
-/* 10 ms 0r 1ms -- i don't care -- JHS */
-	int next_tick = 1;
-	int cpu = smp_processor_id();
-
-	get_sample_stats(cpu);
-	next_tick += jiffies;
-	mod_timer(&samp_timer, next_tick);
-}
-#endif
-
-
 /**
  *	netif_rx	-	post buffer to the network code
  *	@skb: buffer to post
@@ -1476,11 +1400,8 @@ int netif_rx(struct sk_buff *skb)
 enqueue:
 			dev_hold(skb->dev);
 			__skb_queue_tail(&queue->input_pkt_queue, skb);
-#ifndef OFFLINE_SAMPLE
-			get_sample_stats(this_cpu);
-#endif
 			local_irq_restore(flags);
-			return queue->cng_level;
+			return NET_RX_SUCCESS;
 		}
 
 		if (queue->throttle)
@@ -3300,8 +3221,6 @@ static int __init net_dev_init(void)
 		queue = &per_cpu(softnet_data, i);
 		skb_queue_head_init(&queue->input_pkt_queue);
 		queue->throttle = 0;
-		queue->cng_level = 0;
-		queue->avg_blog = 10; /* arbitrary non-zero */
 		queue->completion_queue = NULL;
 		INIT_LIST_HEAD(&queue->poll_list);
 		set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
@@ -3310,11 +3229,6 @@ static int __init net_dev_init(void)
 		atomic_set(&queue->backlog_dev.refcnt, 1);
 	}
 
-#ifdef OFFLINE_SAMPLE
-	samp_timer.expires = jiffies + (10 * HZ);
-	add_timer(&samp_timer);
-#endif
-
 	dev_boot_phase = 0;
 
 	open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 76e9987474ca..fff63643a35c 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -14,10 +14,6 @@
 
 extern int netdev_max_backlog;
 extern int weight_p;
-extern int no_cong_thresh;
-extern int no_cong;
-extern int lo_cong;
-extern int mod_cong;
 extern int net_msg_cost;
 extern int net_msg_burst;
 
@@ -84,38 +80,6 @@ ctl_table core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
-	{
-		.ctl_name	= NET_CORE_NO_CONG_THRESH,
-		.procname	= "no_cong_thresh",
-		.data		= &no_cong_thresh,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_NO_CONG,
-		.procname	= "no_cong",
-		.data		= &no_cong,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_LO_CONG,
-		.procname	= "lo_cong",
-		.data		= &lo_cong,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
-	{
-		.ctl_name	= NET_CORE_MOD_CONG,
-		.procname	= "mod_cong",
-		.data		= &mod_cong,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec
-	},
 	{
 		.ctl_name	= NET_CORE_MSG_COST,
 		.procname	= "message_cost",
-- 
cgit v1.2.3-59-g8ed1b


From 31aa02c53c84658f6694f319f09e232ede27be5a Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 20:12:48 -0700
Subject: [NET]: Eliminate netif_rx massive packet drops.

Eliminate the throttling behaviour when the netif receive queue fills
because it behaves badly when using high speed networks under load.
The throttling causes multiple packet drops that cause TCP to go into
slow start mode. The same effective patch has been part of BIC TCP and
H-TCP as well as part of Web100.

The existing code drops hundreds of packets when the queue fills;
this changes it to individual packet drop-tail.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h |  4 +---
 net/core/dev.c            | 21 ++-------------------
 2 files changed, 3 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 718ad579c65c..3a0ed7f9e801 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -164,7 +164,6 @@ struct netif_rx_stats
 	unsigned total;
 	unsigned dropped;
 	unsigned time_squeeze;
-	unsigned throttled;
 	unsigned cpu_collision;
 };
 
@@ -557,10 +556,9 @@ static inline int unregister_gifconf(unsigned int family)
 
 struct softnet_data
 {
-	int			throttle;
+	struct net_device	*output_queue;
 	struct sk_buff_head	input_pkt_queue;
 	struct list_head	poll_list;
-	struct net_device	*output_queue;
 	struct sk_buff		*completion_queue;
 
 	struct net_device	backlog_dev;	/* Sorry. 8) */
diff --git a/net/core/dev.c b/net/core/dev.c
index 3156df699f01..1a64508e527f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -198,7 +198,7 @@ static struct notifier_block *netdev_chain;
  *	Device drivers call our routines to queue packets here. We empty the
  *	queue in the local softnet handler.
  */
-DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, };
+DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };
 
 #ifdef CONFIG_SYSFS
 extern int netdev_sysfs_init(void);
@@ -1372,7 +1372,6 @@ DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
 
 int netif_rx(struct sk_buff *skb)
 {
-	int this_cpu;
 	struct softnet_data *queue;
 	unsigned long flags;
 
@@ -1388,15 +1387,11 @@ int netif_rx(struct sk_buff *skb)
 	 * short when CPU is congested, but is still operating.
 	 */
 	local_irq_save(flags);
-	this_cpu = smp_processor_id();
 	queue = &__get_cpu_var(softnet_data);
 
 	__get_cpu_var(netdev_rx_stat).total++;
 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
 		if (queue->input_pkt_queue.qlen) {
-			if (queue->throttle)
-				goto drop;
-
 enqueue:
 			dev_hold(skb->dev);
 			__skb_queue_tail(&queue->input_pkt_queue, skb);
@@ -1404,19 +1399,10 @@ enqueue:
 			return NET_RX_SUCCESS;
 		}
 
-		if (queue->throttle)
-			queue->throttle = 0;
-
 		netif_rx_schedule(&queue->backlog_dev);
 		goto enqueue;
 	}
 
-	if (!queue->throttle) {
-		queue->throttle = 1;
-		__get_cpu_var(netdev_rx_stat).throttled++;
-	}
-
-drop:
 	__get_cpu_var(netdev_rx_stat).dropped++;
 	local_irq_restore(flags);
 
@@ -1701,8 +1687,6 @@ job_done:
 	smp_mb__before_clear_bit();
 	netif_poll_enable(backlog_dev);
 
-	if (queue->throttle)
-		queue->throttle = 0;
 	local_irq_enable();
 	return 0;
 }
@@ -1976,7 +1960,7 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
 	struct netif_rx_stats *s = v;
 
 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
-		   s->total, s->dropped, s->time_squeeze, s->throttled,
+		   s->total, s->dropped, s->time_squeeze, 0,
 		   0, 0, 0, 0, /* was fastroute */
 		   s->cpu_collision );
 	return 0;
@@ -3220,7 +3204,6 @@ static int __init net_dev_init(void)
 
 		queue = &per_cpu(softnet_data, i);
 		skb_queue_head_init(&queue->input_pkt_queue);
-		queue->throttle = 0;
 		queue->completion_queue = NULL;
 		INIT_LIST_HEAD(&queue->poll_list);
 		set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
-- 
cgit v1.2.3-59-g8ed1b


From 51b0bdedb8e784d0d969a6b77151911130812400 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 20:14:40 -0700
Subject: [NET]: Separate two usages of netdev_max_backlog.

Separate out the two uses of netdev_max_backlog. One controls the
upper bound on packets processed per softirq, the new name for this is
netdev_budget; the other controls the limit on packets queued via
netif_rx.

Increase the max_backlog default to account for faster processors.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sysctl.h     | 1 +
 net/core/dev.c             | 6 +++---
 net/core/sysctl_net_core.c | 9 +++++++++
 3 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 72965bfe6cfb..ebfe1250f0a4 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -243,6 +243,7 @@ enum
 	NET_CORE_MOD_CONG=16,
 	NET_CORE_DEV_WEIGHT=17,
 	NET_CORE_SOMAXCONN=18,
+	NET_CORE_BUDGET=19,
 };
 
 /* /proc/sys/net/ethernet */
diff --git a/net/core/dev.c b/net/core/dev.c
index 1a64508e527f..7016e0c36b3d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1346,7 +1346,8 @@ out:
 			Receiver routines
   =======================================================================*/
 
-int netdev_max_backlog = 300;
+int netdev_max_backlog = 1000;
+int netdev_budget = 300;
 int weight_p = 64;            /* old backlog weight */
 
 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
@@ -1695,8 +1696,7 @@ static void net_rx_action(struct softirq_action *h)
 {
 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
 	unsigned long start_time = jiffies;
-	int budget = netdev_max_backlog;
-
+	int budget = netdev_budget;
 	
 	local_irq_disable();
 
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index fff63643a35c..8f817ad9f546 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -13,6 +13,7 @@
 #ifdef CONFIG_SYSCTL
 
 extern int netdev_max_backlog;
+extern int netdev_budget;
 extern int weight_p;
 extern int net_msg_cost;
 extern int net_msg_burst;
@@ -124,6 +125,14 @@ ctl_table core_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+	{
+		.ctl_name	= NET_CORE_BUDGET,
+		.procname	= "netdev_budget",
+		.data		= &netdev_budget,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
 	{ .ctl_name = 0 }
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 5f8ef48d240963093451bcf83df89f1a1364f51d Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 23 Jun 2005 20:37:36 -0700
Subject: [TCP]: Allow choosing TCP congestion control via sockopt.

Allow using setsockopt to select the TCP congestion control algorithm on a
per-socket basis.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h |  1 +
 include/net/tcp.h   |  3 ++-
 net/ipv4/tcp.c      | 31 ++++++++++++++++++++++++++++++-
 net/ipv4/tcp_cong.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
 net/ipv4/tcp_ipv4.c |  2 +-
 net/ipv6/tcp_ipv6.c |  2 +-
 6 files changed, 79 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 3ea75dd6640a..dfd93d03f5d2 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -127,6 +127,7 @@ enum {
 #define TCP_WINDOW_CLAMP	10	/* Bound advertised window */
 #define TCP_INFO		11	/* Information about this connection. */
 #define TCP_QUICKACK		12	/* Block/reenable quick acks */
+#define TCP_CONGESTION		13	/* Congestion control algorithm */
 
 #define TCPI_OPT_TIMESTAMPS	1
 #define TCPI_OPT_SACK		2
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e427cf35915c..d04b21188ccb 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1162,8 +1162,9 @@ extern void tcp_init_congestion_control(struct tcp_sock *tp);
 extern void tcp_cleanup_congestion_control(struct tcp_sock *tp);
 extern int tcp_set_default_congestion_control(const char *name);
 extern void tcp_get_default_congestion_control(char *name);
+extern int tcp_set_congestion_control(struct tcp_sock *tp, const char *name);
 
-extern struct tcp_congestion_ops tcp_reno;
+extern struct tcp_congestion_ops tcp_init_congestion_ops;
 extern u32 tcp_reno_ssthresh(struct tcp_sock *tp);
 extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack,
 				u32 rtt, u32 in_flight, int flag);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f3dbc8dc1263..882436da9a3a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1927,6 +1927,25 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		return tp->af_specific->setsockopt(sk, level, optname,
 						   optval, optlen);
 
+	/* This is a string value; all the others are ints */
+	if (optname == TCP_CONGESTION) {
+		char name[TCP_CA_NAME_MAX];
+
+		if (optlen < 1)
+			return -EINVAL;
+
+		val = strncpy_from_user(name, optval,
+					min(TCP_CA_NAME_MAX-1, optlen));
+		if (val < 0)
+			return -EFAULT;
+		name[val] = 0;
+
+		lock_sock(sk);
+		err = tcp_set_congestion_control(tp, name);
+		release_sock(sk);
+		return err;
+	}
+
 	if (optlen < sizeof(int))
 		return -EINVAL;
 
@@ -2211,6 +2230,16 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 	case TCP_QUICKACK:
 		val = !tp->ack.pingpong;
 		break;
+
+	case TCP_CONGESTION:
+		if (get_user(len, optlen))
+			return -EFAULT;
+		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
+		if (put_user(len, optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, tp->ca_ops->name, len))
+			return -EFAULT;
+		return 0;
 	default:
 		return -ENOPROTOOPT;
 	};
@@ -2224,7 +2253,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 
 
 extern void __skb_cb_too_small_for_tcp(int, int);
-extern void tcpdiag_init(void);
+extern struct tcp_congestion_ops tcp_reno;
 
 static __initdata unsigned long thash_entries;
 static int __init set_thash_entries(char *str)
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 665394a63ae4..4970d10a7785 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -21,7 +21,7 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
 {
 	struct tcp_congestion_ops *e;
 
-	list_for_each_entry(e, &tcp_cong_list, list) {
+	list_for_each_entry_rcu(e, &tcp_cong_list, list) {
 		if (strcmp(e->name, name) == 0)
 			return e;
 	}
@@ -77,6 +77,9 @@ void tcp_init_congestion_control(struct tcp_sock *tp)
 {
 	struct tcp_congestion_ops *ca;
 
+	if (tp->ca_ops != &tcp_init_congestion_ops)
+		return;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
 		if (try_module_get(ca->owner)) {
@@ -139,6 +142,34 @@ void tcp_get_default_congestion_control(char *name)
 	rcu_read_unlock();
 }
 
+/* Change congestion control for socket */
+int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
+{
+	struct tcp_congestion_ops *ca;
+	int err = 0;
+
+	rcu_read_lock();
+	ca = tcp_ca_find(name);
+	if (ca == tp->ca_ops)
+		goto out;
+
+	if (!ca)
+		err = -ENOENT;
+
+	else if (!try_module_get(ca->owner))
+		err = -EBUSY;
+
+	else {
+		tcp_cleanup_congestion_control(tp);
+		tp->ca_ops = ca;
+		if (tp->ca_ops->init)
+			tp->ca_ops->init(tp);
+	}
+ out:
+	rcu_read_unlock();
+	return err;
+}
+
 /*
  * TCP Reno congestion control
  * This is special case used for fallback as well.
@@ -192,4 +223,15 @@ struct tcp_congestion_ops tcp_reno = {
 	.min_cwnd	= tcp_reno_min_cwnd,
 };
 
-EXPORT_SYMBOL_GPL(tcp_reno);
+/* Initial congestion control used (until SYN).
+ * Really reno under another name so we can tell the difference
+ * during tcp_init_congestion_control
+ */
+struct tcp_congestion_ops tcp_init_congestion_ops  = {
+	.name		= "",
+	.owner		= THIS_MODULE,
+	.ssthresh	= tcp_reno_ssthresh,
+	.cong_avoid	= tcp_reno_cong_avoid,
+	.min_cwnd	= tcp_reno_min_cwnd,
+};
+EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 9122814c13ad..ebf112347a97 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2048,7 +2048,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	tp->mss_cache_std = tp->mss_cache = 536;
 
 	tp->reordering = sysctl_tcp_reordering;
-	tp->ca_ops = &tcp_reno;
+	tp->ca_ops = &tcp_init_congestion_ops;
 
 	sk->sk_state = TCP_CLOSE;
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index fce56039b0e9..9dac7fdf4726 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2025,7 +2025,7 @@ static int tcp_v6_init_sock(struct sock *sk)
 	sk->sk_state = TCP_CLOSE;
 
 	tp->af_specific = &ipv6_specific;
-	tp->ca_ops = &tcp_reno;
+	tp->ca_ops = &tcp_init_congestion_ops;
 	sk->sk_write_space = sk_stream_write_space;
 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
 
-- 
cgit v1.2.3-59-g8ed1b


From 2de4ff7bd658c97fb357efa3095a509674dacb5a Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 23 Jun 2005 20:49:30 -0700
Subject: [LIB]: Textsearch infrastructure.

The textsearch infrastructure provides text searching
facilities for both linear and non-linear data.
Individual search algorithms are implemented in modules
and chosen by the user.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/textsearch.h | 180 +++++++++++++++++++++++++
 lib/Kconfig                |   8 +-
 lib/Makefile               |   2 +
 lib/textsearch.c           | 317 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 506 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/textsearch.h
 create mode 100644 lib/textsearch.c

(limited to 'include/linux')

diff --git a/include/linux/textsearch.h b/include/linux/textsearch.h
new file mode 100644
index 000000000000..941f45ac117a
--- /dev/null
+++ b/include/linux/textsearch.h
@@ -0,0 +1,180 @@
+#ifndef __LINUX_TEXTSEARCH_H
+#define __LINUX_TEXTSEARCH_H
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/err.h>
+
+struct ts_config;
+
+/**
+ * TS_AUTOLOAD - Automatically load textsearch modules when needed
+ */
+#define TS_AUTOLOAD	1
+
+/**
+ * struct ts_state - search state
+ * @offset: offset for next match
+ * @cb: control buffer, for persistent variables of get_next_block()
+ */
+struct ts_state
+{
+	unsigned int		offset;
+	char			cb[40];
+};
+
+/**
+ * struct ts_ops - search module operations
+ * @name: name of search algorithm
+ * @init: initialization function to prepare a search
+ * @find: find the next occurrence of the pattern
+ * @destroy: destroy algorithm specific parts of a search configuration
+ * @get_pattern: return head of pattern
+ * @get_pattern_len: return length of pattern
+ * @owner: module reference to algorithm
+ */
+struct ts_ops
+{
+	const char		*name;
+	struct ts_config *	(*init)(const void *, unsigned int, int);
+	unsigned int		(*find)(struct ts_config *,
+					struct ts_state *);
+	void			(*destroy)(struct ts_config *);
+	void *			(*get_pattern)(struct ts_config *);
+	unsigned int		(*get_pattern_len)(struct ts_config *);
+	struct module		*owner;
+	struct list_head	list;
+};
+
+/**
+ * struct ts_config - search configuration
+ * @ops: operations of chosen algorithm
+ * @get_next_block: callback to fetch the next block to search in
+ * @finish: callback to finalize a search
+ */
+struct ts_config
+{
+	struct ts_ops		*ops;
+
+	/**
+	 * get_next_block - fetch next block of data
+	 * @consumed: number of bytes consumed by the caller
+	 * @dst: destination buffer
+	 * @conf: search configuration
+	 * @state: search state
+	 *
+	 * Called repeatedly until 0 is returned. Must assign the
+	 * head of the next block of data to &*dst and return the length
+	 * of the block or 0 if at the end. consumed == 0 indicates
+	 * a new search. May store/read persistent values in state->cb.
+	 */
+	unsigned int		(*get_next_block)(unsigned int consumed,
+						  const u8 **dst,
+						  struct ts_config *conf,
+						  struct ts_state *state);
+
+	/**
+	 * finish - finalize/clean a series of get_next_block() calls
+	 * @conf: search configuration
+	 * @state: search state
+	 *
+	 * Called after the last use of get_next_block(), may be used
+	 * to cleanup any leftovers.
+	 */
+	void			(*finish)(struct ts_config *conf,
+					  struct ts_state *state);
+};
+
+/**
+ * textsearch_next - continue searching for a pattern
+ * @conf: search configuration
+ * @state: search state
+ *
+ * Continues a search looking for more occurrences of the pattern.
+ * textsearch_find() must be called to find the first occurrence
+ * in order to reset the state.
+ *
+ * Returns the position of the next occurrence of the pattern or
+ * UINT_MAX if no match was found.
+ */ 
+static inline unsigned int textsearch_next(struct ts_config *conf,
+					   struct ts_state *state)
+{
+	unsigned int ret = conf->ops->find(conf, state);
+
+	if (conf->finish)
+		conf->finish(conf, state);
+
+	return ret;
+}
+
+/**
+ * textsearch_find - start searching for a pattern
+ * @conf: search configuration
+ * @state: search state
+ *
+ * Returns the position of first occurrence of the pattern or
+ * UINT_MAX if no match was found.
+ */ 
+static inline unsigned int textsearch_find(struct ts_config *conf,
+					   struct ts_state *state)
+{
+	state->offset = 0;
+	return textsearch_next(conf, state);
+}
+
+/**
+ * textsearch_get_pattern - return head of the pattern
+ * @conf: search configuration
+ */
+static inline void *textsearch_get_pattern(struct ts_config *conf)
+{
+	return conf->ops->get_pattern(conf);
+}
+
+/**
+ * textsearch_get_pattern_len - return length of the pattern
+ * @conf: search configuration
+ */
+static inline unsigned int textsearch_get_pattern_len(struct ts_config *conf)
+{
+	return conf->ops->get_pattern_len(conf);
+}
+
+extern int textsearch_register(struct ts_ops *);
+extern int textsearch_unregister(struct ts_ops *);
+extern struct ts_config *textsearch_prepare(const char *, const void *,
+					    unsigned int, int, int);
+extern void textsearch_destroy(struct ts_config *conf);
+extern unsigned int textsearch_find_continuous(struct ts_config *,
+					       struct ts_state *,
+					       const void *, unsigned int);
+
+
+#define TS_PRIV_ALIGNTO	8
+#define TS_PRIV_ALIGN(len) (((len) + TS_PRIV_ALIGNTO-1) & ~(TS_PRIV_ALIGNTO-1))
+
+static inline struct ts_config *alloc_ts_config(size_t payload, int gfp_mask)
+{
+	struct ts_config *conf;
+
+	conf = kmalloc(TS_PRIV_ALIGN(sizeof(*conf)) + payload, gfp_mask);
+	if (conf == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	memset(conf, 0, TS_PRIV_ALIGN(sizeof(*conf)) + payload);
+	return conf;
+}
+
+static inline void *ts_config_priv(struct ts_config *conf)
+{
+	return ((u8 *) conf + TS_PRIV_ALIGN(sizeof(struct ts_config)));
+}
+
+#endif /* __KERNEL__ */
+
+#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index 2d4d4e3bc4aa..5bc2d523e6d1 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -63,5 +63,11 @@ config REED_SOLOMON_ENC16
 config REED_SOLOMON_DEC16
 	boolean
 
-endmenu
+config TEXTSEARCH
+	boolean "Textsearch infrastructure"
+	default y
+	help
+	  Say Y here if you want to provide a textsearch infrastructure
+	  to other subsystems.
 
+endmenu
diff --git a/lib/Makefile b/lib/Makefile
index dcb4231916e2..3e917436ad60 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -36,6 +36,8 @@ obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/
 obj-$(CONFIG_ZLIB_DEFLATE) += zlib_deflate/
 obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
 
+lib-$(CONFIG_TEXTSEARCH) += textsearch.o
+
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
 
diff --git a/lib/textsearch.c b/lib/textsearch.c
new file mode 100644
index 000000000000..1e934c196f0f
--- /dev/null
+++ b/lib/textsearch.c
@@ -0,0 +1,317 @@
+/*
+ * lib/textsearch.c	Generic text search interface
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ * 		Pablo Neira Ayuso <pablo@eurodev.net>
+ *
+ * ==========================================================================
+ *
+ * INTRODUCTION
+ *
+ *   The textsearch infrastructure provides text searching facilities for
+ *   both linear and non-linear data. Individual search algorithms are
+ *   implemented in modules and chosen by the user.
+ *
+ * ARCHITECTURE
+ *
+ *      User
+ *     +----------------+
+ *     |        finish()|<--------------(6)-----------------+
+ *     |get_next_block()|<--------------(5)---------------+ |
+ *     |                |                     Algorithm   | |
+ *     |                |                    +------------------------------+
+ *     |                |                    |  init()   find()   destroy() |
+ *     |                |                    +------------------------------+
+ *     |                |       Core API           ^       ^          ^
+ *     |                |      +---------------+  (2)     (4)        (8)
+ *     |             (1)|----->| prepare()     |---+       |          |
+ *     |             (3)|----->| find()/next() |-----------+          |
+ *     |             (7)|----->| destroy()     |----------------------+
+ *     +----------------+      +---------------+
+ *  
+ *   (1) User configures a search by calling _prepare() specifying the
+ *       search parameters such as the pattern and algorithm name.
+ *   (2) Core requests the algorithm to allocate and initialize a search
+ *       configuration according to the specified parameters.
+ *   (3) User starts the search(es) by calling _find() or _next() to
+ *       fetch subsequent occurrences. A state variable is provided
+ *       to the algorithm to store persistent variables.
+ *   (4) Core eventually resets the search offset and forwards the find()
+ *       request to the algorithm.
+ *   (5) Algorithm calls get_next_block() provided by the user continuously
+ *       to fetch the data to be searched in block by block.
+ *   (6) Algorithm invokes finish() after the last call to get_next_block
+ *       to clean up any leftovers from get_next_block. (Optional)
+ *   (7) User destroys the configuration by calling _destroy().
+ *   (8) Core notifies the algorithm to destroy algorithm specific
+ *       allocations. (Optional)
+ *
+ * USAGE
+ *
+ *   Before a search can be performed, a configuration must be created
+ *   by calling textsearch_prepare() specifying the searching algorithm and
+ *   the pattern to look for. The returned configuration may then be used
+ *   for an arbitrary number of times and even in parallel as long as a
+ *   separate struct ts_state variable is provided to every instance.
+ *
+ *   The actual search is performed by either calling textsearch_find_-
+ *   continuous() for linear data or by providing an own get_next_block()
+ *   implementation and calling textsearch_find(). Both functions return
+ *   the position of the first occurrence of the pattern or UINT_MAX if
+ *   no match was found. Subsequent occurrences can be found by calling
+ *   textsearch_next() regardless of the linearity of the data.
+ *
+ *   Once you're done using a configuration it must be given back via
+ *   textsearch_destroy.
+ *
+ * EXAMPLE
+ *
+ *   unsigned int pos;
+ *   struct ts_config *conf;
+ *   struct ts_state state;
+ *   const char *pattern = "chicken";
+ *   const char *example = "We dance the funky chicken";
+ *
+ *   conf = textsearch_prepare("kmp", pattern, strlen(pattern),
+ *                             GFP_KERNEL, TS_AUTOLOAD);
+ *   if (IS_ERR(conf)) {
+ *       err = PTR_ERR(conf);
+ *       goto errout;
+ *   }
+ *
+ *   pos = textsearch_find_continuous(conf, &state, example, strlen(example));
+ *   if (pos != UINT_MAX)
+ *       panic("Oh my god, dancing chickens at %u\n", pos);
+ *
+ *   textsearch_destroy(conf);
+ *
+ * ==========================================================================
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/rcupdate.h>
+#include <linux/err.h>
+#include <linux/textsearch.h>
+
+static LIST_HEAD(ts_ops);
+static DEFINE_SPINLOCK(ts_mod_lock);
+
+static inline struct ts_ops *lookup_ts_algo(const char *name)
+{
+	struct ts_ops *o;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(o, &ts_ops, list) {
+		if (!strcmp(name, o->name)) {
+			if (!try_module_get(o->owner))
+				o = NULL;
+			rcu_read_unlock();
+			return o;
+		}
+	}
+	rcu_read_unlock();
+
+	return NULL;
+}
+
+/**
+ * textsearch_register - register a textsearch module
+ * @ops: operations lookup table
+ *
+ * This function must be called by textsearch modules to announce
+ * their presence. The specified &@ops must have %name set to a
+ * unique identifier and the callbacks find(), init(), get_pattern(),
+ * and get_pattern_len() must be implemented.
+ *
+ * Returns 0 on success, -EINVAL if a mandatory field is unset, or
+ * -EEXIST if another module has already registered with the same name.
+ */
+int textsearch_register(struct ts_ops *ops)
+{
+	int err = -EEXIST;
+	struct ts_ops *o;
+
+	if (ops->name == NULL || ops->find == NULL || ops->init == NULL ||
+	    ops->get_pattern == NULL || ops->get_pattern_len == NULL)
+		return -EINVAL;
+
+	spin_lock(&ts_mod_lock);
+	list_for_each_entry(o, &ts_ops, list) {
+		if (!strcmp(ops->name, o->name))
+			goto errout;
+	}
+
+	list_add_tail_rcu(&ops->list, &ts_ops);
+	err = 0;
+errout:
+	spin_unlock(&ts_mod_lock);
+	return err;
+}
+
+/**
+ * textsearch_unregister - unregister a textsearch module
+ * @ops: operations lookup table
+ *
+ * This function must be called by textsearch modules to announce
+ * their disappearance, for example when the module gets unloaded.
+ * The &ops parameter must be the same as the one during the
+ * registration.
+ *
+ * Returns 0 on success or -ENOENT if no matching textsearch
+ * registration was found.
+ */
+int textsearch_unregister(struct ts_ops *ops)
+{
+	int err = 0;
+	struct ts_ops *o;
+
+	spin_lock(&ts_mod_lock);
+	list_for_each_entry(o, &ts_ops, list) {
+		if (o == ops) {
+			list_del_rcu(&o->list);
+			goto out;
+		}
+	}
+
+	err = -ENOENT;
+out:
+	spin_unlock(&ts_mod_lock);
+	return err;
+}
+
+struct ts_linear_state
+{
+	unsigned int	len;
+	const void	*data;
+};
+
+static unsigned int get_linear_data(unsigned int consumed, const u8 **dst,
+				    struct ts_config *conf,
+				    struct ts_state *state)
+{
+	struct ts_linear_state *st = (struct ts_linear_state *) state->cb;
+
+	if (likely(consumed < st->len)) {
+		*dst = st->data + consumed;
+		return st->len - consumed;
+	}
+
+	return 0;
+}
+
+/**
+ * textsearch_find_continuous - search a pattern in continuous/linear data
+ * @conf: search configuration
+ * @state: search state
+ * @data: data to search in
+ * @len: length of data
+ *
+ * A simplified version of textsearch_find() for continuous/linear data.
+ * Call textsearch_next() to retrieve subsequent matches.
+ *
+ * Returns the position of first occurrence of the pattern or
+ * UINT_MAX if no occurrence was found.
+ */ 
+unsigned int textsearch_find_continuous(struct ts_config *conf,
+					struct ts_state *state,
+					const void *data, unsigned int len)
+{
+	struct ts_linear_state *st = (struct ts_linear_state *) state->cb;
+
+	conf->get_next_block = get_linear_data;
+	st->data = data;
+	st->len = len;
+
+	return textsearch_find(conf, state);
+}
+
+/**
+ * textsearch_prepare - Prepare a search
+ * @algo: name of search algorithm
+ * @pattern: pattern data
+ * @len: length of pattern
+ * @gfp_mask: allocation mask
+ * @flags: search flags
+ *
+ * Looks up the search algorithm module and creates a new textsearch
+ * configuration for the specified pattern. Upon completion all
+ * necessary refcnts are held and the configuration must be put back
+ * using textsearch_destroy() after usage.
+ *
+ * Note: The format of the pattern may not be compatible between
+ *       the various search algorithms.
+ *
+ * Returns a new textsearch configuration according to the specified
+ *         parameters or an ERR_PTR().
+ */
+struct ts_config *textsearch_prepare(const char *algo, const void *pattern,
+				     unsigned int len, int gfp_mask, int flags)
+{
+	int err = -ENOENT;
+	struct ts_config *conf;
+	struct ts_ops *ops;
+	
+	ops = lookup_ts_algo(algo);
+#ifdef CONFIG_KMOD
+	/*
+	 * Why not always autoload you may ask. Some users are
+	 * in a situation where requesting a module may deadlock,
+	 * especially when the module is located on a NFS mount.
+	 */
+	if (ops == NULL && flags & TS_AUTOLOAD) {
+		request_module("ts_%s", algo);
+		ops = lookup_ts_algo(algo);
+	}
+#endif
+
+	if (ops == NULL)
+		goto errout;
+
+	conf = ops->init(pattern, len, gfp_mask);
+	if (IS_ERR(conf)) {
+		err = PTR_ERR(conf);
+		goto errout;
+	}
+
+	conf->ops = ops;
+	return conf;
+
+errout:
+	if (ops)
+		module_put(ops->owner);
+		
+	return ERR_PTR(err);
+}
+
+/**
+ * textsearch_destroy - destroy a search configuration
+ * @conf: search configuration
+ *
+ * Releases all references of the configuration and frees
+ * up the memory.
+ */
+void textsearch_destroy(struct ts_config *conf)
+{
+	if (conf->ops) {
+		if (conf->ops->destroy)
+			conf->ops->destroy(conf);
+		module_put(conf->ops->owner);
+	}
+
+	kfree(conf);
+}
+
+EXPORT_SYMBOL(textsearch_register);
+EXPORT_SYMBOL(textsearch_unregister);
+EXPORT_SYMBOL(textsearch_prepare);
+EXPORT_SYMBOL(textsearch_find_continuous);
+EXPORT_SYMBOL(textsearch_destroy);
-- 
cgit v1.2.3-59-g8ed1b


From 6408f79cce401e1bfecf923e7156f84f96e021e3 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 23 Jun 2005 20:59:16 -0700
Subject: [LIB]: Naive finite state machine based textsearch

A finite state machine consists of n states (struct ts_fsm_token)
representing the pattern as a finite automaton. The data is read
sequentially on an octet basis. Every state token specifies the number
of recurrences and the type of value accepted which can be either a
specific character or ctype based set of characters. The available
type of recurrences include 1, (0|1), [0 n], and [1 n].

The algorithm differs between strict/non-strict mode specifying
whether the pattern has to start at the first octet. Strict mode
is enabled by default and can be disabled by inserting
TS_FSM_HEAD_IGNORE as the first token in the chain.

The runtime performance of the algorithm should be around O(n),
however while in strict mode the average runtime can be better.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/textsearch_fsm.h |  48 ++++++
 lib/Kconfig                    |  11 ++
 lib/Makefile                   |   1 +
 lib/ts_fsm.c                   | 338 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 398 insertions(+)
 create mode 100644 include/linux/textsearch_fsm.h
 create mode 100644 lib/ts_fsm.c

(limited to 'include/linux')

diff --git a/include/linux/textsearch_fsm.h b/include/linux/textsearch_fsm.h
new file mode 100644
index 000000000000..fdfa078c66e5
--- /dev/null
+++ b/include/linux/textsearch_fsm.h
@@ -0,0 +1,48 @@
+#ifndef __LINUX_TEXTSEARCH_FSM_H
+#define __LINUX_TEXTSEARCH_FSM_H
+
+#include <linux/types.h>
+
+enum {
+	TS_FSM_SPECIFIC,	/* specific character */
+	TS_FSM_WILDCARD,	/* any character */
+	TS_FSM_DIGIT,		/* isdigit() */
+	TS_FSM_XDIGIT,		/* isxdigit() */
+	TS_FSM_PRINT,		/* isprint() */
+	TS_FSM_ALPHA,		/* isalpha() */
+	TS_FSM_ALNUM,		/* isalnum() */
+	TS_FSM_ASCII,		/* isascii() */
+	TS_FSM_CNTRL,		/* iscntrl() */
+	TS_FSM_GRAPH,		/* isgraph() */
+	TS_FSM_LOWER,		/* islower() */
+	TS_FSM_UPPER,		/* isupper() */
+	TS_FSM_PUNCT,		/* ispunct() */
+	TS_FSM_SPACE,		/* isspace() */
+	__TS_FSM_TYPE_MAX,
+};
+#define TS_FSM_TYPE_MAX (__TS_FSM_TYPE_MAX - 1)
+
+enum {
+	TS_FSM_SINGLE,		/* 1 occurrence */
+	TS_FSM_PERHAPS,		/* 1 or 0 occurrence */
+	TS_FSM_ANY,		/* 0..n occurrences */
+	TS_FSM_MULTI,		/* 1..n occurrences */
+	TS_FSM_HEAD_IGNORE,	/* 0..n ignored occurrences at head */
+	__TS_FSM_RECUR_MAX,
+};
+#define TS_FSM_RECUR_MAX (__TS_FSM_RECUR_MAX - 1)
+
+/**
+ * struct ts_fsm_token - state machine token (state)
+ * @type: type of token
+ * @recur: number of recurrences
+ * @value: character value for TS_FSM_SPECIFIC
+ */
+struct ts_fsm_token
+{
+	__u16		type;
+	__u8		recur;
+	__u8		value;
+};
+
+#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index 16b8fa2175e4..455833a9e31a 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -80,4 +80,15 @@ config TEXTSEARCH_KMP
 	  To compile this code as a module, choose M here: the
 	  module will be called ts_kmp.
 
+config TEXTSEARCH_FSM
+	depends on TEXTSEARCH
+	tristate "Finite state machine"
+	help
+	  Say Y here if you want to be able to search text using a
+	  naive finite state machine approach implementing a subset
+	  of regular expressions.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called ts_fsm.
+
 endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 6cdb10f312df..7f6eda449102 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -38,6 +38,7 @@ obj-$(CONFIG_REED_SOLOMON) += reed_solomon/
 
 lib-$(CONFIG_TEXTSEARCH) += textsearch.o
 obj-$(CONFIG_TEXTSEARCH_KMP) += ts_kmp.o
+obj-$(CONFIG_TEXTSEARCH_FSM) += ts_fsm.o
 
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
diff --git a/lib/ts_fsm.c b/lib/ts_fsm.c
new file mode 100644
index 000000000000..d27c0a072940
--- /dev/null
+++ b/lib/ts_fsm.c
@@ -0,0 +1,338 @@
+/*
+ * lib/ts_fsm.c	   A naive finite state machine text search approach
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ *
+ * ==========================================================================
+ *
+ *   A finite state machine consists of n states (struct ts_fsm_token)
+ *   representing the pattern as a finite automaton. The data is read
+ *   sequentially on an octet basis. Every state token specifies the number
+ *   of recurrences and the type of value accepted which can be either a
+ *   specific character or ctype based set of characters. The available
+ *   type of recurrences include 1, (0|1), [0 n], and [1 n].
+ *
+ *   The algorithm differs between strict/non-strict mode specifying
+ *   whether the pattern has to start at the first octet. Strict mode
+ *   is enabled by default and can be disabled by inserting
+ *   TS_FSM_HEAD_IGNORE as the first token in the chain.
+ *
+ *   The runtime performance of the algorithm should be around O(n),
+ *   however while in strict mode the average runtime can be better.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/textsearch.h>
+#include <linux/textsearch_fsm.h>
+
+struct ts_fsm
+{
+	unsigned int		ntokens;	/* number of entries in tokens[] */
+	struct ts_fsm_token	tokens[0];	/* pattern copy made by fsm_init() */
+};
+
+/* other values derived from ctype.h */
+#define _A		0x100 /* ascii */
+#define _W		0x200 /* wildcard */
+
+/* Map TS_FSM_* token types to _ctype flags and the magic _A/_W bits */
+static const u16 token_map[TS_FSM_TYPE_MAX+1] = {
+	[TS_FSM_SPECIFIC] = 0,		/* 0: match on the token's value */
+	[TS_FSM_WILDCARD] = _W,
+	[TS_FSM_CNTRL]	  = _C,
+	[TS_FSM_LOWER]	  = _L,
+	[TS_FSM_UPPER]	  = _U,
+	[TS_FSM_PUNCT]	  = _P,
+	[TS_FSM_SPACE]	  = _S,
+	[TS_FSM_DIGIT]	  = _D,
+	[TS_FSM_XDIGIT]	  = _D | _X,
+	[TS_FSM_ALPHA]	  = _U | _L,
+	[TS_FSM_ALNUM]	  = _U | _L | _D,
+	[TS_FSM_PRINT]	  = _P | _U | _L | _D | _SP,
+	[TS_FSM_GRAPH]	  = _P | _U | _L | _D,
+	[TS_FSM_ASCII]	  = _A,
+};
+
+static const u16 token_lookup_tbl[256] = {	/* flags per octet value */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*   0-  3 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*   4-  7 */
+_W|_A|_C,      _W|_A|_C|_S,  _W|_A|_C|_S,  _W|_A|_C|_S,		/*   8- 11 */
+_W|_A|_C|_S,   _W|_A|_C|_S,  _W|_A|_C,     _W|_A|_C,		/*  12- 15 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  16- 19 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  20- 23 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  24- 27 */
+_W|_A|_C,      _W|_A|_C,     _W|_A|_C,     _W|_A|_C,		/*  28- 31 */
+_W|_A|_S|_SP,  _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  32- 35 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  36- 39 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  40- 43 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  44- 47 */
+_W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,		/*  48- 51 */
+_W|_A|_D,      _W|_A|_D,     _W|_A|_D,     _W|_A|_D,		/*  52- 55 */
+_W|_A|_D,      _W|_A|_D,     _W|_A|_P,     _W|_A|_P,		/*  56- 59 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  60- 63 */
+_W|_A|_P,      _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U|_X,		/*  64- 67 */
+_W|_A|_U|_X,   _W|_A|_U|_X,  _W|_A|_U|_X,  _W|_A|_U,		/*  68- 71 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  72- 75 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  76- 79 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  80- 83 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_U,		/*  84- 87 */
+_W|_A|_U,      _W|_A|_U,     _W|_A|_U,     _W|_A|_P,		/*  88- 91 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_P,		/*  92- 95 */
+_W|_A|_P,      _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L|_X,		/*  96- 99 */
+_W|_A|_L|_X,   _W|_A|_L|_X,  _W|_A|_L|_X,  _W|_A|_L,		/* 100-103 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 104-107 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 108-111 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 112-115 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_L,		/* 116-119 */
+_W|_A|_L,      _W|_A|_L,     _W|_A|_L,     _W|_A|_P,		/* 120-123 */
+_W|_A|_P,      _W|_A|_P,     _W|_A|_P,     _W|_A|_C,		/* 124-127 */
+_W,            _W,           _W,           _W,			/* 128-131 */
+_W,            _W,           _W,           _W,			/* 132-135 */
+_W,            _W,           _W,           _W,			/* 136-139 */
+_W,            _W,           _W,           _W,			/* 140-143 */
+_W,            _W,           _W,           _W,			/* 144-147 */
+_W,            _W,           _W,           _W,			/* 148-151 */
+_W,            _W,           _W,           _W,			/* 152-155 */
+_W,            _W,           _W,           _W,			/* 156-159 */
+_W|_S|_SP,     _W|_P,        _W|_P,        _W|_P,		/* 160-163 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 164-167 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 168-171 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 172-175 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 176-179 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 180-183 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 184-187 */
+_W|_P,         _W|_P,        _W|_P,        _W|_P,		/* 188-191 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 192-195 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 196-199 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 200-203 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 204-207 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 208-211 */
+_W|_U,         _W|_U,        _W|_U,        _W|_P,		/* 212-215 */
+_W|_U,         _W|_U,        _W|_U,        _W|_U,		/* 216-219 */
+_W|_U,         _W|_U,        _W|_U,        _W|_L,		/* 220-223 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 224-227 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 228-231 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 232-235 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 236-239 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 240-243 */
+_W|_L,         _W|_L,        _W|_L,        _W|_P,		/* 244-247 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L,		/* 248-251 */
+_W|_L,         _W|_L,        _W|_L,        _W|_L};		/* 252-255 */
+
+static inline int match_token(struct ts_fsm_token *t, u8 d)
+{
+	/* a zero type means: compare the octet with the token's own value */
+	if (!t->type)
+		return t->value == d;
+	return (token_lookup_tbl[d] & t->type) != 0;
+}
+
+static unsigned int fsm_find(struct ts_config *conf, struct ts_state *state)
+{
+	struct ts_fsm *fsm = ts_config_priv(conf);
+	struct ts_fsm_token *cur = NULL, *next;
+	unsigned int match_start, block_idx = 0, tok_idx;
+	unsigned block_len = 0, strict, consumed = state->offset;
+	const u8 *data;
+	/* Account for the octets used, then fetch the next block; yields its length */
+#define GET_NEXT_BLOCK()		\
+({	consumed += block_idx;		\
+	block_idx = 0;			\
+	block_len = conf->get_next_block(consumed, &data, conf, state); })
+	/* Strict mode gives up; otherwise the pattern is retried one octet on */
+#define TOKEN_MISMATCH()		\
+	do {				\
+		if (strict)		\
+			goto no_match;	\
+		block_idx++;		\
+		goto startover;		\
+	} while(0)
+	/* Has side effects: fetches the next block once the current one is used up */
+#define end_of_data() unlikely(block_idx >= block_len && !GET_NEXT_BLOCK())
+
+	if (end_of_data())
+		goto no_match;
+
+	strict = fsm->tokens[0].recur != TS_FSM_HEAD_IGNORE;
+
+startover:
+	match_start = consumed + block_idx;	/* offset this attempt starts at */
+
+	for (tok_idx = 0; tok_idx < fsm->ntokens; tok_idx++) {
+		cur = &fsm->tokens[tok_idx];
+
+		if (likely(tok_idx < (fsm->ntokens - 1)))
+			next = &fsm->tokens[tok_idx + 1];
+		else
+			next = NULL;
+
+		switch (cur->recur) {
+		case TS_FSM_SINGLE:
+			if (end_of_data())
+				goto no_match;
+
+			if (!match_token(cur, data[block_idx]))
+				TOKEN_MISMATCH();
+			break;
+
+		case TS_FSM_PERHAPS:
+			if (end_of_data() ||
+			    !match_token(cur, data[block_idx]))
+				continue;	/* optional token absent, octet not consumed */
+			break;
+
+		case TS_FSM_MULTI:
+			if (end_of_data())
+				goto no_match;
+
+			if (!match_token(cur, data[block_idx]))
+				TOKEN_MISMATCH();
+
+			block_idx++;
+			/* fall through */
+
+		case TS_FSM_ANY:
+			if (next == NULL)
+				goto found_match;
+
+			if (end_of_data())
+				continue;
+
+			while (!match_token(next, data[block_idx])) {	/* stop at the first octet next accepts */
+				if (!match_token(cur, data[block_idx]))
+					TOKEN_MISMATCH();
+				block_idx++;
+				if (end_of_data())
+					goto no_match;
+			}
+			continue;
+
+		/*
+		 * Optimization: Prefer small local loop over jumping
+		 * back and forth until garbage at head is munched.
+		 */
+		case TS_FSM_HEAD_IGNORE:
+			if (end_of_data())
+				continue;
+
+			while (!match_token(next, data[block_idx])) {
+				/*
+				 * Special case, don't start over upon
+				 * a mismatch, give the user the
+				 * chance to specify the type of data
+				 * allowed to be ignored.
+				 */
+				if (!match_token(cur, data[block_idx]))
+					goto no_match;
+
+				block_idx++;
+				if (end_of_data())
+					goto no_match;
+			}
+
+			match_start = consumed + block_idx;	/* match begins after the ignored head */
+			continue;
+		}
+
+		block_idx++;
+	}
+	/* NOTE(review): all tokens consumed, yet a match only if no data is left? */
+	if (end_of_data())
+		goto found_match;
+
+no_match:
+	return UINT_MAX;	/* no occurrence found */
+
+found_match:
+	state->offset = consumed + block_idx;	/* position reached when the match completed */
+	return match_start;
+}
+
+static struct ts_config *fsm_init(const void *pattern, unsigned int len,
+				     int gfp_mask)
+{
+	unsigned int i;
+	struct ts_config *conf;
+	struct ts_fsm *fsm;
+	const struct ts_fsm_token *tokens = pattern;
+	unsigned int ntokens = len / sizeof(*tokens);
+	size_t priv_size = sizeof(*fsm) + len;
+
+	if (len % sizeof(*tokens) || ntokens < 1)
+		goto errout;
+	/* reject unknown types/recurrences and a misplaced TS_FSM_HEAD_IGNORE */
+	for (i = 0; i < ntokens; i++) {
+		const struct ts_fsm_token *t = &tokens[i];
+
+		if (t->type > TS_FSM_TYPE_MAX || t->recur > TS_FSM_RECUR_MAX)
+			goto errout;
+
+		if (t->recur == TS_FSM_HEAD_IGNORE &&
+		    (i != 0 || i == (ntokens - 1)))
+			goto errout;
+	}
+
+	conf = alloc_ts_config(priv_size, gfp_mask);
+	if (IS_ERR(conf))
+		return conf;
+
+	fsm = ts_config_priv(conf);
+	fsm->ntokens = ntokens;
+	memcpy(fsm->tokens, pattern, len);
+	/* the private copy stores the lookup flags in place of the TS_FSM_* type */
+	for (i = 0; i < fsm->ntokens; i++) {
+		struct ts_fsm_token *t = &fsm->tokens[i];
+		t->type = token_map[t->type];
+	}
+
+	return conf;
+
+errout:
+	return ERR_PTR(-EINVAL);	/* malformed pattern */
+}
+
+static void *fsm_get_pattern(struct ts_config *conf)
+{
+	/* NB: token types in this copy were rewritten via token_map by fsm_init() */
+	return ((struct ts_fsm *) ts_config_priv(conf))->tokens;
+}
+
+static unsigned int fsm_get_pattern_len(struct ts_config *conf)
+{
+	const struct ts_fsm *priv = ts_config_priv(conf);
+	return sizeof(priv->tokens[0]) * priv->ntokens;	/* size in bytes */
+}
+
+static struct ts_ops fsm_ops = {
+	.name		  = "fsm",
+	.find		  = fsm_find,
+	.init		  = fsm_init,
+	.get_pattern	  = fsm_get_pattern,
+	.get_pattern_len  = fsm_get_pattern_len,
+	.owner		  = THIS_MODULE,
+	.list		  = LIST_HEAD_INIT(fsm_ops.list)
+};
+
+static int __init init_fsm(void)
+{
+	return textsearch_register(&fsm_ops);
+}
+
+static void __exit exit_fsm(void)
+{
+	textsearch_unregister(&fsm_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_fsm);
+module_exit(exit_fsm);
-- 
cgit v1.2.3-59-g8ed1b


From 677e90eda3bd8cfde0b748daaa46476162a03950 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 23 Jun 2005 20:59:51 -0700
Subject: [NET]: Zerocopy sequential reading of skb data

Implements sequential reading for both linear and non-linear
skb data at zerocopy cost. The data is returned in chunks of
arbitrary length, therefore random access is not possible.

Usage:
	from	 := 0
	to	 := 128
	state	 := undef
	data	 := undef
	len	 := undef
	consumed := 0

	skb_prepare_seq_read(skb, from, to, &state)
	while (len = skb_seq_read(consumed, &data, &state)) != 0 do
		/* do something with 'data' of length 'len' */
		if abort then
			/* abort read if we don't wait for
			 * skb_seq_read() to return 0 */
			skb_abort_seq_read(&state)
			return
		endif
		/* not necessary to consume all of 'len' */
		consumed += len
	done

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  18 ++++++++
 net/core/skbuff.c      | 117 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 135 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d7c839a21842..171a37dff83a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -321,6 +321,24 @@ extern void	      skb_over_panic(struct sk_buff *skb, int len,
 extern void	      skb_under_panic(struct sk_buff *skb, int len,
 				      void *here);
 
+struct skb_seq_state
+{
+	__u32		lower_offset;	/* start of the read window */
+	__u32		upper_offset;	/* end of the read window */
+	__u32		frag_idx;	/* page fragment of cur_skb being read */
+	__u32		stepped_offset;	/* data length already stepped over */
+	struct sk_buff	*root_skb;	/* skb the read was prepared for */
+	struct sk_buff	*cur_skb;	/* skb currently being read */
+	__u8		*frag_data;	/* mapped fragment, NULL if none */
+};
+
+extern void	      skb_prepare_seq_read(struct sk_buff *skb,
+					   unsigned int from, unsigned int to,
+					   struct skb_seq_state *st);
+extern unsigned int   skb_seq_read(unsigned int consumed, const u8 **data,
+				   struct skb_seq_state *st);
+extern void	      skb_abort_seq_read(struct skb_seq_state *st);
+
 /* Internal */
 #define skb_shinfo(SKB)		((struct skb_shared_info *)((SKB)->end))
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6d68c03bc051..d285f2f7e812 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1500,6 +1500,120 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
 		skb_split_no_header(skb, skb1, len, pos);
 }
 
+/**
+ * skb_prepare_seq_read - Prepare a sequential read of skb data
+ * @skb: the buffer to read
+ * @from: lower offset of data to be read
+ * @to: upper offset of data to be read
+ * @st: state variable, every field of it is overwritten
+ *
+ * Initializes the specified state variable. Must be called before
+ * invoking skb_seq_read() for the first time.
+ */
+void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
+			  unsigned int to, struct skb_seq_state *st)
+{
+	st->lower_offset = from;
+	st->upper_offset = to;
+	st->root_skb = st->cur_skb = skb;	/* reading starts at @skb itself */
+	st->frag_idx = st->stepped_offset = 0;
+	st->frag_data = NULL;			/* no fragment mapped yet */
+}
+
+/**
+ * skb_seq_read - Sequentially read skb data
+ * @consumed: number of bytes consumed by the caller so far
+ * @data: destination pointer for data to be returned
+ * @st: state variable
+ *
+ * Reads a block of skb data at &consumed relative to the
+ * lower offset specified to skb_prepare_seq_read(). Assigns
+ * the head of the data block to &data and returns the length
+ * of the block or 0 if the end of the skb data or the upper
+ * offset has been reached.
+ *
+ * The caller is not required to consume all of the data
+ * returned, i.e. &consumed is typically set to the number
+ * of bytes already consumed and the next call to
+ * skb_seq_read() will return the remaining part of the block.
+ *
+ * Note: The size of each block of data returned can be arbitrary,
+ *       this limitation is the cost for zerocopy sequential
+ *       reads of potentially non linear data.
+ *
+ * Note: Fragment lists within fragments are not implemented
+ *       at the moment, state->root_skb could be replaced with
+ *       a stack for this purpose.
+ */
+unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
+			  struct skb_seq_state *st)
+{
+	unsigned int block_limit, abs_offset = consumed + st->lower_offset;
+	skb_frag_t *frag;
+
+	if (unlikely(abs_offset >= st->upper_offset))
+		return 0;
+	/* NOTE(review): head check ignores stepped_offset - verify for later skbs */
+next_skb:
+	block_limit = skb_headlen(st->cur_skb);
+
+	if (abs_offset < block_limit) {
+		*data = st->cur_skb->data + abs_offset;
+		return block_limit - abs_offset;	/* not clamped to upper_offset */
+	}
+
+	if (st->frag_idx == 0 && !st->frag_data)
+		st->stepped_offset += skb_headlen(st->cur_skb);
+
+	while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
+		frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
+		block_limit = frag->size + st->stepped_offset;
+
+		if (abs_offset < block_limit) {
+			if (!st->frag_data)
+				st->frag_data = kmap_skb_frag(frag);	/* stays mapped across calls */
+
+			*data = (u8 *) st->frag_data + frag->page_offset +
+				(abs_offset - st->stepped_offset);
+
+			return block_limit - abs_offset;	/* not clamped to upper_offset */
+		}
+
+		if (st->frag_data) {
+			kunmap_skb_frag(st->frag_data);
+			st->frag_data = NULL;
+		}
+
+		st->frag_idx++;
+		st->stepped_offset += frag->size;
+	}
+	/* fragments exhausted: continue with ->next or the root's frag_list */
+	if (st->cur_skb->next) {
+		st->cur_skb = st->cur_skb->next;
+		st->frag_idx = 0;
+		goto next_skb;
+	} else if (st->root_skb == st->cur_skb &&
+		   skb_shinfo(st->root_skb)->frag_list) {
+		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;	/* NOTE(review): frag_idx is not reset here */
+		goto next_skb;
+	}
+
+	return 0;
+}
+
+/**
+ * skb_abort_seq_read - Abort a sequential read of skb data
+ * @st: state variable
+ *
+ * Unmaps the fragment the last skb_seq_read() left mapped, if any. Must be
+ * called if skb_seq_read() was not called until it returned 0.
+ */
+void skb_abort_seq_read(struct skb_seq_state *st)
+{
+	if (st->frag_data)
+		kunmap_skb_frag(st->frag_data);	/* st->frag_data is left as is */
+}
+
 void __init skb_init(void)
 {
 	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
@@ -1538,3 +1652,6 @@ EXPORT_SYMBOL(skb_queue_tail);
 EXPORT_SYMBOL(skb_unlink);
 EXPORT_SYMBOL(skb_append);
 EXPORT_SYMBOL(skb_split);
+EXPORT_SYMBOL(skb_prepare_seq_read);
+EXPORT_SYMBOL(skb_seq_read);
+EXPORT_SYMBOL(skb_abort_seq_read);
-- 
cgit v1.2.3-59-g8ed1b


From 3fc7e8a6d842f72d16d2623b1022814a635ab961 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 23 Jun 2005 21:00:17 -0700
Subject: [NET]: skb_find_text() - Find a text pattern in skb data

Finds a pattern in the skb data according to the specified
textsearch configuration. Use textsearch_next() to retrieve
subsequent occurrences of the pattern. Returns the offset
to the first occurrence or UINT_MAX if no match was found.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h |  5 +++++
 net/core/skbuff.c      | 40 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 171a37dff83a..416a2e4024b2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -27,6 +27,7 @@
 #include <linux/highmem.h>
 #include <linux/poll.h>
 #include <linux/net.h>
+#include <linux/textsearch.h>
 #include <net/checksum.h>
 
 #define HAVE_ALLOC_SKB		/* For the drivers to know */
@@ -339,6 +340,10 @@ extern unsigned int   skb_seq_read(unsigned int consumed, const u8 **data,
 				   struct skb_seq_state *st);
 extern void	      skb_abort_seq_read(struct skb_seq_state *st);
 
+extern unsigned int   skb_find_text(struct sk_buff *skb, unsigned int from,
+				    unsigned int to, struct ts_config *config,
+				    struct ts_state *state);
+
 /* Internal */
 #define skb_shinfo(SKB)		((struct skb_shared_info *)((SKB)->end))
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d285f2f7e812..bb73b2190ec7 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1614,6 +1614,45 @@ void skb_abort_seq_read(struct skb_seq_state *st)
 		kunmap_skb_frag(st->frag_data);
 }
 
+#define TS_SKB_CB(state)	((struct skb_seq_state *) &((state)->cb)) /* NOTE(review): assumes cb can hold a struct skb_seq_state - confirm */
+
+static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
+					  struct ts_config *conf,
+					  struct ts_state *state)
+{
+	return skb_seq_read(offset, text, TS_SKB_CB(state));	/* conf is unused */
+}
+
+static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
+{
+	skb_abort_seq_read(TS_SKB_CB(state));	/* drop a fragment mapping still held */
+}
+
+/**
+ * skb_find_text - Find a text pattern in skb data
+ * @skb: the buffer to look in
+ * @from: search offset
+ * @to: search limit
+ * @config: textsearch configuration, its block callbacks are overwritten
+ * @state: uninitialized textsearch state variable
+ *
+ * Finds a pattern in the skb data according to the specified
+ * textsearch configuration. Use textsearch_next() to retrieve
+ * subsequent occurrences of the pattern. Returns the offset
+ * to the first occurrence or UINT_MAX if no match was found.
+ */
+unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
+			   unsigned int to, struct ts_config *config,
+			   struct ts_state *state)
+{
+	config->get_next_block = skb_ts_get_next_block;	/* written on every call */
+	config->finish = skb_ts_finish;
+
+	skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state));	/* read state lives in state->cb */
+
+	return textsearch_find(config, state);
+}
+
 void __init skb_init(void)
 {
 	skbuff_head_cache = kmem_cache_create("skbuff_head_cache",
@@ -1655,3 +1694,4 @@ EXPORT_SYMBOL(skb_split);
 EXPORT_SYMBOL(skb_prepare_seq_read);
 EXPORT_SYMBOL(skb_seq_read);
 EXPORT_SYMBOL(skb_abort_seq_read);
+EXPORT_SYMBOL(skb_find_text);
-- 
cgit v1.2.3-59-g8ed1b


From d675c989ed2d4ba23dff615330b04371aea83534 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Thu, 23 Jun 2005 21:00:58 -0700
Subject: [PKT_SCHED]: Packet classification based on textsearch (ematch)

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pkt_cls.h              |   1 +
 include/linux/rtnetlink.h            |   7 +-
 include/linux/tc_ematch/tc_em_text.h |  19 +++++
 net/sched/Kconfig                    |  11 +++
 net/sched/Makefile                   |   1 +
 net/sched/em_text.c                  | 157 +++++++++++++++++++++++++++++++++++
 6 files changed, 194 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/tc_ematch/tc_em_text.h
 create mode 100644 net/sched/em_text.c

(limited to 'include/linux')

diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index d2aa214d6803..25d2d67c1faf 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -408,6 +408,7 @@ enum
 	TCF_EM_NBYTE,
 	TCF_EM_U32,
 	TCF_EM_META,
+	TCF_EM_TEXT,
 	__TCF_EM_MAX
 };
 
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index e68dbf0bf579..d021888b58f1 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -892,10 +892,13 @@ extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const voi
 		 goto rtattr_failure; \
    	__rta_fill(skb, attrtype, attrlen, data); }) 
 
-#define RTA_PUT_NOHDR(skb, attrlen, data) \
+#define RTA_APPEND(skb, attrlen, data) \
 ({	if (unlikely(skb_tailroom(skb) < (int)(attrlen))) \
 		goto rtattr_failure; \
-	memcpy(skb_put(skb, RTA_ALIGN(attrlen)), data, attrlen); })
+	memcpy(skb_put(skb, attrlen), data, attrlen); })
+
+#define RTA_PUT_NOHDR(skb, attrlen, data) \
+	RTA_APPEND(skb, RTA_ALIGN(attrlen), data) /* NOTE(review): memcpy length is now RTA_ALIGN(attrlen), not attrlen - may read past data */
 
 #define RTA_PUT_U8(skb, attrtype, value) \
 ({	u8 _tmp = (value); \
diff --git a/include/linux/tc_ematch/tc_em_text.h b/include/linux/tc_ematch/tc_em_text.h
new file mode 100644
index 000000000000..7cd43e99c7f5
--- /dev/null
+++ b/include/linux/tc_ematch/tc_em_text.h
@@ -0,0 +1,19 @@
+#ifndef __LINUX_TC_EM_TEXT_H
+#define __LINUX_TC_EM_TEXT_H
+
+#include <linux/pkt_cls.h>
+
+#define TC_EM_TEXT_ALGOSIZ	16
+
+struct tcf_em_text
+{
+	char		algo[TC_EM_TEXT_ALGOSIZ];	/* textsearch algorithm name */
+	__u16		from_offset;	/* added to the from_layer base */
+	__u16		to_offset;	/* added to the to_layer base */
+	__u16		pattern_len;	/* bytes of pattern following this struct */
+	__u8		from_layer:4;	/* layer the search starts in */
+	__u8		to_layer:4;	/* layer the search ends in */
+	__u8		pad;		/* zeroed by em_text_dump() */
+};
+
+#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index b22c9beb604d..95d9bc5d8621 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -449,6 +449,17 @@ config NET_EMATCH_META
 	  To compile this code as a module, choose M here: the
 	  module will be called em_meta.
 
+config NET_EMATCH_TEXT
+	tristate "Textsearch"
+	depends on NET_EMATCH
+	---help---
+	  Say Y here if you want to be able to classify packets based on
+	  textsearch comparisons. Please select the appropriate textsearch
+	  algorithms in the Library section.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called em_text.
+
 config NET_CLS_ACT
 	bool "Packet ACTION"
 	depends on EXPERIMENTAL && NET_CLS && NET_QOS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index eb3fe583eba8..8f58cecd6266 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -40,3 +40,4 @@ obj-$(CONFIG_NET_EMATCH_CMP)	+= em_cmp.o
 obj-$(CONFIG_NET_EMATCH_NBYTE)	+= em_nbyte.o
 obj-$(CONFIG_NET_EMATCH_U32)	+= em_u32.o
 obj-$(CONFIG_NET_EMATCH_META)	+= em_meta.o
+obj-$(CONFIG_NET_EMATCH_TEXT)	+= em_text.o
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
new file mode 100644
index 000000000000..873840d8d072
--- /dev/null
+++ b/net/sched/em_text.c
@@ -0,0 +1,157 @@
+/*
+ * net/sched/em_text.c	Textsearch ematch
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/textsearch.h>
+#include <linux/tc_ematch/tc_em_text.h>
+#include <net/pkt_cls.h>
+
+struct text_match
+{
+	u16			from_offset;	/* copied from tcf_em_text */
+	u16			to_offset;	/* copied from tcf_em_text */
+	u8			from_layer;	/* base layer for from_offset */
+	u8			to_layer;	/* base layer for to_offset */
+	struct ts_config	*config;	/* released in em_text_destroy() */
+};
+
+#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data)
+
+static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m,
+			 struct tcf_pkt_info *info)
+{
+	struct text_match *priv = EM_TEXT_PRIV(m);
+	struct ts_state ts;
+	int start = tcf_get_base_ptr(skb, priv->from_layer) - skb->data +
+		    priv->from_offset;
+	int end = tcf_get_base_ptr(skb, priv->to_layer) - skb->data +
+		  priv->to_offset;
+
+	/* bounds are layer base (relative to skb->data) plus configured offset */
+	if (skb_find_text(skb, start, end, priv->config, &ts) == UINT_MAX)
+		return 0;
+	return 1;
+}
+
+static int em_text_change(struct tcf_proto *tp, void *data, int len,
+			  struct tcf_ematch *m)
+{
+	struct text_match *tm;
+	struct tcf_em_text *conf = data;
+	struct ts_config *ts_conf;
+	int flags = 0;
+
+	/* Validate len before reading any field of conf; the header and the
+	 * pattern that follows it must both fit into the attribute.
+	 */
+	if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len))
+		return -EINVAL;
+
+	if (conf->from_layer > conf->to_layer)
+		return -EINVAL;
+
+	if (conf->from_layer == conf->to_layer &&
+	    conf->from_offset > conf->to_offset)
+		return -EINVAL;
+
+retry:
+	ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf),
+				     conf->pattern_len, GFP_KERNEL, flags);
+	/* re-take rtnl, which was released below for the autoload attempt */
+	if (flags & TS_AUTOLOAD)
+		rtnl_lock();
+
+	if (IS_ERR(ts_conf)) {
+		if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) {
+			rtnl_unlock();
+			flags |= TS_AUTOLOAD;
+			goto retry;
+		} else
+			return PTR_ERR(ts_conf);
+	} else if (flags & TS_AUTOLOAD) {
+		textsearch_destroy(ts_conf);
+		return -EAGAIN;	/* prepare only worked with rtnl released */
+	}
+
+	tm = kmalloc(sizeof(*tm), GFP_KERNEL);
+	if (tm == NULL) {
+		textsearch_destroy(ts_conf);
+		return -ENOBUFS;
+	}
+
+	tm->from_offset = conf->from_offset;
+	tm->to_offset   = conf->to_offset;
+	tm->from_layer  = conf->from_layer;
+	tm->to_layer    = conf->to_layer;
+	tm->config      = ts_conf;
+
+	m->datalen = sizeof(*tm);
+	m->data = (unsigned long) tm;
+
+	return 0;
+}
+
+static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
+{
+	textsearch_destroy(EM_TEXT_PRIV(m)->config);	/* NOTE(review): the text_match in m->data is not freed here - confirm the ematch core does */
+}
+
+static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
+{
+	struct text_match *tm = EM_TEXT_PRIV(m);
+	struct tcf_em_text conf;
+
+	memset(&conf, 0, sizeof(conf));	/* strncpy() below skips algo's last byte */
+	strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1);
+	conf.from_offset = tm->from_offset;
+	conf.to_offset = tm->to_offset;
+	conf.from_layer = tm->from_layer;
+	conf.to_layer = tm->to_layer;
+	conf.pattern_len = textsearch_get_pattern_len(tm->config);
+
+	RTA_PUT_NOHDR(skb, sizeof(conf), &conf);	/* header first ... */
+	RTA_APPEND(skb, conf.pattern_len, textsearch_get_pattern(tm->config));	/* ... then the pattern */
+	return 0;
+
+rtattr_failure:
+	return -1;	/* the RTA_* macros jump here when skb tailroom is short */
+}
+
+static struct tcf_ematch_ops em_text_ops = {
+	.kind	  = TCF_EM_TEXT,
+	.change	  = em_text_change,
+	.match	  = em_text_match,
+	.destroy  = em_text_destroy,
+	.dump	  = em_text_dump,
+	.owner	  = THIS_MODULE,
+	.link	  = LIST_HEAD_INIT(em_text_ops.link)
+};
+
+static int __init init_em_text(void)
+{
+	return tcf_em_register(&em_text_ops);
+}
+
+static void __exit exit_em_text(void) 
+{
+	tcf_em_unregister(&em_text_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_text);
+module_exit(exit_em_text);
-- 
cgit v1.2.3-59-g8ed1b


From 76d8aeabfeb1c42641a81c44280177b9a08670d8 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 23 Jun 2005 22:00:49 -0700
Subject: [PATCH] keys: Discard key spinlock and use RCU for key payload

The attached patch changes the key implementation in a number of ways:

 (1) It removes the spinlock from the key structure.

 (2) The key flags are now accessed using atomic bitops instead of
     write-locking the key spinlock and using C bitwise operators.

     The three instantiation flags are dealt with with the construction
     semaphore held during the request_key/instantiate/negate sequence, thus
     rendering the spinlock superfluous.

     The key flags are also now bit numbers not bit masks.

 (3) The key payload is now accessed using RCU. This permits the recursive
     keyring search algorithm to be simplified greatly since no locks need be
     taken other than the usual RCU preemption disablement. Searching now does
     not require any locks or semaphores to be held; merely that the starting
     keyring be pinned.

 (4) The keyring payload now includes an RCU head so that it can be disposed
     of by call_rcu(). This requires that the payload be copied on unlink to
     prevent introducing races in copy-down vs search-up.

 (5) The user key payload is now a structure with the data following it. It
     includes an RCU head like the keyring payload and for the same reason. It
     also contains a data length because the data length in the key may be
     changed on another CPU whilst an RCU protected read is in progress on the
     payload. This would then see the supposed RCU payload and the on-key data
     length getting out of sync.

     I'm tempted to drop the key's datalen entirely, except that it's used in
     conjunction with quota management and so is a little tricky to get rid
     of.

 (6) Update the keys documentation.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/keys.txt       | 285 ++++++++++++++++++++++++++-----------------
 include/linux/key-ui.h       |   6 +-
 include/linux/key.h          |  25 ++--
 security/keys/key.c          |  94 +++++++-------
 security/keys/keyctl.c       |  23 ++--
 security/keys/keyring.c      | 245 ++++++++++++++++++++++---------------
 security/keys/proc.c         |  21 ++--
 security/keys/process_keys.c |  12 +-
 security/keys/request_key.c  |  32 +++--
 security/keys/user_defined.c |  85 +++++++++----
 10 files changed, 480 insertions(+), 348 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 36d80aeeaf28..3df40c1fe15a 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -22,6 +22,7 @@ This document has the following sections:
 	- New procfs files
 	- Userspace system call interface
 	- Kernel services
+	- Notes on accessing payload contents
 	- Defining a key type
 	- Request-key callback service
 	- Key access filesystem
@@ -45,27 +46,26 @@ Each key has a number of attributes:
 	- State.
 
 
- (*) Each key is issued a serial number of type key_serial_t that is unique
-     for the lifetime of that key. All serial numbers are positive non-zero
-     32-bit integers.
+ (*) Each key is issued a serial number of type key_serial_t that is unique for
+     the lifetime of that key. All serial numbers are positive non-zero 32-bit
+     integers.
 
      Userspace programs can use a key's serial numbers as a way to gain access
      to it, subject to permission checking.
 
  (*) Each key is of a defined "type". Types must be registered inside the
-     kernel by a kernel service (such as a filesystem) before keys of that
-     type can be added or used. Userspace programs cannot define new types
-     directly.
+     kernel by a kernel service (such as a filesystem) before keys of that type
+     can be added or used. Userspace programs cannot define new types directly.
 
-     Key types are represented in the kernel by struct key_type. This defines
-     a number of operations that can be performed on a key of that type.
+     Key types are represented in the kernel by struct key_type. This defines a
+     number of operations that can be performed on a key of that type.
 
      Should a type be removed from the system, all the keys of that type will
      be invalidated.
 
  (*) Each key has a description. This should be a printable string. The key
-     type provides an operation to perform a match between the description on
-     a key and a criterion string.
+     type provides an operation to perform a match between the description on a
+     key and a criterion string.
 
  (*) Each key has an owner user ID, a group ID and a permissions mask. These
      are used to control what a process may do to a key from userspace, and
@@ -74,10 +74,10 @@ Each key has a number of attributes:
  (*) Each key can be set to expire at a specific time by the key type's
      instantiation function. Keys can also be immortal.
 
- (*) Each key can have a payload. This is a quantity of data that represent
-     the actual "key". In the case of a keyring, this is a list of keys to
-     which the keyring links; in the case of a user-defined key, it's an
-     arbitrary blob of data.
+ (*) Each key can have a payload. This is a quantity of data that represents the
+     actual "key". In the case of a keyring, this is a list of keys to which
+     the keyring links; in the case of a user-defined key, it's an arbitrary
+     blob of data.
 
      Having a payload is not required; and the payload can, in fact, just be a
      value stored in the struct key itself.
@@ -92,8 +92,8 @@ Each key has a number of attributes:
 
  (*) Each key can be in one of a number of basic states:
 
-     (*) Uninstantiated. The key exists, but does not have any data
-	 attached. Keys being requested from userspace will be in this state.
+     (*) Uninstantiated. The key exists, but does not have any data attached.
+     	 Keys being requested from userspace will be in this state.
 
      (*) Instantiated. This is the normal state. The key is fully formed, and
 	 has data attached.
@@ -140,10 +140,10 @@ The key service provides a number of features besides keys:
      clone, fork, vfork or execve occurs. A new keyring is created only when
      required.
 
-     The process-specific keyring is replaced with an empty one in the child
-     on clone, fork, vfork unless CLONE_THREAD is supplied, in which case it
-     is shared. execve also discards the process's process keyring and creates
-     a new one.
+     The process-specific keyring is replaced with an empty one in the child on
+     clone, fork, vfork unless CLONE_THREAD is supplied, in which case it is
+     shared. execve also discards the process's process keyring and creates a
+     new one.
 
      The session-specific keyring is persistent across clone, fork, vfork and
      execve, even when the latter executes a set-UID or set-GID binary. A
@@ -177,11 +177,11 @@ The key service provides a number of features besides keys:
      If a system call that modifies a key or keyring in some way would put the
      user over quota, the operation is refused and error EDQUOT is returned.
 
- (*) There's a system call interface by which userspace programs can create
-     and manipulate keys and keyrings.
+ (*) There's a system call interface by which userspace programs can create and
+     manipulate keys and keyrings.
 
- (*) There's a kernel interface by which services can register types and
-     search for keys.
+ (*) There's a kernel interface by which services can register types and search
+     for keys.
 
  (*) There's a way for the a search done from the kernel to call back to
      userspace to request a key that can't be found in a process's keyrings.
@@ -194,9 +194,9 @@ The key service provides a number of features besides keys:
 KEY ACCESS PERMISSIONS
 ======================
 
-Keys have an owner user ID, a group access ID, and a permissions mask. The
-mask has up to eight bits each for user, group and other access. Only five of
-each set of eight bits are defined. These permissions granted are:
+Keys have an owner user ID, a group access ID, and a permissions mask. The mask
+has up to eight bits each for user, group and other access. Only five of each
+set of eight bits are defined. These permissions granted are:
 
  (*) View
 
@@ -210,8 +210,8 @@ each set of eight bits are defined. These permissions granted are:
 
  (*) Write
 
-     This permits a key's payload to be instantiated or updated, or it allows
-     a link to be added to or removed from a keyring.
+     This permits a key's payload to be instantiated or updated, or it allows a
+     link to be added to or removed from a keyring.
 
  (*) Search
 
@@ -238,8 +238,8 @@ about the status of the key service:
  (*) /proc/keys
 
      This lists all the keys on the system, giving information about their
-     type, description and permissions. The payload of the key is not
-     available this way:
+     type, description and permissions. The payload of the key is not available
+     this way:
 
 	SERIAL   FLAGS  USAGE EXPY PERM   UID   GID   TYPE      DESCRIPTION: SUMMARY
 	00000001 I-----    39 perm 1f0000     0     0 keyring   _uid_ses.0: 1/4
@@ -318,21 +318,21 @@ The main syscalls are:
      If a key of the same type and description as that proposed already exists
      in the keyring, this will try to update it with the given payload, or it
      will return error EEXIST if that function is not supported by the key
-     type. The process must also have permission to write to the key to be
-     able to update it. The new key will have all user permissions granted and
-     no group or third party permissions.
+     type. The process must also have permission to write to the key to be able
+     to update it. The new key will have all user permissions granted and no
+     group or third party permissions.
 
-     Otherwise, this will attempt to create a new key of the specified type
-     and description, and to instantiate it with the supplied payload and
-     attach it to the keyring. In this case, an error will be generated if the
-     process does not have permission to write to the keyring.
+     Otherwise, this will attempt to create a new key of the specified type and
+     description, and to instantiate it with the supplied payload and attach it
+     to the keyring. In this case, an error will be generated if the process
+     does not have permission to write to the keyring.
 
      The payload is optional, and the pointer can be NULL if not required by
      the type. The payload is plen in size, and plen can be zero for an empty
      payload.
 
-     A new keyring can be generated by setting type "keyring", the keyring
-     name as the description (or NULL) and setting the payload to NULL.
+     A new keyring can be generated by setting type "keyring", the keyring name
+     as the description (or NULL) and setting the payload to NULL.
 
      User defined keys can be created by specifying type "user". It is
      recommended that a user defined key's description by prefixed with a type
@@ -369,9 +369,9 @@ The keyctl syscall functions are:
 	key_serial_t keyctl(KEYCTL_GET_KEYRING_ID, key_serial_t id,
 			    int create);
 
-     The special key specified by "id" is looked up (with the key being
-     created if necessary) and the ID of the key or keyring thus found is
-     returned if it exists.
+     The special key specified by "id" is looked up (with the key being created
+     if necessary) and the ID of the key or keyring thus found is returned if
+     it exists.
 
      If the key does not yet exist, the key will be created if "create" is
      non-zero; and the error ENOKEY will be returned if "create" is zero.
@@ -402,8 +402,8 @@ The keyctl syscall functions are:
 
      This will try to update the specified key with the given payload, or it
      will return error EOPNOTSUPP if that function is not supported by the key
-     type. The process must also have permission to write to the key to be
-     able to update it.
+     type. The process must also have permission to write to the key to be able
+     to update it.
 
      The payload is of length plen, and may be absent or empty as for
      add_key().
@@ -422,8 +422,8 @@ The keyctl syscall functions are:
 
 	long keyctl(KEYCTL_CHOWN, key_serial_t key, uid_t uid, gid_t gid);
 
-     This function permits a key's owner and group ID to be changed. Either
-     one of uid or gid can be set to -1 to suppress that change.
+     This function permits a key's owner and group ID to be changed. Either one
+     of uid or gid can be set to -1 to suppress that change.
 
      Only the superuser can change a key's owner to something other than the
      key's current owner. Similarly, only the superuser can change a key's
@@ -484,12 +484,12 @@ The keyctl syscall functions are:
 
 	long keyctl(KEYCTL_LINK, key_serial_t keyring, key_serial_t key);
 
-     This function creates a link from the keyring to the key. The process
-     must have write permission on the keyring and must have link permission
-     on the key.
+     This function creates a link from the keyring to the key. The process must
+     have write permission on the keyring and must have link permission on the
+     key.
 
-     Should the keyring not be a keyring, error ENOTDIR will result; and if
-     the keyring is full, error ENFILE will result.
+     Should the keyring not be a keyring, error ENOTDIR will result; and if the
+     keyring is full, error ENFILE will result.
 
      The link procedure checks the nesting of the keyrings, returning ELOOP if
      it appears to deep or EDEADLK if the link would introduce a cycle.
@@ -503,8 +503,8 @@ The keyctl syscall functions are:
      specified key, and removes it if found. Subsequent links to that key are
      ignored. The process must have write permission on the keyring.
 
-     If the keyring is not a keyring, error ENOTDIR will result; and if the
-     key is not present, error ENOENT will be the result.
+     If the keyring is not a keyring, error ENOTDIR will result; and if the key
+     is not present, error ENOENT will be the result.
 
 
  (*) Search a keyring tree for a key:
@@ -513,9 +513,9 @@ The keyctl syscall functions are:
 			    const char *type, const char *description,
 			    key_serial_t dest_keyring);
 
-     This searches the keyring tree headed by the specified keyring until a
-     key is found that matches the type and description criteria. Each keyring
-     is checked for keys before recursion into its children occurs.
+     This searches the keyring tree headed by the specified keyring until a key
+     is found that matches the type and description criteria. Each keyring is
+     checked for keys before recursion into its children occurs.
 
      The process must have search permission on the top level keyring, or else
      error EACCES will result. Only keyrings that the process has search
@@ -549,8 +549,8 @@ The keyctl syscall functions are:
      As much of the data as can be fitted into the buffer will be copied to
      userspace if the buffer pointer is not NULL.
 
-     On a successful return, the function will always return the amount of
-     data available rather than the amount copied.
+     On a successful return, the function will always return the amount of data
+     available rather than the amount copied.
 
 
  (*) Instantiate a partially constructed key.
@@ -568,8 +568,8 @@ The keyctl syscall functions are:
      it, and the key must be uninstantiated.
 
      If a keyring is specified (non-zero), the key will also be linked into
-     that keyring, however all the constraints applying in KEYCTL_LINK apply
-     in this case too.
+     that keyring, however all the constraints applying in KEYCTL_LINK apply in
+     this case too.
 
      The payload and plen arguments describe the payload data as for add_key().
 
@@ -587,8 +587,8 @@ The keyctl syscall functions are:
      it, and the key must be uninstantiated.
 
      If a keyring is specified (non-zero), the key will also be linked into
-     that keyring, however all the constraints applying in KEYCTL_LINK apply
-     in this case too.
+     that keyring, however all the constraints applying in KEYCTL_LINK apply in
+     this case too.
 
 
 ===============
@@ -601,17 +601,14 @@ be broken down into two areas: keys and key types.
 Dealing with keys is fairly straightforward. Firstly, the kernel service
 registers its type, then it searches for a key of that type. It should retain
 the key as long as it has need of it, and then it should release it. For a
-filesystem or device file, a search would probably be performed during the
-open call, and the key released upon close. How to deal with conflicting keys
-due to two different users opening the same file is left to the filesystem
-author to solve.
-
-When accessing a key's payload data, key->lock should be at least read locked,
-or else the data may be changed by an update being performed from userspace
-whilst the driver or filesystem is trying to access it. If no update method is
-supplied, then the key's payload may be accessed without holding a lock as
-there is no way to change it, provided it can be guaranteed that the key's
-type definition won't go away.
+filesystem or device file, a search would probably be performed during the open
+call, and the key released upon close. How to deal with conflicting keys due to
+two different users opening the same file is left to the filesystem author to
+solve.
+
+When accessing a key's payload contents, certain precautions must be taken to
+prevent access vs modification races. See the section "Notes on accessing
+payload contents" for more information.
 
 (*) To search for a key, call:
 
@@ -690,6 +687,54 @@ type definition won't go away.
 	void unregister_key_type(struct key_type *type);
 
 
+===================================
+NOTES ON ACCESSING PAYLOAD CONTENTS
+===================================
+
+The simplest payload is just a number in key->payload.value. In this case,
+there's no need to indulge in RCU or locking when accessing the payload.
+
+More complex payload contents must be allocated and a pointer to them set in
+key->payload.data. One of the following ways must be selected to access the
+data:
+
+ (1) Unmodifiable key type.
+
+     If the key type does not have an update method, then the key's payload can
+     be accessed without any form of locking, provided that it's known to be
+     instantiated (uninstantiated keys cannot be "found").
+
+ (2) The key's semaphore.
+
+     The semaphore could be used to govern access to the payload and to control
+     the payload pointer. It must be write-locked for modifications and would
+     have to be read-locked for general access. The disadvantage of doing this
+     is that the accessor may be required to sleep.
+
+ (3) RCU.
+
+     RCU must be used when the semaphore isn't already held; if the semaphore
+     is held then the contents can't change under you unexpectedly as the
+     semaphore must still be used to serialise modifications to the key. The
+     key management code takes care of this for the key type.
+
+     However, this means using:
+
+	rcu_read_lock() ... rcu_dereference() ... rcu_read_unlock()
+
+     to read the pointer, and:
+
+	rcu_dereference() ... rcu_assign_pointer() ... call_rcu()
+
+     to set the pointer and dispose of the old contents after a grace period.
+     Note that only the key type should ever modify a key's payload.
+
+     Furthermore, an RCU controlled payload must hold a struct rcu_head for the
+     use of call_rcu() and, if the payload is of variable size, the length of
+     the payload. key->datalen cannot be relied upon to be consistent with the
+     payload just dereferenced if the key's semaphore is not held.
+
+
 ===================
 DEFINING A KEY TYPE
 ===================
@@ -717,15 +762,15 @@ The structure has a number of fields, some of which are mandatory:
 
 	int key_payload_reserve(struct key *key, size_t datalen);
 
-     With the revised data length. Error EDQUOT will be returned if this is
-     not viable.
+     With the revised data length. Error EDQUOT will be returned if this is not
+     viable.
 
 
  (*) int (*instantiate)(struct key *key, const void *data, size_t datalen);
 
      This method is called to attach a payload to a key during construction.
-     The payload attached need not bear any relation to the data passed to
-     this function.
+     The payload attached need not bear any relation to the data passed to this
+     function.
 
      If the amount of data attached to the key differs from the size in
      keytype->def_datalen, then key_payload_reserve() should be called.
@@ -734,38 +779,47 @@ The structure has a number of fields, some of which are mandatory:
      The fact that KEY_FLAG_INSTANTIATED is not set in key->flags prevents
      anything else from gaining access to the key.
 
-     This method may sleep if it wishes.
+     It is safe to sleep in this method.
 
 
  (*) int (*duplicate)(struct key *key, const struct key *source);
 
      If this type of key can be duplicated, then this method should be
-     provided. It is called to copy the payload attached to the source into
-     the new key. The data length on the new key will have been updated and
-     the quota adjusted already.
+     provided. It is called to copy the payload attached to the source into the
+     new key. The data length on the new key will have been updated and the
+     quota adjusted already.
 
      This method will be called with the source key's semaphore read-locked to
-     prevent its payload from being changed. It is safe to sleep here.
+     prevent its payload from being changed, thus RCU constraints need not be
+     applied to the source key.
+
+     This method does not have to lock the destination key in order to attach a
+     payload. The fact that KEY_FLAG_INSTANTIATED is not set in key->flags
+     prevents anything else from gaining access to the key.
+
+     It is safe to sleep in this method.
 
 
  (*) int (*update)(struct key *key, const void *data, size_t datalen);
 
-     If this type of key can be updated, then this method should be
-     provided. It is called to update a key's payload from the blob of data
-     provided.
+     If this type of key can be updated, then this method should be provided.
+     It is called to update a key's payload from the blob of data provided.
 
      key_payload_reserve() should be called if the data length might change
-     before any changes are actually made. Note that if this succeeds, the
-     type is committed to changing the key because it's already been altered,
-     so all memory allocation must be done first.
+     before any changes are actually made. Note that if this succeeds, the type
+     is committed to changing the key because it's already been altered, so all
+     memory allocation must be done first.
+
+     The key will have its semaphore write-locked before this method is called,
+     but this only deters other writers; any changes to the key's payload must
+     be made under RCU conditions, and call_rcu() must be used to dispose of
+     the old payload.
 
-     key_payload_reserve() should be called with the key->lock write locked,
-     and the changes to the key's attached payload should be made before the
-     key is locked.
+     key_payload_reserve() should be called before the changes are made, but
+     after all allocations and other potentially failing function calls are
+     made.
 
-     The key will have its semaphore write-locked before this method is
-     called. Any changes to the key should be made with the key's rwlock
-     write-locked also. It is safe to sleep here.
+     It is safe to sleep in this method.
 
 
  (*) int (*match)(const struct key *key, const void *desc);
@@ -782,12 +836,12 @@ The structure has a number of fields, some of which are mandatory:
 
  (*) void (*destroy)(struct key *key);
 
-     This method is optional. It is called to discard the payload data on a
-     key when it is being destroyed.
+     This method is optional. It is called to discard the payload data on a key
+     when it is being destroyed.
 
-     This method does not need to lock the key; it can consider the key as
-     being inaccessible. Note that the key's type may have changed before this
-     function is called.
+     This method does not need to lock the key to access the payload; it can
+     consider the key as being inaccessible at this time. Note that the key's
+     type may have been changed before this function is called.
 
      It is not safe to sleep in this method; the caller may hold spinlocks.
 
@@ -797,26 +851,31 @@ The structure has a number of fields, some of which are mandatory:
      This method is optional. It is called during /proc/keys reading to
      summarise a key's description and payload in text form.
 
-     This method will be called with the key's rwlock read-locked. This will
-     prevent the key's payload and state changing; also the description should
-     not change. This also means it is not safe to sleep in this method.
+     This method will be called with the RCU read lock held. rcu_dereference()
+     should be used to read the payload pointer if the payload is to be
+     accessed. key->datalen cannot be trusted to stay consistent with the
+     contents of the payload.
+
+     The description will not change, though the key's state may.
+
+     It is not safe to sleep in this method; the RCU read lock is held by the
+     caller.
 
 
  (*) long (*read)(const struct key *key, char __user *buffer, size_t buflen);
 
      This method is optional. It is called by KEYCTL_READ to translate the
-     key's payload into something a blob of data for userspace to deal
-     with. Ideally, the blob should be in the same format as that passed in to
-     the instantiate and update methods.
+     key's payload into a blob of data for userspace to deal with.
+     Ideally, the blob should be in the same format as that passed in to the
+     instantiate and update methods.
 
      If successful, the blob size that could be produced should be returned
      rather than the size copied.
 
-     This method will be called with the key's semaphore read-locked. This
-     will prevent the key's payload changing. It is not necessary to also
-     read-lock key->lock when accessing the key's payload. It is safe to sleep
-     in this method, such as might happen when the userspace buffer is
-     accessed.
+     This method will be called with the key's semaphore read-locked. This will
+     prevent the key's payload changing. It is not necessary to use RCU locking
+     when accessing the key's payload. It is safe to sleep in this method, such
+     as might happen when the userspace buffer is accessed.
 
 
 ============================
@@ -853,8 +912,8 @@ If it returns with the key remaining in the unconstructed state, the key will
 be marked as being negative, it will be added to the session keyring, and an
 error will be returned to the key requestor.
 
-Supplementary information may be provided from whoever or whatever invoked
-this service. This will be passed as the <callout_info> parameter. If no such
+Supplementary information may be provided from whoever or whatever invoked this
+service. This will be passed as the <callout_info> parameter. If no such
 information was made available, then "-" will be passed as this parameter
 instead.
 
diff --git a/include/linux/key-ui.h b/include/linux/key-ui.h
index 60cc7b762e78..159ca8d54e9a 100644
--- a/include/linux/key-ui.h
+++ b/include/linux/key-ui.h
@@ -31,8 +31,10 @@ extern spinlock_t key_serial_lock;
  * subscribed
  */
 struct keyring_list {
-	unsigned	maxkeys;	/* max keys this list can hold */
-	unsigned	nkeys;		/* number of keys currently held */
+	struct rcu_head	rcu;		/* RCU deletion hook */
+	unsigned short	maxkeys;	/* max keys this list can hold */
+	unsigned short	nkeys;		/* number of keys currently held */
+	unsigned short	delkey;		/* key to be unlinked by RCU */
 	struct key	*keys[0];
 };
 
diff --git a/include/linux/key.h b/include/linux/key.h
index 6aa46d0e812f..2c24ffaca86f 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -18,7 +18,7 @@
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/rbtree.h>
-#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
 #include <asm/atomic.h>
 
 #ifdef __KERNEL__
@@ -78,7 +78,6 @@ struct key {
 	key_serial_t		serial;		/* key serial number */
 	struct rb_node		serial_node;
 	struct key_type		*type;		/* type of key */
-	rwlock_t		lock;		/* examination vs change lock */
 	struct rw_semaphore	sem;		/* change vs change sem */
 	struct key_user		*user;		/* owner of this key */
 	time_t			expiry;		/* time at which key expires (or 0) */
@@ -86,14 +85,10 @@ struct key {
 	gid_t			gid;
 	key_perm_t		perm;		/* access permissions */
 	unsigned short		quotalen;	/* length added to quota */
-	unsigned short		datalen;	/* payload data length */
-	unsigned short		flags;		/* status flags (change with lock writelocked) */
-#define KEY_FLAG_INSTANTIATED	0x00000001	/* set if key has been instantiated */
-#define KEY_FLAG_DEAD		0x00000002	/* set if key type has been deleted */
-#define KEY_FLAG_REVOKED	0x00000004	/* set if key had been revoked */
-#define KEY_FLAG_IN_QUOTA	0x00000008	/* set if key consumes quota */
-#define KEY_FLAG_USER_CONSTRUCT	0x00000010	/* set if key is being constructed in userspace */
-#define KEY_FLAG_NEGATIVE	0x00000020	/* set if key is negative */
+	unsigned short		datalen;	/* payload data length
+						 * - may not match RCU dereferenced payload
+						 * - payload should contain own length
+						 */
 
 #ifdef KEY_DEBUGGING
 	unsigned		magic;
@@ -101,6 +96,14 @@ struct key {
 #define KEY_DEBUG_MAGIC_X	0xf8e9dacbu
 #endif
 
+	unsigned long		flags;		/* status flags (change with bitops) */
+#define KEY_FLAG_INSTANTIATED	0	/* set if key has been instantiated */
+#define KEY_FLAG_DEAD		1	/* set if key type has been deleted */
+#define KEY_FLAG_REVOKED	2	/* set if key had been revoked */
+#define KEY_FLAG_IN_QUOTA	3	/* set if key consumes quota */
+#define KEY_FLAG_USER_CONSTRUCT	4	/* set if key is being constructed in userspace */
+#define KEY_FLAG_NEGATIVE	5	/* set if key is negative */
+
 	/* the description string
 	 * - this is used to match a key against search criteria
 	 * - this should be a printable string
@@ -250,6 +253,8 @@ extern int keyring_add_key(struct key *keyring,
 
 extern struct key *key_lookup(key_serial_t id);
 
+extern void keyring_replace_payload(struct key *key, void *replacement);
+
 #define key_serial(key) ((key) ? (key)->serial : 0)
 
 /*
diff --git a/security/keys/key.c b/security/keys/key.c
index 59402c843203..1fdfccb3fe43 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -294,7 +294,6 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 	}
 
 	atomic_set(&key->usage, 1);
-	rwlock_init(&key->lock);
 	init_rwsem(&key->sem);
 	key->type = type;
 	key->user = user;
@@ -308,7 +307,7 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 	key->payload.data = NULL;
 
 	if (!not_in_quota)
-		key->flags |= KEY_FLAG_IN_QUOTA;
+		key->flags |= 1 << KEY_FLAG_IN_QUOTA;
 
 	memset(&key->type_data, 0, sizeof(key->type_data));
 
@@ -359,7 +358,7 @@ int key_payload_reserve(struct key *key, size_t datalen)
 	key_check(key);
 
 	/* contemplate the quota adjustment */
-	if (delta != 0 && key->flags & KEY_FLAG_IN_QUOTA) {
+	if (delta != 0 && test_bit(KEY_FLAG_IN_QUOTA, &key->flags)) {
 		spin_lock(&key->user->lock);
 
 		if (delta > 0 &&
@@ -405,23 +404,17 @@ static int __key_instantiate_and_link(struct key *key,
 	down_write(&key_construction_sem);
 
 	/* can't instantiate twice */
-	if (!(key->flags & KEY_FLAG_INSTANTIATED)) {
+	if (!test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) {
 		/* instantiate the key */
 		ret = key->type->instantiate(key, data, datalen);
 
 		if (ret == 0) {
 			/* mark the key as being instantiated */
-			write_lock(&key->lock);
-
 			atomic_inc(&key->user->nikeys);
-			key->flags |= KEY_FLAG_INSTANTIATED;
+			set_bit(KEY_FLAG_INSTANTIATED, &key->flags);
 
-			if (key->flags & KEY_FLAG_USER_CONSTRUCT) {
-				key->flags &= ~KEY_FLAG_USER_CONSTRUCT;
+			if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags))
 				awaken = 1;
-			}
-
-			write_unlock(&key->lock);
 
 			/* and link it into the destination keyring */
 			if (keyring)
@@ -486,21 +479,17 @@ int key_negate_and_link(struct key *key,
 	down_write(&key_construction_sem);
 
 	/* can't instantiate twice */
-	if (!(key->flags & KEY_FLAG_INSTANTIATED)) {
+	if (!test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) {
 		/* mark the key as being negatively instantiated */
-		write_lock(&key->lock);
-
 		atomic_inc(&key->user->nikeys);
-		key->flags |= KEY_FLAG_INSTANTIATED | KEY_FLAG_NEGATIVE;
+		set_bit(KEY_FLAG_NEGATIVE, &key->flags);
+		set_bit(KEY_FLAG_INSTANTIATED, &key->flags);
 		now = current_kernel_time();
 		key->expiry = now.tv_sec + timeout;
 
-		if (key->flags & KEY_FLAG_USER_CONSTRUCT) {
-			key->flags &= ~KEY_FLAG_USER_CONSTRUCT;
+		if (test_and_clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags))
 			awaken = 1;
-		}
 
-		write_unlock(&key->lock);
 		ret = 0;
 
 		/* and link it into the destination keyring */
@@ -553,8 +542,10 @@ static void key_cleanup(void *data)
 	rb_erase(&key->serial_node, &key_serial_tree);
 	spin_unlock(&key_serial_lock);
 
+	key_check(key);
+
 	/* deal with the user's key tracking and quota */
-	if (key->flags & KEY_FLAG_IN_QUOTA) {
+	if (test_bit(KEY_FLAG_IN_QUOTA, &key->flags)) {
 		spin_lock(&key->user->lock);
 		key->user->qnkeys--;
 		key->user->qnbytes -= key->quotalen;
@@ -562,7 +553,7 @@ static void key_cleanup(void *data)
 	}
 
 	atomic_dec(&key->user->nkeys);
-	if (key->flags & KEY_FLAG_INSTANTIATED)
+	if (test_bit(KEY_FLAG_INSTANTIATED, &key->flags))
 		atomic_dec(&key->user->nikeys);
 
 	key_user_put(key->user);
@@ -631,9 +622,9 @@ struct key *key_lookup(key_serial_t id)
 	goto error;
 
  found:
-	/* pretent doesn't exist if it's dead */
+	/* pretend it doesn't exist if it's dead */
 	if (atomic_read(&key->usage) == 0 ||
-	    (key->flags & KEY_FLAG_DEAD) ||
+	    test_bit(KEY_FLAG_DEAD, &key->flags) ||
 	    key->type == &key_type_dead)
 		goto not_found;
 
@@ -708,12 +699,9 @@ static inline struct key *__key_update(struct key *key, const void *payload,
 
 	ret = key->type->update(key, payload, plen);
 
-	if (ret == 0) {
+	if (ret == 0)
 		/* updating a negative key instantiates it */
-		write_lock(&key->lock);
-		key->flags &= ~KEY_FLAG_NEGATIVE;
-		write_unlock(&key->lock);
-	}
+		clear_bit(KEY_FLAG_NEGATIVE, &key->flags);
 
 	up_write(&key->sem);
 
@@ -841,12 +829,9 @@ int key_update(struct key *key, const void *payload, size_t plen)
 		down_write(&key->sem);
 		ret = key->type->update(key, payload, plen);
 
-		if (ret == 0) {
+		if (ret == 0)
 			/* updating a negative key instantiates it */
-			write_lock(&key->lock);
-			key->flags &= ~KEY_FLAG_NEGATIVE;
-			write_unlock(&key->lock);
-		}
+			clear_bit(KEY_FLAG_NEGATIVE, &key->flags);
 
 		up_write(&key->sem);
 	}
@@ -892,10 +877,7 @@ struct key *key_duplicate(struct key *source, const char *desc)
 		goto error2;
 
 	atomic_inc(&key->user->nikeys);
-
-	write_lock(&key->lock);
-	key->flags |= KEY_FLAG_INSTANTIATED;
-	write_unlock(&key->lock);
+	set_bit(KEY_FLAG_INSTANTIATED, &key->flags);
 
  error_k:
 	up_read(&key_types_sem);
@@ -922,9 +904,7 @@ void key_revoke(struct key *key)
 	/* make sure no one's trying to change or use the key when we mark
 	 * it */
 	down_write(&key->sem);
-	write_lock(&key->lock);
-	key->flags |= KEY_FLAG_REVOKED;
-	write_unlock(&key->lock);
+	set_bit(KEY_FLAG_REVOKED, &key->flags);
 	up_write(&key->sem);
 
 } /* end key_revoke() */
@@ -975,24 +955,33 @@ void unregister_key_type(struct key_type *ktype)
 	/* withdraw the key type */
 	list_del_init(&ktype->link);
 
-	/* need to withdraw all keys of this type */
+	/* mark all the keys of this type dead */
 	spin_lock(&key_serial_lock);
 
 	for (_n = rb_first(&key_serial_tree); _n; _n = rb_next(_n)) {
 		key = rb_entry(_n, struct key, serial_node);
 
-		if (key->type != ktype)
-			continue;
+		if (key->type == ktype)
+			key->type = &key_type_dead;
+	}
+
+	spin_unlock(&key_serial_lock);
+
+	/* make sure everyone revalidates their keys */
+	synchronize_kernel();
+
+	/* we should now be able to destroy the payloads of all the keys of
+	 * this type with impunity */
+	spin_lock(&key_serial_lock);
 
-		write_lock(&key->lock);
-		key->type = &key_type_dead;
-		write_unlock(&key->lock);
+	for (_n = rb_first(&key_serial_tree); _n; _n = rb_next(_n)) {
+		key = rb_entry(_n, struct key, serial_node);
 
-		/* there shouldn't be anyone looking at the description or
-		 * payload now */
-		if (ktype->destroy)
-			ktype->destroy(key);
-		memset(&key->payload, 0xbd, sizeof(key->payload));
+		if (key->type == ktype) {
+			if (ktype->destroy)
+				ktype->destroy(key);
+			memset(&key->payload, 0xbd, sizeof(key->payload));
+		}
 	}
 
 	spin_unlock(&key_serial_lock);
@@ -1037,4 +1026,5 @@ void __init key_init(void)
 
 	/* link the two root keyrings together */
 	key_link(&root_session_keyring, &root_user_keyring);
+
 } /* end key_init() */
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index dc0011b3fac9..cedb7326de29 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -728,7 +728,6 @@ long keyctl_chown_key(key_serial_t id, uid_t uid, gid_t gid)
 	/* make the changes with the locks held to prevent chown/chown races */
 	ret = -EACCES;
 	down_write(&key->sem);
-	write_lock(&key->lock);
 
 	if (!capable(CAP_SYS_ADMIN)) {
 		/* only the sysadmin can chown a key to some other UID */
@@ -755,7 +754,6 @@ long keyctl_chown_key(key_serial_t id, uid_t uid, gid_t gid)
 	ret = 0;
 
  no_access:
-	write_unlock(&key->lock);
 	up_write(&key->sem);
 	key_put(key);
  error:
@@ -784,26 +782,19 @@ long keyctl_setperm_key(key_serial_t id, key_perm_t perm)
 		goto error;
 	}
 
-	/* make the changes with the locks held to prevent chown/chmod
-	 * races */
+	/* make the changes with the locks held to prevent chown/chmod races */
 	ret = -EACCES;
 	down_write(&key->sem);
-	write_lock(&key->lock);
 
-	/* if we're not the sysadmin, we can only chmod a key that we
-	 * own */
-	if (!capable(CAP_SYS_ADMIN) && key->uid != current->fsuid)
-		goto no_access;
-
-	/* changing the permissions mask */
-	key->perm = perm;
-	ret = 0;
+	/* if we're not the sysadmin, we can only change a key that we own */
+	if (capable(CAP_SYS_ADMIN) || key->uid == current->fsuid) {
+		key->perm = perm;
+		ret = 0;
+	}
 
- no_access:
-	write_unlock(&key->lock);
 	up_write(&key->sem);
 	key_put(key);
- error:
+error:
 	return ret;
 
 } /* end keyctl_setperm_key() */
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index e2ab4f8e7481..c9a5de197487 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -132,10 +132,17 @@ static int keyring_duplicate(struct key *keyring, const struct key *source)
 		(PAGE_SIZE - sizeof(*klist)) / sizeof(struct key);
 
 	ret = 0;
-	sklist = source->payload.subscriptions;
 
-	if (sklist && sklist->nkeys > 0) {
+	/* find out how many keys are currently linked */
+	rcu_read_lock();
+	sklist = rcu_dereference(source->payload.subscriptions);
+	max = 0;
+	if (sklist)
 		max = sklist->nkeys;
+	rcu_read_unlock();
+
+	/* allocate a new payload and load it with key links */
+	if (max > 0) {
 		BUG_ON(max > limit);
 
 		max = (max + 3) & ~3;
@@ -148,6 +155,10 @@ static int keyring_duplicate(struct key *keyring, const struct key *source)
 		if (!klist)
 			goto error;
 
+		/* set links */
+		rcu_read_lock();
+		sklist = rcu_dereference(source->payload.subscriptions);
+
 		klist->maxkeys = max;
 		klist->nkeys = sklist->nkeys;
 		memcpy(klist->keys,
@@ -157,7 +168,9 @@ static int keyring_duplicate(struct key *keyring, const struct key *source)
 		for (loop = klist->nkeys - 1; loop >= 0; loop--)
 			atomic_inc(&klist->keys[loop]->usage);
 
-		keyring->payload.subscriptions = klist;
+		rcu_read_unlock();
+
+		rcu_assign_pointer(keyring->payload.subscriptions, klist);
 		ret = 0;
 	}
 
@@ -192,7 +205,7 @@ static void keyring_destroy(struct key *keyring)
 		write_unlock(&keyring_name_lock);
 	}
 
-	klist = keyring->payload.subscriptions;
+	klist = rcu_dereference(keyring->payload.subscriptions);
 	if (klist) {
 		for (loop = klist->nkeys - 1; loop >= 0; loop--)
 			key_put(klist->keys[loop]);
@@ -216,17 +229,20 @@ static void keyring_describe(const struct key *keyring, struct seq_file *m)
 		seq_puts(m, "[anon]");
 	}
 
-	klist = keyring->payload.subscriptions;
+	rcu_read_lock();
+	klist = rcu_dereference(keyring->payload.subscriptions);
 	if (klist)
 		seq_printf(m, ": %u/%u", klist->nkeys, klist->maxkeys);
 	else
 		seq_puts(m, ": empty");
+	rcu_read_unlock();
 
 } /* end keyring_describe() */
 
 /*****************************************************************************/
 /*
  * read a list of key IDs from the keyring's contents
+ * - the keyring's semaphore is read-locked
  */
 static long keyring_read(const struct key *keyring,
 			 char __user *buffer, size_t buflen)
@@ -237,7 +253,7 @@ static long keyring_read(const struct key *keyring,
 	int loop, ret;
 
 	ret = 0;
-	klist = keyring->payload.subscriptions;
+	klist = rcu_dereference(keyring->payload.subscriptions);
 
 	if (klist) {
 		/* calculate how much data we could return */
@@ -320,7 +336,7 @@ struct key *keyring_search_aux(struct key *keyring,
 			       key_match_func_t match)
 {
 	struct {
-		struct key *keyring;
+		struct keyring_list *keylist;
 		int kix;
 	} stack[KEYRING_SEARCH_MAX_DEPTH];
 
@@ -328,10 +344,12 @@ struct key *keyring_search_aux(struct key *keyring,
 	struct timespec now;
 	struct key *key;
 	long err;
-	int sp, psp, kix;
+	int sp, kix;
 
 	key_check(keyring);
 
+	rcu_read_lock();
+
 	/* top keyring must have search permission to begin the search */
 	key = ERR_PTR(-EACCES);
 	if (!key_permission(keyring, KEY_SEARCH))
@@ -347,11 +365,10 @@ struct key *keyring_search_aux(struct key *keyring,
 
 	/* start processing a new keyring */
  descend:
-	read_lock(&keyring->lock);
-	if (keyring->flags & KEY_FLAG_REVOKED)
+	if (test_bit(KEY_FLAG_REVOKED, &keyring->flags))
 		goto not_this_keyring;
 
-	keylist = keyring->payload.subscriptions;
+	keylist = rcu_dereference(keyring->payload.subscriptions);
 	if (!keylist)
 		goto not_this_keyring;
 
@@ -364,7 +381,7 @@ struct key *keyring_search_aux(struct key *keyring,
 			continue;
 
 		/* skip revoked keys and expired keys */
-		if (key->flags & KEY_FLAG_REVOKED)
+		if (test_bit(KEY_FLAG_REVOKED, &key->flags))
 			continue;
 
 		if (key->expiry && now.tv_sec >= key->expiry)
@@ -379,7 +396,7 @@ struct key *keyring_search_aux(struct key *keyring,
 			continue;
 
 		/* we set a different error code if we find a negative key */
-		if (key->flags & KEY_FLAG_NEGATIVE) {
+		if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) {
 			err = -ENOKEY;
 			continue;
 		}
@@ -390,48 +407,37 @@ struct key *keyring_search_aux(struct key *keyring,
 	/* search through the keyrings nested in this one */
 	kix = 0;
  ascend:
-	while (kix < keylist->nkeys) {
+	for (; kix < keylist->nkeys; kix++) {
 		key = keylist->keys[kix];
 		if (key->type != &key_type_keyring)
-			goto next;
+			continue;
 
 		/* recursively search nested keyrings
 		 * - only search keyrings for which we have search permission
 		 */
 		if (sp >= KEYRING_SEARCH_MAX_DEPTH)
-			goto next;
+			continue;
 
 		if (!key_permission(key, KEY_SEARCH))
-			goto next;
-
-		/* evade loops in the keyring tree */
-		for (psp = 0; psp < sp; psp++)
-			if (stack[psp].keyring == keyring)
-				goto next;
+			continue;
 
 		/* stack the current position */
-		stack[sp].keyring = keyring;
+		stack[sp].keylist = keylist;
 		stack[sp].kix = kix;
 		sp++;
 
 		/* begin again with the new keyring */
 		keyring = key;
 		goto descend;
-
-	next:
-		kix++;
 	}
 
 	/* the keyring we're looking at was disqualified or didn't contain a
 	 * matching key */
  not_this_keyring:
-	read_unlock(&keyring->lock);
-
 	if (sp > 0) {
 		/* resume the processing of a keyring higher up in the tree */
 		sp--;
-		keyring = stack[sp].keyring;
-		keylist = keyring->payload.subscriptions;
+		keylist = stack[sp].keylist;
 		kix = stack[sp].kix + 1;
 		goto ascend;
 	}
@@ -442,16 +448,9 @@ struct key *keyring_search_aux(struct key *keyring,
 	/* we found a viable match */
  found:
 	atomic_inc(&key->usage);
-	read_unlock(&keyring->lock);
-
-	/* unwind the keyring stack */
-	while (sp > 0) {
-		sp--;
-		read_unlock(&stack[sp].keyring->lock);
-	}
-
 	key_check(key);
  error:
+	rcu_read_unlock();
 	return key;
 
 } /* end keyring_search_aux() */
@@ -489,7 +488,9 @@ struct key *__keyring_search_one(struct key *keyring,
 	struct key *key;
 	int loop;
 
-	klist = keyring->payload.subscriptions;
+	rcu_read_lock();
+
+	klist = rcu_dereference(keyring->payload.subscriptions);
 	if (klist) {
 		for (loop = 0; loop < klist->nkeys; loop++) {
 			key = klist->keys[loop];
@@ -497,7 +498,7 @@ struct key *__keyring_search_one(struct key *keyring,
 			if (key->type == ktype &&
 			    key->type->match(key, description) &&
 			    key_permission(key, perm) &&
-			    !(key->flags & KEY_FLAG_REVOKED)
+			    !test_bit(KEY_FLAG_REVOKED, &key->flags)
 			    )
 				goto found;
 		}
@@ -509,6 +510,7 @@ struct key *__keyring_search_one(struct key *keyring,
  found:
 	atomic_inc(&key->usage);
  error:
+	rcu_read_unlock();
 	return key;
 
 } /* end __keyring_search_one() */
@@ -540,7 +542,7 @@ struct key *find_keyring_by_name(const char *name, key_serial_t bound)
 				    &keyring_name_hash[bucket],
 				    type_data.link
 				    ) {
-			if (keyring->flags & KEY_FLAG_REVOKED)
+			if (test_bit(KEY_FLAG_REVOKED, &keyring->flags))
 				continue;
 
 			if (strcmp(keyring->description, name) != 0)
@@ -579,7 +581,7 @@ struct key *find_keyring_by_name(const char *name, key_serial_t bound)
 static int keyring_detect_cycle(struct key *A, struct key *B)
 {
 	struct {
-		struct key *subtree;
+		struct keyring_list *keylist;
 		int kix;
 	} stack[KEYRING_SEARCH_MAX_DEPTH];
 
@@ -587,20 +589,21 @@ static int keyring_detect_cycle(struct key *A, struct key *B)
 	struct key *subtree, *key;
 	int sp, kix, ret;
 
+	rcu_read_lock();
+
 	ret = -EDEADLK;
 	if (A == B)
-		goto error;
+		goto cycle_detected;
 
 	subtree = B;
 	sp = 0;
 
 	/* start processing a new keyring */
  descend:
-	read_lock(&subtree->lock);
-	if (subtree->flags & KEY_FLAG_REVOKED)
+	if (test_bit(KEY_FLAG_REVOKED, &subtree->flags))
 		goto not_this_keyring;
 
-	keylist = subtree->payload.subscriptions;
+	keylist = rcu_dereference(subtree->payload.subscriptions);
 	if (!keylist)
 		goto not_this_keyring;
 	kix = 0;
@@ -619,7 +622,7 @@ static int keyring_detect_cycle(struct key *A, struct key *B)
 				goto too_deep;
 
 			/* stack the current position */
-			stack[sp].subtree = subtree;
+			stack[sp].keylist = keylist;
 			stack[sp].kix = kix;
 			sp++;
 
@@ -632,13 +635,10 @@ static int keyring_detect_cycle(struct key *A, struct key *B)
 	/* the keyring we're looking at was disqualified or didn't contain a
 	 * matching key */
  not_this_keyring:
-	read_unlock(&subtree->lock);
-
 	if (sp > 0) {
 		/* resume the checking of a keyring higher up in the tree */
 		sp--;
-		subtree = stack[sp].subtree;
-		keylist = subtree->payload.subscriptions;
+		keylist = stack[sp].keylist;
 		kix = stack[sp].kix + 1;
 		goto ascend;
 	}
@@ -646,30 +646,36 @@ static int keyring_detect_cycle(struct key *A, struct key *B)
 	ret = 0; /* no cycles detected */
 
  error:
+	rcu_read_unlock();
 	return ret;
 
  too_deep:
 	ret = -ELOOP;
-	goto error_unwind;
+	goto error;
+
  cycle_detected:
 	ret = -EDEADLK;
- error_unwind:
-	read_unlock(&subtree->lock);
-
-	/* unwind the keyring stack */
-	while (sp > 0) {
-		sp--;
-		read_unlock(&stack[sp].subtree->lock);
-	}
-
 	goto error;
 
 } /* end keyring_detect_cycle() */
 
+/*****************************************************************************/
+/*
+ * dispose of a keyring list after the RCU grace period
+ */
+static void keyring_link_rcu_disposal(struct rcu_head *rcu)
+{
+	struct keyring_list *klist =
+		container_of(rcu, struct keyring_list, rcu);
+
+	kfree(klist);
+
+} /* end keyring_link_rcu_disposal() */
+
 /*****************************************************************************/
 /*
  * link a key into to a keyring
- * - must be called with the keyring's semaphore held
+ * - must be called with the keyring's semaphore write-locked
  */
 int __key_link(struct key *keyring, struct key *key)
 {
@@ -679,7 +685,7 @@ int __key_link(struct key *keyring, struct key *key)
 	int ret;
 
 	ret = -EKEYREVOKED;
-	if (keyring->flags & KEY_FLAG_REVOKED)
+	if (test_bit(KEY_FLAG_REVOKED, &keyring->flags))
 		goto error;
 
 	ret = -ENOTDIR;
@@ -710,9 +716,10 @@ int __key_link(struct key *keyring, struct key *key)
 		/* there's sufficient slack space to add directly */
 		atomic_inc(&key->usage);
 
-		write_lock(&keyring->lock);
-		klist->keys[klist->nkeys++] = key;
-		write_unlock(&keyring->lock);
+		klist->keys[klist->nkeys] = key;
+		smp_wmb();
+		klist->nkeys++;
+		smp_wmb();
 
 		ret = 0;
 	}
@@ -723,6 +730,8 @@ int __key_link(struct key *keyring, struct key *key)
 			max += klist->maxkeys;
 
 		ret = -ENFILE;
+		if (max > 65535)
+			goto error3;
 		size = sizeof(*klist) + sizeof(*key) * max;
 		if (size > PAGE_SIZE)
 			goto error3;
@@ -743,14 +752,13 @@ int __key_link(struct key *keyring, struct key *key)
 
 		/* add the key into the new space */
 		atomic_inc(&key->usage);
-
-		write_lock(&keyring->lock);
-		keyring->payload.subscriptions = nklist;
 		nklist->keys[nklist->nkeys++] = key;
-		write_unlock(&keyring->lock);
+
+		rcu_assign_pointer(keyring->payload.subscriptions, nklist);
 
 		/* dispose of the old keyring list */
-		kfree(klist);
+		if (klist)
+			call_rcu(&klist->rcu, keyring_link_rcu_disposal);
 
 		ret = 0;
 	}
@@ -789,13 +797,28 @@ int key_link(struct key *keyring, struct key *key)
 
 EXPORT_SYMBOL(key_link);
 
+/*****************************************************************************/
+/*
+ * dispose of a keyring list after the RCU grace period, freeing the unlinked
+ * key
+ */
+static void keyring_unlink_rcu_disposal(struct rcu_head *rcu)
+{
+	struct keyring_list *klist =
+		container_of(rcu, struct keyring_list, rcu);
+
+	key_put(klist->keys[klist->delkey]);
+	kfree(klist);
+
+} /* end keyring_unlink_rcu_disposal() */
+
 /*****************************************************************************/
 /*
  * unlink the first link to a key from a keyring
  */
 int key_unlink(struct key *keyring, struct key *key)
 {
-	struct keyring_list *klist;
+	struct keyring_list *klist, *nklist;
 	int loop, ret;
 
 	key_check(keyring);
@@ -819,36 +842,69 @@ int key_unlink(struct key *keyring, struct key *key)
 	ret = -ENOENT;
 	goto error;
 
- key_is_present:
+key_is_present:
+	/* we need to copy the key list for RCU purposes */
+	nklist = kmalloc(sizeof(*klist) + sizeof(*key) * klist->maxkeys,
+			 GFP_KERNEL);
+	if (!nklist)
+		goto nomem;
+	nklist->maxkeys = klist->maxkeys;
+	nklist->nkeys = klist->nkeys - 1;
+
+	if (loop > 0)
+		memcpy(&nklist->keys[0],
+		       &klist->keys[0],
+		       loop * sizeof(klist->keys[0]));
+
+	if (loop < nklist->nkeys)
+		memcpy(&nklist->keys[loop],
+		       &klist->keys[loop + 1],
+		       (nklist->nkeys - loop) * sizeof(klist->keys[0]));
+
 	/* adjust the user's quota */
 	key_payload_reserve(keyring,
 			    keyring->datalen - KEYQUOTA_LINK_BYTES);
 
-	/* shuffle down the key pointers
-	 * - it might be worth shrinking the allocated memory, but that runs
-	 *   the risk of ENOMEM as we would have to copy
-	 */
-	write_lock(&keyring->lock);
+	rcu_assign_pointer(keyring->payload.subscriptions, nklist);
 
-	klist->nkeys--;
-	if (loop < klist->nkeys)
-		memcpy(&klist->keys[loop],
-		       &klist->keys[loop + 1],
-		       (klist->nkeys - loop) * sizeof(struct key *));
+	up_write(&keyring->sem);
 
-	write_unlock(&keyring->lock);
+	/* schedule for later cleanup */
+	klist->delkey = loop;
+	call_rcu(&klist->rcu, keyring_unlink_rcu_disposal);
 
-	up_write(&keyring->sem);
-	key_put(key);
 	ret = 0;
 
- error:
+error:
 	return ret;
+nomem:
+	ret = -ENOMEM;
+	up_write(&keyring->sem);
+	goto error;
 
 } /* end key_unlink() */
 
 EXPORT_SYMBOL(key_unlink);
 
+/*****************************************************************************/
+/*
+ * dispose of a keyring list after the RCU grace period, releasing the keys it
+ * links to
+ */
+static void keyring_clear_rcu_disposal(struct rcu_head *rcu)
+{
+	struct keyring_list *klist;
+	int loop;
+
+	klist = container_of(rcu, struct keyring_list, rcu);
+
+	for (loop = klist->nkeys - 1; loop >= 0; loop--)
+		key_put(klist->keys[loop]);
+
+	kfree(klist);
+
+} /* end keyring_clear_rcu_disposal() */
+
 /*****************************************************************************/
 /*
  * clear the specified process keyring
@@ -857,7 +913,7 @@ EXPORT_SYMBOL(key_unlink);
 int keyring_clear(struct key *keyring)
 {
 	struct keyring_list *klist;
-	int loop, ret;
+	int ret;
 
 	ret = -ENOTDIR;
 	if (keyring->type == &key_type_keyring) {
@@ -870,20 +926,15 @@ int keyring_clear(struct key *keyring)
 			key_payload_reserve(keyring,
 					    sizeof(struct keyring_list));
 
-			write_lock(&keyring->lock);
-			keyring->payload.subscriptions = NULL;
-			write_unlock(&keyring->lock);
+			rcu_assign_pointer(keyring->payload.subscriptions,
+					   NULL);
 		}
 
 		up_write(&keyring->sem);
 
 		/* free the keys after the locks have been dropped */
-		if (klist) {
-			for (loop = klist->nkeys - 1; loop >= 0; loop--)
-				key_put(klist->keys[loop]);
-
-			kfree(klist);
-		}
+		if (klist)
+			call_rcu(&klist->rcu, keyring_clear_rcu_disposal);
 
 		ret = 0;
 	}
diff --git a/security/keys/proc.c b/security/keys/proc.c
index 91343b85c39c..c55cf1fd0826 100644
--- a/security/keys/proc.c
+++ b/security/keys/proc.c
@@ -140,7 +140,7 @@ static int proc_keys_show(struct seq_file *m, void *v)
 
 	now = current_kernel_time();
 
-	read_lock(&key->lock);
+	rcu_read_lock();
 
 	/* come up with a suitable timeout value */
 	if (key->expiry == 0) {
@@ -164,14 +164,17 @@ static int proc_keys_show(struct seq_file *m, void *v)
 			sprintf(xbuf, "%luw", timo / (60*60*24*7));
 	}
 
+#define showflag(KEY, LETTER, FLAG) \
+	(test_bit(FLAG,	&(KEY)->flags) ? LETTER : '-')
+
 	seq_printf(m, "%08x %c%c%c%c%c%c %5d %4s %06x %5d %5d %-9.9s ",
 		   key->serial,
-		   key->flags & KEY_FLAG_INSTANTIATED	? 'I' : '-',
-		   key->flags & KEY_FLAG_REVOKED	? 'R' : '-',
-		   key->flags & KEY_FLAG_DEAD		? 'D' : '-',
-		   key->flags & KEY_FLAG_IN_QUOTA	? 'Q' : '-',
-		   key->flags & KEY_FLAG_USER_CONSTRUCT	? 'U' : '-',
-		   key->flags & KEY_FLAG_NEGATIVE	? 'N' : '-',
+		   showflag(key, 'I', KEY_FLAG_INSTANTIATED),
+		   showflag(key, 'R', KEY_FLAG_REVOKED),
+		   showflag(key, 'D', KEY_FLAG_DEAD),
+		   showflag(key, 'Q', KEY_FLAG_IN_QUOTA),
+		   showflag(key, 'U', KEY_FLAG_USER_CONSTRUCT),
+		   showflag(key, 'N', KEY_FLAG_NEGATIVE),
 		   atomic_read(&key->usage),
 		   xbuf,
 		   key->perm,
@@ -179,11 +182,13 @@ static int proc_keys_show(struct seq_file *m, void *v)
 		   key->gid,
 		   key->type->name);
 
+#undef showflag
+
 	if (key->type->describe)
 		key->type->describe(key, m);
 	seq_putc(m, '\n');
 
-	read_unlock(&key->lock);
+	rcu_read_unlock();
 
 	return 0;
 
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 2eb0e471cd40..059c350cac46 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -38,10 +38,9 @@ struct key root_user_keyring = {
 	.serial		= 2,
 	.type		= &key_type_keyring,
 	.user		= &root_key_user,
-	.lock		= RW_LOCK_UNLOCKED,
 	.sem		= __RWSEM_INITIALIZER(root_user_keyring.sem),
 	.perm		= KEY_USR_ALL,
-	.flags		= KEY_FLAG_INSTANTIATED,
+	.flags		= 1 << KEY_FLAG_INSTANTIATED,
 	.description	= "_uid.0",
 #ifdef KEY_DEBUGGING
 	.magic		= KEY_DEBUG_MAGIC,
@@ -54,10 +53,9 @@ struct key root_session_keyring = {
 	.serial		= 1,
 	.type		= &key_type_keyring,
 	.user		= &root_key_user,
-	.lock		= RW_LOCK_UNLOCKED,
 	.sem		= __RWSEM_INITIALIZER(root_session_keyring.sem),
 	.perm		= KEY_USR_ALL,
-	.flags		= KEY_FLAG_INSTANTIATED,
+	.flags		= 1 << KEY_FLAG_INSTANTIATED,
 	.description	= "_uid_ses.0",
 #ifdef KEY_DEBUGGING
 	.magic		= KEY_DEBUG_MAGIC,
@@ -349,9 +347,7 @@ void key_fsuid_changed(struct task_struct *tsk)
 	/* update the ownership of the thread keyring */
 	if (tsk->thread_keyring) {
 		down_write(&tsk->thread_keyring->sem);
-		write_lock(&tsk->thread_keyring->lock);
 		tsk->thread_keyring->uid = tsk->fsuid;
-		write_unlock(&tsk->thread_keyring->lock);
 		up_write(&tsk->thread_keyring->sem);
 	}
 
@@ -366,9 +362,7 @@ void key_fsgid_changed(struct task_struct *tsk)
 	/* update the ownership of the thread keyring */
 	if (tsk->thread_keyring) {
 		down_write(&tsk->thread_keyring->sem);
-		write_lock(&tsk->thread_keyring->lock);
 		tsk->thread_keyring->gid = tsk->fsgid;
-		write_unlock(&tsk->thread_keyring->lock);
 		up_write(&tsk->thread_keyring->sem);
 	}
 
@@ -588,7 +582,7 @@ struct key *lookup_user_key(key_serial_t id, int create, int partial,
 	}
 
 	ret = -EIO;
-	if (!partial && !(key->flags & KEY_FLAG_INSTANTIATED))
+	if (!partial && !test_bit(KEY_FLAG_INSTANTIATED, &key->flags))
 		goto invalid_key;
 
 	ret = -EACCES;
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 9705b1aeba5d..1f6c0940297f 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -105,7 +105,7 @@ static struct key *__request_key_construction(struct key_type *type,
 	struct key_construction cons;
 	struct timespec now;
 	struct key *key;
-	int ret, negative;
+	int ret, negated;
 
 	/* create a key and add it to the queue */
 	key = key_alloc(type, description,
@@ -113,9 +113,7 @@ static struct key *__request_key_construction(struct key_type *type,
 	if (IS_ERR(key))
 		goto alloc_failed;
 
-	write_lock(&key->lock);
-	key->flags |= KEY_FLAG_USER_CONSTRUCT;
-	write_unlock(&key->lock);
+	set_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags);
 
 	cons.key = key;
 	list_add_tail(&cons.link, &key->user->consq);
@@ -130,7 +128,7 @@ static struct key *__request_key_construction(struct key_type *type,
 
 	/* if the key wasn't instantiated, then we want to give an error */
 	ret = -ENOKEY;
-	if (!(key->flags & KEY_FLAG_INSTANTIATED))
+	if (!test_bit(KEY_FLAG_INSTANTIATED, &key->flags))
 		goto request_failed;
 
 	down_write(&key_construction_sem);
@@ -139,7 +137,7 @@ static struct key *__request_key_construction(struct key_type *type,
 
 	/* also give an error if the key was negatively instantiated */
  check_not_negative:
-	if (key->flags & KEY_FLAG_NEGATIVE) {
+	if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) {
 		key_put(key);
 		key = ERR_PTR(-ENOKEY);
 	}
@@ -152,24 +150,23 @@ static struct key *__request_key_construction(struct key_type *type,
 	 * - remove from construction queue
 	 * - mark the key as dead
 	 */
-	negative = 0;
+	negated = 0;
 	down_write(&key_construction_sem);
 
 	list_del(&cons.link);
 
-	write_lock(&key->lock);
-	key->flags &= ~KEY_FLAG_USER_CONSTRUCT;
-
 	/* check it didn't get instantiated between the check and the down */
-	if (!(key->flags & KEY_FLAG_INSTANTIATED)) {
-		key->flags |= KEY_FLAG_INSTANTIATED | KEY_FLAG_NEGATIVE;
-		negative = 1;
+	if (!test_bit(KEY_FLAG_INSTANTIATED, &key->flags)) {
+		set_bit(KEY_FLAG_NEGATIVE, &key->flags);
+		set_bit(KEY_FLAG_INSTANTIATED, &key->flags);
+		negated = 1;
 	}
 
-	write_unlock(&key->lock);
+	clear_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags);
+
 	up_write(&key_construction_sem);
 
-	if (!negative)
+	if (!negated)
 		goto check_not_negative; /* surprisingly, the key got
 					  * instantiated */
 
@@ -250,7 +247,7 @@ static struct key *request_key_construction(struct key_type *type,
 
 	for (;;) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (!(ckey->flags & KEY_FLAG_USER_CONSTRUCT))
+		if (!test_bit(KEY_FLAG_USER_CONSTRUCT, &ckey->flags))
 			break;
 		schedule();
 	}
@@ -339,7 +336,8 @@ int key_validate(struct key *key)
 	if (key) {
 		/* check it's still accessible */
 		ret = -EKEYREVOKED;
-		if (key->flags & (KEY_FLAG_REVOKED | KEY_FLAG_DEAD))
+		if (test_bit(KEY_FLAG_REVOKED, &key->flags) ||
+		    test_bit(KEY_FLAG_DEAD, &key->flags))
 			goto error;
 
 		/* check it hasn't expired */
diff --git a/security/keys/user_defined.c b/security/keys/user_defined.c
index 8d65b3a28129..c33d3614a0db 100644
--- a/security/keys/user_defined.c
+++ b/security/keys/user_defined.c
@@ -42,12 +42,19 @@ struct key_type key_type_user = {
 	.read		= user_read,
 };
 
+struct user_key_payload {
+	struct rcu_head	rcu;		/* RCU destructor */
+	unsigned short	datalen;	/* length of this data */
+	char		data[0];	/* actual data */
+};
+
 /*****************************************************************************/
 /*
  * instantiate a user defined key
  */
 static int user_instantiate(struct key *key, const void *data, size_t datalen)
 {
+	struct user_key_payload *upayload;
 	int ret;
 
 	ret = -EINVAL;
@@ -58,13 +65,15 @@ static int user_instantiate(struct key *key, const void *data, size_t datalen)
 	if (ret < 0)
 		goto error;
 
-	/* attach the data */
 	ret = -ENOMEM;
-	key->payload.data = kmalloc(datalen, GFP_KERNEL);
-	if (!key->payload.data)
+	upayload = kmalloc(sizeof(*upayload) + datalen, GFP_KERNEL);
+	if (!upayload)
 		goto error;
 
-	memcpy(key->payload.data, data, datalen);
+	/* attach the data */
+	upayload->datalen = datalen;
+	memcpy(upayload->data, data, datalen);
+	rcu_assign_pointer(key->payload.data, upayload);
 	ret = 0;
 
  error:
@@ -75,18 +84,25 @@ static int user_instantiate(struct key *key, const void *data, size_t datalen)
 /*****************************************************************************/
 /*
  * duplicate a user defined key
+ * - both keys' semaphores are locked against further modification
+ * - the new key cannot yet be accessed
  */
 static int user_duplicate(struct key *key, const struct key *source)
 {
+	struct user_key_payload *upayload, *spayload;
 	int ret;
 
 	/* just copy the payload */
 	ret = -ENOMEM;
-	key->payload.data = kmalloc(source->datalen, GFP_KERNEL);
+	upayload = kmalloc(sizeof(*upayload) + source->datalen, GFP_KERNEL);
+	if (upayload) {
+		spayload = rcu_dereference(source->payload.data);
+		BUG_ON(source->datalen != spayload->datalen);
 
-	if (key->payload.data) {
-		key->datalen = source->datalen;
-		memcpy(key->payload.data, source->payload.data, source->datalen);
+		upayload->datalen = key->datalen = spayload->datalen;
+		memcpy(upayload->data, spayload->data, key->datalen);
+
+		key->payload.data = upayload;
 		ret = 0;
 	}
 
@@ -94,42 +110,56 @@ static int user_duplicate(struct key *key, const struct key *source)
 
 } /* end user_duplicate() */
 
+/*****************************************************************************/
+/*
+ * dispose of the old data from an updated user defined key
+ */
+static void user_update_rcu_disposal(struct rcu_head *rcu)
+{
+	struct user_key_payload *upayload;
+
+	upayload = container_of(rcu, struct user_key_payload, rcu);
+
+	kfree(upayload);
+
+} /* end user_update_rcu_disposal() */
+
 /*****************************************************************************/
 /*
  * update a user defined key
+ * - the key's semaphore is write-locked
  */
 static int user_update(struct key *key, const void *data, size_t datalen)
 {
-	void *new, *zap;
+	struct user_key_payload *upayload, *zap;
 	int ret;
 
 	ret = -EINVAL;
 	if (datalen <= 0 || datalen > 32767 || !data)
 		goto error;
 
-	/* copy the data */
+	/* construct a replacement payload */
 	ret = -ENOMEM;
-	new = kmalloc(datalen, GFP_KERNEL);
-	if (!new)
+	upayload = kmalloc(sizeof(*upayload) + datalen, GFP_KERNEL);
+	if (!upayload)
 		goto error;
 
-	memcpy(new, data, datalen);
+	upayload->datalen = datalen;
+	memcpy(upayload->data, data, datalen);
 
 	/* check the quota and attach the new data */
-	zap = new;
-	write_lock(&key->lock);
+	zap = upayload;
 
 	ret = key_payload_reserve(key, datalen);
 
 	if (ret == 0) {
 		/* attach the new data, displacing the old */
 		zap = key->payload.data;
-		key->payload.data = new;
+		rcu_assign_pointer(key->payload.data, upayload);
 		key->expiry = 0;
 	}
 
-	write_unlock(&key->lock);
-	kfree(zap);
+	call_rcu(&zap->rcu, user_update_rcu_disposal);
 
  error:
 	return ret;
@@ -152,13 +182,15 @@ static int user_match(const struct key *key, const void *description)
  */
 static void user_destroy(struct key *key)
 {
-	kfree(key->payload.data);
+	struct user_key_payload *upayload = key->payload.data;
+
+	kfree(upayload);
 
 } /* end user_destroy() */
 
 /*****************************************************************************/
 /*
- * describe the user
+ * describe the user key
  */
 static void user_describe(const struct key *key, struct seq_file *m)
 {
@@ -171,18 +203,23 @@ static void user_describe(const struct key *key, struct seq_file *m)
 /*****************************************************************************/
 /*
  * read the key data
+ * - the key's semaphore is read-locked
  */
 static long user_read(const struct key *key,
 		      char __user *buffer, size_t buflen)
 {
-	long ret = key->datalen;
+	struct user_key_payload *upayload;
+	long ret;
+
+	upayload = rcu_dereference(key->payload.data);
+	ret = upayload->datalen;
 
 	/* we can return the data as is */
 	if (buffer && buflen > 0) {
-		if (buflen > key->datalen)
-			buflen = key->datalen;
+		if (buflen > upayload->datalen)
+			buflen = upayload->datalen;
 
-		if (copy_to_user(buffer, key->payload.data, buflen) != 0)
+		if (copy_to_user(buffer, upayload->data, buflen) != 0)
 			ret = -EFAULT;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From 7888e7ff4ee579442128d7d12a9c9dbf2cf7de6a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 23 Jun 2005 22:00:51 -0700
Subject: [PATCH] Keys: Pass session keyring to call_usermodehelper()

The attached patch makes it possible to pass a session keyring through to the
process spawned by call_usermodehelper().  This allows patch 3/3 to pass an
authorisation key through to /sbin/request-key, thus permitting better access
controls when doing just-in-time key creation.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/key.h         | 10 +++++++++-
 include/linux/kmod.h        | 13 ++++++++++++-
 kernel/kmod.c               | 17 +++++++++++++----
 security/keys/request_key.c |  2 +-
 4 files changed, 35 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/key.h b/include/linux/key.h
index 2c24ffaca86f..2bfbf88d2740 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -273,14 +273,22 @@ extern void key_fsuid_changed(struct task_struct *tsk);
 extern void key_fsgid_changed(struct task_struct *tsk);
 extern void key_init(void);
 
+#define __install_session_keyring(tsk, keyring)			\
+({								\
+	struct key *old_session = tsk->signal->session_keyring;	\
+	tsk->signal->session_keyring = keyring;			\
+	old_session;						\
+})
+
 #else /* CONFIG_KEYS */
 
 #define key_validate(k)			0
 #define key_serial(k)			0
-#define key_get(k) 			NULL
+#define key_get(k) 			({ NULL; })
 #define key_put(k)			do { } while(0)
 #define alloc_uid_keyring(u)		0
 #define switch_uid_keyring(u)		do { } while(0)
+#define __install_session_keyring(t, k)	({ NULL; })
 #define copy_keys(f,t)			0
 #define copy_thread_group_keys(t)	0
 #define exit_keys(t)			do { } while(0)
diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index 95d0e4b0814d..e4a231549407 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -19,6 +19,7 @@
  *      Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+#include <linux/stddef.h>
 #include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/compiler.h>
@@ -34,7 +35,17 @@ static inline int request_module(const char * name, ...) { return -ENOSYS; }
 #endif
 
 #define try_then_request_module(x, mod...) ((x) ?: (request_module(mod), (x)))
-extern int call_usermodehelper(char *path, char *argv[], char *envp[], int wait);
+
+struct key;
+extern int call_usermodehelper_keys(char *path, char *argv[], char *envp[],
+				    struct key *session_keyring, int wait);
+
+static inline int
+call_usermodehelper(char *path, char **argv, char **envp, int wait)
+{
+	return call_usermodehelper_keys(path, argv, envp, NULL, wait);
+}
+
 extern void usermodehelper_init(void);
 
 #endif /* __LINUX_KMOD_H__ */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index eed53d4f5230..44166e3bb8af 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -120,6 +120,7 @@ struct subprocess_info {
 	char *path;
 	char **argv;
 	char **envp;
+	struct key *ring;
 	int wait;
 	int retval;
 };
@@ -130,16 +131,21 @@ struct subprocess_info {
 static int ____call_usermodehelper(void *data)
 {
 	struct subprocess_info *sub_info = data;
+	struct key *old_session;
 	int retval;
 
-	/* Unblock all signals. */
+	/* Unblock all signals and set the session keyring. */
+	key_get(sub_info->ring);
 	flush_signals(current);
 	spin_lock_irq(&current->sighand->siglock);
+	old_session = __install_session_keyring(current, sub_info->ring);
 	flush_signal_handlers(current, 1);
 	sigemptyset(&current->blocked);
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 
+	key_put(old_session);
+
 	/* We can run anywhere, unlike our parent keventd(). */
 	set_cpus_allowed(current, CPU_MASK_ALL);
 
@@ -211,10 +217,11 @@ static void __call_usermodehelper(void *data)
 }
 
 /**
- * call_usermodehelper - start a usermode application
+ * call_usermodehelper_keys - start a usermode application
  * @path: pathname for the application
  * @argv: null-terminated argument list
  * @envp: null-terminated environment list
+ * @session_keyring: session keyring for process (NULL for an empty keyring)
  * @wait: wait for the application to finish and return status.
  *
  * Runs a user-space application.  The application is started
@@ -224,7 +231,8 @@ static void __call_usermodehelper(void *data)
  * Must be called from process context.  Returns a negative error code
  * if program was not execed successfully, or 0.
  */
-int call_usermodehelper(char *path, char **argv, char **envp, int wait)
+int call_usermodehelper_keys(char *path, char **argv, char **envp,
+			     struct key *session_keyring, int wait)
 {
 	DECLARE_COMPLETION(done);
 	struct subprocess_info sub_info = {
@@ -232,6 +240,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
 		.path		= path,
 		.argv		= argv,
 		.envp		= envp,
+		.ring		= session_keyring,
 		.wait		= wait,
 		.retval		= 0,
 	};
@@ -247,7 +256,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
 	wait_for_completion(&done);
 	return sub_info.retval;
 }
-EXPORT_SYMBOL(call_usermodehelper);
+EXPORT_SYMBOL(call_usermodehelper_keys);
 
 void __init usermodehelper_init(void)
 {
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 1f6c0940297f..1919540f047d 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -88,7 +88,7 @@ static int call_request_key(struct key *key,
 	argv[i] = NULL;
 
 	/* do it */
-	return call_usermodehelper(argv[0], argv, envp, 1);
+	return call_usermodehelper_keys(argv[0], argv, envp, NULL, 1);
 
 } /* end call_request_key() */
 
-- 
cgit v1.2.3-59-g8ed1b


From 3e30148c3d524a9c1c63ca28261bc24c457eb07a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 23 Jun 2005 22:00:56 -0700
Subject: [PATCH] Keys: Make request-key create an authorisation key

The attached patch makes the following changes:

 (1) There's a new special key type called ".request_key_auth".

     This is an authorisation key for when one process requests a key and
     another process is started to construct it. This type of key cannot be
     created by the user; nor can it be requested by kernel services.

     Authorisation keys hold two references:

     (a) One refers to the key being constructed. When the key being
     	 constructed is instantiated the authorisation key is revoked,
     	 rendering it of no further use.

     (b) The "authorising process". This is either:

     	 (i) the process that called request_key(), or:

     	 (ii) if the process that called request_key() itself had an
     	      authorisation key in its session keyring, then the authorising
     	      process referred to by that authorisation key will also be
     	      referred to by the new authorisation key.

	 This means that the process that initiated a chain of key requests
	 will authorise the lot of them, and will, by default, wind up with
	 the keys obtained from them in its keyrings.

 (2) request_key() creates an authorisation key which is then passed to
     /sbin/request-key as part of a new session keyring.

 (3) When request_key() is searching for a key to hand back to the caller, if
     it comes across an authorisation key in the session keyring of the
     calling process, it will also search the keyrings of the process
     specified therein and it will use the specified process's credentials
     (fsuid, fsgid, groups) to do that rather than the calling process's
     credentials.

     This allows a process started by /sbin/request-key to find keys belonging
     to the authorising process.

 (4) A key can be read, even if the process executing KEYCTL_READ doesn't have
     direct read or search permission, if that key is contained within the
     keyrings of a process specified by an authorisation key found within the
     calling process's session keyring, and is searchable using the
     credentials of the authorising process.

     This allows a process started by /sbin/request-key to read keys belonging
     to the authorising process.

 (5) The magic KEY_SPEC_*_KEYRING key IDs when passed to KEYCTL_INSTANTIATE or
     KEYCTL_NEGATE will specify a keyring of the authorising process, rather
     than the process doing the instantiation.

 (6) One of the process keyrings can be nominated as the default to which
     request_key() should attach new keys if not otherwise specified. This is
     done with KEYCTL_SET_REQKEY_KEYRING and one of the KEY_REQKEY_DEFL_*
     constants. The current setting can also be read using this call.

 (7) request_key() is partially interruptible. If it is waiting for another
     process to finish constructing a key, it can be interrupted. This permits
     a request-key cycle to be broken without recourse to rebooting.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-Off-By: Benoit Boissinot <benoit.boissinot@ens-lyon.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/keys.txt           |  34 ++++++++
 include/linux/key-ui.h           |  41 ++++++++-
 include/linux/key.h              |   9 +-
 include/linux/keyctl.h           |  11 +++
 include/linux/sched.h            |   8 +-
 kernel/sys.c                     |   2 +-
 security/keys/Makefile           |   5 +-
 security/keys/compat.c           |   7 +-
 security/keys/internal.h         |  45 +++++++++-
 security/keys/key.c              |  24 ++++--
 security/keys/keyctl.c           | 176 ++++++++++++++++++++++++-------------
 security/keys/keyring.c          |  67 ++++++++++++--
 security/keys/process_keys.c     | 179 +++++++++++++++++++++++---------------
 security/keys/request_key.c      | 182 ++++++++++++++++++++++++++++++++-------
 security/keys/request_key_auth.c | 180 ++++++++++++++++++++++++++++++++++++++
 15 files changed, 779 insertions(+), 191 deletions(-)
 create mode 100644 security/keys/request_key_auth.c

(limited to 'include/linux')

diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 3df40c1fe15a..0321ded4b9ae 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -591,6 +591,37 @@ The keyctl syscall functions are:
      this case too.
 
 
+ (*) Set the default request-key destination keyring.
+
+	long keyctl(KEYCTL_SET_REQKEY_KEYRING, int reqkey_defl);
+
+     This sets the default keyring to which implicitly requested keys will be
+     attached for this thread. reqkey_defl should be one of these constants:
+
+	CONSTANT				VALUE	NEW DEFAULT KEYRING
+	======================================	======	=======================
+	KEY_REQKEY_DEFL_NO_CHANGE		-1	No change
+	KEY_REQKEY_DEFL_DEFAULT			0	Default[1]
+	KEY_REQKEY_DEFL_THREAD_KEYRING		1	Thread keyring
+	KEY_REQKEY_DEFL_PROCESS_KEYRING		2	Process keyring
+	KEY_REQKEY_DEFL_SESSION_KEYRING		3	Session keyring
+	KEY_REQKEY_DEFL_USER_KEYRING		4	User keyring
+	KEY_REQKEY_DEFL_USER_SESSION_KEYRING	5	User session keyring
+	KEY_REQKEY_DEFL_GROUP_KEYRING		6	Group keyring
+
+     The old default will be returned if successful and error EINVAL will be
+     returned if reqkey_defl is not one of the above values.
+
+     The default keyring can be overridden by the keyring indicated to the
+     request_key() system call.
+
+     Note that this setting is inherited across fork/exec.
+
+     [1] The default default is: the thread keyring if there is one, otherwise
+     the process keyring if there is one, otherwise the session keyring if
+     there is one, otherwise the user default session keyring.
+
+
 ===============
 KERNEL SERVICES
 ===============
@@ -626,6 +657,9 @@ payload contents" for more information.
     Should the function fail error ENOKEY, EKEYEXPIRED or EKEYREVOKED will be
     returned.
 
+    If successful, the key will have been attached to the default keyring for
+    implicitly obtained request-key keys, as set by KEYCTL_SET_REQKEY_KEYRING.
+
 
 (*) When it is no longer required, the key should be released using:
 
diff --git a/include/linux/key-ui.h b/include/linux/key-ui.h
index 159ca8d54e9a..cc326174a808 100644
--- a/include/linux/key-ui.h
+++ b/include/linux/key-ui.h
@@ -1,4 +1,4 @@
-/* key-ui.h: key userspace interface stuff for use by keyfs
+/* key-ui.h: key userspace interface stuff
  *
  * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -84,8 +84,45 @@ static inline int key_any_permission(const struct key *key, key_perm_t perm)
 	return kperm != 0;
 }
 
+static inline int key_task_groups_search(struct task_struct *tsk, gid_t gid)
+{
+	int ret;
+
+	task_lock(tsk);
+	ret = groups_search(tsk->group_info, gid);
+	task_unlock(tsk);
+	return ret;
+}
+
+static inline int key_task_permission(const struct key *key,
+				      struct task_struct *context,
+				      key_perm_t perm)
+{
+	key_perm_t kperm;
+
+	if (key->uid == context->fsuid) {
+		kperm = key->perm >> 16;
+	}
+	else if (key->gid != -1 &&
+		 key->perm & KEY_GRP_ALL && (
+			 key->gid == context->fsgid ||
+			 key_task_groups_search(context, key->gid)
+			 )
+		 ) {
+		kperm = key->perm >> 8;
+	}
+	else {
+		kperm = key->perm;
+	}
+
+	kperm = kperm & perm & KEY_ALL;
+
+	return kperm == perm;
+
+}
 
-extern struct key *lookup_user_key(key_serial_t id, int create, int part,
+extern struct key *lookup_user_key(struct task_struct *context,
+				   key_serial_t id, int create, int partial,
 				   key_perm_t perm);
 
 extern long join_session_keyring(const char *name);
diff --git a/include/linux/key.h b/include/linux/key.h
index 2bfbf88d2740..970bbd916cf4 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -199,10 +199,12 @@ extern int key_payload_reserve(struct key *key, size_t datalen);
 extern int key_instantiate_and_link(struct key *key,
 				    const void *data,
 				    size_t datalen,
-				    struct key *keyring);
+				    struct key *keyring,
+				    struct key *instkey);
 extern int key_negate_and_link(struct key *key,
 			       unsigned timeout,
-			       struct key *keyring);
+			       struct key *keyring,
+			       struct key *instkey);
 extern void key_revoke(struct key *key);
 extern void key_put(struct key *key);
 
@@ -245,9 +247,6 @@ extern struct key *keyring_search(struct key *keyring,
 				  struct key_type *type,
 				  const char *description);
 
-extern struct key *search_process_keyrings(struct key_type *type,
-					   const char *description);
-
 extern int keyring_add_key(struct key *keyring,
 			   struct key *key);
 
diff --git a/include/linux/keyctl.h b/include/linux/keyctl.h
index 381dedc370a3..8d7c59a29e09 100644
--- a/include/linux/keyctl.h
+++ b/include/linux/keyctl.h
@@ -20,6 +20,16 @@
 #define KEY_SPEC_USER_SESSION_KEYRING	-5	/* - key ID for UID-session keyring */
 #define KEY_SPEC_GROUP_KEYRING		-6	/* - key ID for GID-specific keyring */
 
+/* request-key default keyrings */
+#define KEY_REQKEY_DEFL_NO_CHANGE		-1
+#define KEY_REQKEY_DEFL_DEFAULT			0
+#define KEY_REQKEY_DEFL_THREAD_KEYRING		1
+#define KEY_REQKEY_DEFL_PROCESS_KEYRING		2
+#define KEY_REQKEY_DEFL_SESSION_KEYRING		3
+#define KEY_REQKEY_DEFL_USER_KEYRING		4
+#define KEY_REQKEY_DEFL_USER_SESSION_KEYRING	5
+#define KEY_REQKEY_DEFL_GROUP_KEYRING		6
+
 /* keyctl commands */
 #define KEYCTL_GET_KEYRING_ID		0	/* ask for a keyring's ID */
 #define KEYCTL_JOIN_SESSION_KEYRING	1	/* join or start named session keyring */
@@ -35,5 +45,6 @@
 #define KEYCTL_READ			11	/* read a key or keyring's contents */
 #define KEYCTL_INSTANTIATE		12	/* instantiate a partially constructed key */
 #define KEYCTL_NEGATE			13	/* negate a partially constructed key */
+#define KEYCTL_SET_REQKEY_KEYRING	14	/* set default request-key keyring */
 
 #endif /*  _LINUX_KEYCTL_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 901742f92389..2c69682b0444 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -561,9 +561,10 @@ struct group_info {
 		groups_free(group_info); \
 } while (0)
 
-struct group_info *groups_alloc(int gidsetsize);
-void groups_free(struct group_info *group_info);
-int set_current_groups(struct group_info *group_info);
+extern struct group_info *groups_alloc(int gidsetsize);
+extern void groups_free(struct group_info *group_info);
+extern int set_current_groups(struct group_info *group_info);
+extern int groups_search(struct group_info *group_info, gid_t grp);
 /* access the groups "array" with this macro */
 #define GROUP_AT(gi, i) \
     ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
@@ -660,6 +661,7 @@ struct task_struct {
 	struct user_struct *user;
 #ifdef CONFIG_KEYS
 	struct key *thread_keyring;	/* keyring private to this thread */
+	unsigned char jit_keyring;	/* default keyring to attach requested keys to */
 #endif
 	int oomkilladj; /* OOM kill score adjustment (bit shift). */
 	char comm[TASK_COMM_LEN]; /* executable name excluding path
diff --git a/kernel/sys.c b/kernel/sys.c
index 5a9d6b075016..da24bc1292db 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1259,7 +1259,7 @@ static void groups_sort(struct group_info *group_info)
 }
 
 /* a simple bsearch */
-static int groups_search(struct group_info *group_info, gid_t grp)
+int groups_search(struct group_info *group_info, gid_t grp)
 {
 	int left, right;
 
diff --git a/security/keys/Makefile b/security/keys/Makefile
index ddb495d65062..c392d750b208 100644
--- a/security/keys/Makefile
+++ b/security/keys/Makefile
@@ -7,8 +7,9 @@ obj-y := \
 	keyring.o \
 	keyctl.o \
 	process_keys.o \
-	user_defined.o \
-	request_key.o
+	request_key.o \
+	request_key_auth.o \
+	user_defined.o
 
 obj-$(CONFIG_KEYS_COMPAT) += compat.o
 obj-$(CONFIG_PROC_FS) += proc.o
diff --git a/security/keys/compat.c b/security/keys/compat.c
index aff8b22dcb5c..3303673c636e 100644
--- a/security/keys/compat.c
+++ b/security/keys/compat.c
@@ -1,6 +1,6 @@
 /* compat.c: 32-bit compatibility syscall for 64-bit systems
  *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -24,7 +24,7 @@
  * - if you can, you should call sys_keyctl directly
  */
 asmlinkage long compat_sys_keyctl(u32 option,
-			      u32 arg2, u32 arg3, u32 arg4, u32 arg5)
+				  u32 arg2, u32 arg3, u32 arg4, u32 arg5)
 {
 	switch (option) {
 	case KEYCTL_GET_KEYRING_ID:
@@ -71,6 +71,9 @@ asmlinkage long compat_sys_keyctl(u32 option,
 	case KEYCTL_NEGATE:
 		return keyctl_negate_key(arg2, arg3, arg4);
 
+	case KEYCTL_SET_REQKEY_KEYRING:
+		return keyctl_set_reqkey_keyring(arg2);
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 67b2b93a7489..46c8602661c9 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -1,6 +1,6 @@
 /* internal.h: authentication token and access key management internal defs
  *
- * Copyright (C) 2003 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2003-5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -15,6 +15,16 @@
 #include <linux/key.h>
 #include <linux/key-ui.h>
 
+#if 0
+#define kenter(FMT, a...)	printk("==> %s("FMT")\n",__FUNCTION__ , ## a)
+#define kleave(FMT, a...)	printk("<== %s()"FMT"\n",__FUNCTION__ , ## a)
+#define kdebug(FMT, a...)	printk(FMT"\n" , ## a)
+#else
+#define kenter(FMT, a...)	do {} while(0)
+#define kleave(FMT, a...)	do {} while(0)
+#define kdebug(FMT, a...)	do {} while(0)
+#endif
+
 extern struct key_type key_type_dead;
 extern struct key_type key_type_user;
 
@@ -66,20 +76,46 @@ extern struct key *__keyring_search_one(struct key *keyring,
 					const char *description,
 					key_perm_t perm);
 
+extern struct key *keyring_search_instkey(struct key *keyring,
+					  key_serial_t target_id);
+
 typedef int (*key_match_func_t)(const struct key *, const void *);
 
 extern struct key *keyring_search_aux(struct key *keyring,
+				      struct task_struct *tsk,
 				      struct key_type *type,
 				      const void *description,
 				      key_match_func_t match);
 
-extern struct key *search_process_keyrings_aux(struct key_type *type,
-					       const void *description,
-					       key_match_func_t match);
+extern struct key *search_process_keyrings(struct key_type *type,
+					   const void *description,
+					   key_match_func_t match,
+					   struct task_struct *tsk);
 
 extern struct key *find_keyring_by_name(const char *name, key_serial_t bound);
 
 extern int install_thread_keyring(struct task_struct *tsk);
+extern int install_process_keyring(struct task_struct *tsk);
+
+extern struct key *request_key_and_link(struct key_type *type,
+					const char *description,
+					const char *callout_info,
+					struct key *dest_keyring);
+
+/*
+ * request_key authorisation
+ */
+struct request_key_auth {
+	struct key		*target_key;
+	struct task_struct	*context;
+	pid_t			pid;
+};
+
+extern struct key_type key_type_request_key_auth;
+extern struct key *request_key_auth_new(struct key *target,
+					struct key **_rkakey);
+
+extern struct key *key_get_instantiation_authkey(key_serial_t target_id);
 
 /*
  * keyctl functions
@@ -100,6 +136,7 @@ extern long keyctl_setperm_key(key_serial_t, key_perm_t);
 extern long keyctl_instantiate_key(key_serial_t, const void __user *,
 				   size_t, key_serial_t);
 extern long keyctl_negate_key(key_serial_t, unsigned, key_serial_t);
+extern long keyctl_set_reqkey_keyring(int);
 
 
 /*
diff --git a/security/keys/key.c b/security/keys/key.c
index 1fdfccb3fe43..3304d37bb379 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -1,6 +1,6 @@
 /* key.c: basic authentication token and access key management
  *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -391,7 +391,8 @@ EXPORT_SYMBOL(key_payload_reserve);
 static int __key_instantiate_and_link(struct key *key,
 				      const void *data,
 				      size_t datalen,
-				      struct key *keyring)
+				      struct key *keyring,
+				      struct key *instkey)
 {
 	int ret, awaken;
 
@@ -419,6 +420,10 @@ static int __key_instantiate_and_link(struct key *key,
 			/* and link it into the destination keyring */
 			if (keyring)
 				ret = __key_link(keyring, key);
+
+			/* disable the authorisation key */
+			if (instkey)
+				key_revoke(instkey);
 		}
 	}
 
@@ -439,19 +444,21 @@ static int __key_instantiate_and_link(struct key *key,
 int key_instantiate_and_link(struct key *key,
 			     const void *data,
 			     size_t datalen,
-			     struct key *keyring)
+			     struct key *keyring,
+			     struct key *instkey)
 {
 	int ret;
 
 	if (keyring)
 		down_write(&keyring->sem);
 
-	ret = __key_instantiate_and_link(key, data, datalen, keyring);
+	ret = __key_instantiate_and_link(key, data, datalen, keyring, instkey);
 
 	if (keyring)
 		up_write(&keyring->sem);
 
 	return ret;
+
 } /* end key_instantiate_and_link() */
 
 EXPORT_SYMBOL(key_instantiate_and_link);
@@ -462,7 +469,8 @@ EXPORT_SYMBOL(key_instantiate_and_link);
  */
 int key_negate_and_link(struct key *key,
 			unsigned timeout,
-			struct key *keyring)
+			struct key *keyring,
+			struct key *instkey)
 {
 	struct timespec now;
 	int ret, awaken;
@@ -495,6 +503,10 @@ int key_negate_and_link(struct key *key,
 		/* and link it into the destination keyring */
 		if (keyring)
 			ret = __key_link(keyring, key);
+
+		/* disable the authorisation key */
+		if (instkey)
+			key_revoke(instkey);
 	}
 
 	up_write(&key_construction_sem);
@@ -781,7 +793,7 @@ struct key *key_create_or_update(struct key *keyring,
 	}
 
 	/* instantiate it and link it into the target keyring */
-	ret = __key_instantiate_and_link(key, payload, plen, keyring);
+	ret = __key_instantiate_and_link(key, payload, plen, keyring, NULL);
 	if (ret < 0) {
 		key_put(key);
 		key = ERR_PTR(ret);
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index cedb7326de29..fea262860ea0 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1,6 +1,6 @@
 /* keyctl.c: userspace keyctl operations
  *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -49,6 +49,13 @@ asmlinkage long sys_add_key(const char __user *_type,
 		goto error;
 	type[31] = '\0';
 
+	if (!type[0])
+		goto error;
+
+	ret = -EPERM;
+	if (type[0] == '.')
+		goto error;
+
 	ret = -EFAULT;
 	dlen = strnlen_user(_description, PAGE_SIZE - 1);
 	if (dlen <= 0)
@@ -82,7 +89,7 @@ asmlinkage long sys_add_key(const char __user *_type,
 	}
 
 	/* find the target keyring (which must be writable) */
-	keyring = lookup_user_key(ringid, 1, 0, KEY_WRITE);
+	keyring = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error3;
@@ -181,7 +188,7 @@ asmlinkage long sys_request_key(const char __user *_type,
 	/* get the destination keyring if specified */
 	dest = NULL;
 	if (destringid) {
-		dest = lookup_user_key(destringid, 1, 0, KEY_WRITE);
+		dest = lookup_user_key(NULL, destringid, 1, 0, KEY_WRITE);
 		if (IS_ERR(dest)) {
 			ret = PTR_ERR(dest);
 			goto error3;
@@ -196,23 +203,15 @@ asmlinkage long sys_request_key(const char __user *_type,
 	}
 
 	/* do the search */
-	key = request_key(ktype, description, callout_info);
+	key = request_key_and_link(ktype, description, callout_info, dest);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error5;
 	}
 
-	/* link the resulting key to the destination keyring */
-	if (dest) {
-		ret = key_link(dest, key);
-		if (ret < 0)
-			goto error6;
-	}
-
 	ret = key->serial;
 
- error6:
-	key_put(key);
+ 	key_put(key);
  error5:
 	key_type_put(ktype);
  error4:
@@ -237,7 +236,7 @@ long keyctl_get_keyring_ID(key_serial_t id, int create)
 	struct key *key;
 	long ret;
 
-	key = lookup_user_key(id, create, 0, KEY_SEARCH);
+	key = lookup_user_key(NULL, id, create, 0, KEY_SEARCH);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error;
@@ -324,7 +323,7 @@ long keyctl_update_key(key_serial_t id,
 	}
 
 	/* find the target key (which must be writable) */
-	key = lookup_user_key(id, 0, 0, KEY_WRITE);
+	key = lookup_user_key(NULL, id, 0, 0, KEY_WRITE);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error2;
@@ -352,7 +351,7 @@ long keyctl_revoke_key(key_serial_t id)
 	struct key *key;
 	long ret;
 
-	key = lookup_user_key(id, 0, 0, KEY_WRITE);
+	key = lookup_user_key(NULL, id, 0, 0, KEY_WRITE);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error;
@@ -378,7 +377,7 @@ long keyctl_keyring_clear(key_serial_t ringid)
 	struct key *keyring;
 	long ret;
 
-	keyring = lookup_user_key(ringid, 1, 0, KEY_WRITE);
+	keyring = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error;
@@ -404,13 +403,13 @@ long keyctl_keyring_link(key_serial_t id, key_serial_t ringid)
 	struct key *keyring, *key;
 	long ret;
 
-	keyring = lookup_user_key(ringid, 1, 0, KEY_WRITE);
+	keyring = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error;
 	}
 
-	key = lookup_user_key(id, 1, 0, KEY_LINK);
+	key = lookup_user_key(NULL, id, 1, 0, KEY_LINK);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error2;
@@ -438,13 +437,13 @@ long keyctl_keyring_unlink(key_serial_t id, key_serial_t ringid)
 	struct key *keyring, *key;
 	long ret;
 
-	keyring = lookup_user_key(ringid, 0, 0, KEY_WRITE);
+	keyring = lookup_user_key(NULL, ringid, 0, 0, KEY_WRITE);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error;
 	}
 
-	key = lookup_user_key(id, 0, 0, 0);
+	key = lookup_user_key(NULL, id, 0, 0, 0);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error2;
@@ -475,16 +474,29 @@ long keyctl_describe_key(key_serial_t keyid,
 			 char __user *buffer,
 			 size_t buflen)
 {
-	struct key *key;
+	struct key *key, *instkey;
 	char *tmpbuf;
 	long ret;
 
-	key = lookup_user_key(keyid, 0, 1, KEY_VIEW);
+	key = lookup_user_key(NULL, keyid, 0, 1, KEY_VIEW);
 	if (IS_ERR(key)) {
+		/* viewing a key under construction is permitted if we have the
+		 * authorisation token handy */
+		if (PTR_ERR(key) == -EACCES) {
+			instkey = key_get_instantiation_authkey(keyid);
+			if (!IS_ERR(instkey)) {
+				key_put(instkey);
+				key = lookup_user_key(NULL, keyid, 0, 1, 0);
+				if (!IS_ERR(key))
+					goto okay;
+			}
+		}
+
 		ret = PTR_ERR(key);
 		goto error;
 	}
 
+okay:
 	/* calculate how much description we're going to return */
 	ret = -ENOMEM;
 	tmpbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
@@ -568,7 +580,7 @@ long keyctl_keyring_search(key_serial_t ringid,
 		goto error2;
 
 	/* get the keyring at which to begin the search */
-	keyring = lookup_user_key(ringid, 0, 0, KEY_SEARCH);
+	keyring = lookup_user_key(NULL, ringid, 0, 0, KEY_SEARCH);
 	if (IS_ERR(keyring)) {
 		ret = PTR_ERR(keyring);
 		goto error2;
@@ -577,7 +589,7 @@ long keyctl_keyring_search(key_serial_t ringid,
 	/* get the destination keyring if specified */
 	dest = NULL;
 	if (destringid) {
-		dest = lookup_user_key(destringid, 1, 0, KEY_WRITE);
+		dest = lookup_user_key(NULL, destringid, 1, 0, KEY_WRITE);
 		if (IS_ERR(dest)) {
 			ret = PTR_ERR(dest);
 			goto error3;
@@ -656,24 +668,23 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen)
 	long ret;
 
 	/* find the key first */
-	key = lookup_user_key(keyid, 0, 0, 0);
+	key = lookup_user_key(NULL, keyid, 0, 0, 0);
 	if (!IS_ERR(key)) {
 		/* see if we can read it directly */
 		if (key_permission(key, KEY_READ))
 			goto can_read_key;
 
-		/* can't; see if it's searchable from this process's
-		 * keyrings */
-		ret = -ENOKEY;
-		if (key_permission(key, KEY_SEARCH)) {
-			/* okay - we do have search permission on the key
-			 * itself, but do we have the key? */
-			skey = search_process_keyrings_aux(key->type, key,
-							   keyctl_read_key_same);
-			if (!IS_ERR(skey))
-				goto can_read_key2;
-		}
-
+		/* we can't; see if it's searchable from this process's
+		 * keyrings
+		 * - we automatically take account of the fact that it may be
+		 *   dangling off an instantiation key
+		 */
+		skey = search_process_keyrings(key->type, key,
+					       keyctl_read_key_same, current);
+		if (!IS_ERR(skey))
+			goto can_read_key2;
+
+		ret = PTR_ERR(skey);
 		goto error2;
 	}
 
@@ -719,7 +730,7 @@ long keyctl_chown_key(key_serial_t id, uid_t uid, gid_t gid)
 	if (uid == (uid_t) -1 && gid == (gid_t) -1)
 		goto error;
 
-	key = lookup_user_key(id, 1, 1, 0);
+	key = lookup_user_key(NULL, id, 1, 1, 0);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error;
@@ -776,7 +787,7 @@ long keyctl_setperm_key(key_serial_t id, key_perm_t perm)
 	if (perm & ~(KEY_USR_ALL | KEY_GRP_ALL | KEY_OTH_ALL))
 		goto error;
 
-	key = lookup_user_key(id, 1, 1, 0);
+	key = lookup_user_key(NULL, id, 1, 1, 0);
 	if (IS_ERR(key)) {
 		ret = PTR_ERR(key);
 		goto error;
@@ -809,7 +820,8 @@ long keyctl_instantiate_key(key_serial_t id,
 			    size_t plen,
 			    key_serial_t ringid)
 {
-	struct key *key, *keyring;
+	struct request_key_auth *rka;
+	struct key *instkey, *keyring;
 	void *payload;
 	long ret;
 
@@ -831,18 +843,21 @@ long keyctl_instantiate_key(key_serial_t id,
 			goto error2;
 	}
 
-	/* find the target key (which must be writable) */
-	key = lookup_user_key(id, 0, 1, KEY_WRITE);
-	if (IS_ERR(key)) {
-		ret = PTR_ERR(key);
+	/* find the instantiation authorisation key */
+	instkey = key_get_instantiation_authkey(id);
+	if (IS_ERR(instkey)) {
+		ret = PTR_ERR(instkey);
 		goto error2;
 	}
 
-	/* find the destination keyring if present (which must also be
-	 * writable) */
+	rka = instkey->payload.data;
+
+	/* find the destination keyring amongst those belonging to the
+	 * requesting task */
 	keyring = NULL;
 	if (ringid) {
-		keyring = lookup_user_key(ringid, 1, 0, KEY_WRITE);
+		keyring = lookup_user_key(rka->context, ringid, 1, 0,
+					  KEY_WRITE);
 		if (IS_ERR(keyring)) {
 			ret = PTR_ERR(keyring);
 			goto error3;
@@ -850,11 +865,12 @@ long keyctl_instantiate_key(key_serial_t id,
 	}
 
 	/* instantiate the key and link it into a keyring */
-	ret = key_instantiate_and_link(key, payload, plen, keyring);
+	ret = key_instantiate_and_link(rka->target_key, payload, plen,
+				       keyring, instkey);
 
 	key_put(keyring);
  error3:
-	key_put(key);
+	key_put(instkey);
  error2:
 	kfree(payload);
  error:
@@ -869,21 +885,24 @@ long keyctl_instantiate_key(key_serial_t id,
  */
 long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 {
-	struct key *key, *keyring;
+	struct request_key_auth *rka;
+	struct key *instkey, *keyring;
 	long ret;
 
-	/* find the target key (which must be writable) */
-	key = lookup_user_key(id, 0, 1, KEY_WRITE);
-	if (IS_ERR(key)) {
-		ret = PTR_ERR(key);
+	/* find the instantiation authorisation key */
+	instkey = key_get_instantiation_authkey(id);
+	if (IS_ERR(instkey)) {
+		ret = PTR_ERR(instkey);
 		goto error;
 	}
 
+	rka = instkey->payload.data;
+
 	/* find the destination keyring if present (which must also be
 	 * writable) */
 	keyring = NULL;
 	if (ringid) {
-		keyring = lookup_user_key(ringid, 1, 0, KEY_WRITE);
+		keyring = lookup_user_key(NULL, ringid, 1, 0, KEY_WRITE);
 		if (IS_ERR(keyring)) {
 			ret = PTR_ERR(keyring);
 			goto error2;
@@ -891,16 +910,54 @@ long keyctl_negate_key(key_serial_t id, unsigned timeout, key_serial_t ringid)
 	}
 
 	/* instantiate the key and link it into a keyring */
-	ret = key_negate_and_link(key, timeout, keyring);
+	ret = key_negate_and_link(rka->target_key, timeout, keyring, instkey);
 
 	key_put(keyring);
  error2:
-	key_put(key);
+	key_put(instkey);
  error:
 	return ret;
 
 } /* end keyctl_negate_key() */
 
+/*****************************************************************************/
+/*
+ * set the default keyring in which request_key() will cache keys
+ * - return the old setting
+ */
+long keyctl_set_reqkey_keyring(int reqkey_defl)
+{
+	int ret;
+
+	switch (reqkey_defl) {
+	case KEY_REQKEY_DEFL_THREAD_KEYRING:
+		ret = install_thread_keyring(current);
+		if (ret < 0)
+			return ret;
+		goto set;
+
+	case KEY_REQKEY_DEFL_PROCESS_KEYRING:
+		ret = install_process_keyring(current);
+		if (ret < 0)
+			return ret;
+
+	case KEY_REQKEY_DEFL_DEFAULT:
+	case KEY_REQKEY_DEFL_SESSION_KEYRING:
+	case KEY_REQKEY_DEFL_USER_KEYRING:
+	case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
+	set:
+		current->jit_keyring = reqkey_defl;
+
+	case KEY_REQKEY_DEFL_NO_CHANGE:
+		return current->jit_keyring;
+
+	case KEY_REQKEY_DEFL_GROUP_KEYRING:
+	default:
+		return -EINVAL;
+	}
+
+} /* end keyctl_set_reqkey_keyring() */
+
 /*****************************************************************************/
 /*
  * the key control system call
@@ -971,6 +1028,9 @@ asmlinkage long sys_keyctl(int option, unsigned long arg2, unsigned long arg3,
 					 (unsigned) arg3,
 					 (key_serial_t) arg4);
 
+	case KEYCTL_SET_REQKEY_KEYRING:
+		return keyctl_set_reqkey_keyring(arg2);
+
 	default:
 		return -EOPNOTSUPP;
 	}
diff --git a/security/keys/keyring.c b/security/keys/keyring.c
index c9a5de197487..90a551e4da66 100644
--- a/security/keys/keyring.c
+++ b/security/keys/keyring.c
@@ -1,6 +1,6 @@
 /* keyring.c: keyring handling
  *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -308,7 +308,7 @@ struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
 			    uid, gid, KEY_USR_ALL, not_in_quota);
 
 	if (!IS_ERR(keyring)) {
-		ret = key_instantiate_and_link(keyring, NULL, 0, dest);
+		ret = key_instantiate_and_link(keyring, NULL, 0, dest, NULL);
 		if (ret < 0) {
 			key_put(keyring);
 			keyring = ERR_PTR(ret);
@@ -326,11 +326,12 @@ struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid,
  * - we only find keys on which we have search permission
  * - we use the supplied match function to see if the description (or other
  *   feature of interest) matches
- * - we readlock the keyrings as we search down the tree
+ * - we rely on RCU to prevent the keyring lists from disappearing on us
  * - we return -EAGAIN if we didn't find any matching key
  * - we return -ENOKEY if we only found negative matching keys
  */
 struct key *keyring_search_aux(struct key *keyring,
+			       struct task_struct *context,
 			       struct key_type *type,
 			       const void *description,
 			       key_match_func_t match)
@@ -352,7 +353,7 @@ struct key *keyring_search_aux(struct key *keyring,
 
 	/* top keyring must have search permission to begin the search */
 	key = ERR_PTR(-EACCES);
-	if (!key_permission(keyring, KEY_SEARCH))
+	if (!key_task_permission(keyring, context, KEY_SEARCH))
 		goto error;
 
 	key = ERR_PTR(-ENOTDIR);
@@ -392,7 +393,7 @@ struct key *keyring_search_aux(struct key *keyring,
 			continue;
 
 		/* key must have search permissions */
-		if (!key_permission(key, KEY_SEARCH))
+		if (!key_task_permission(key, context, KEY_SEARCH))
 			continue;
 
 		/* we set a different error code if we find a negative key */
@@ -418,7 +419,7 @@ struct key *keyring_search_aux(struct key *keyring,
 		if (sp >= KEYRING_SEARCH_MAX_DEPTH)
 			continue;
 
-		if (!key_permission(key, KEY_SEARCH))
+		if (!key_task_permission(key, context, KEY_SEARCH))
 			continue;
 
 		/* stack the current position */
@@ -468,7 +469,11 @@ struct key *keyring_search(struct key *keyring,
 			   struct key_type *type,
 			   const char *description)
 {
-	return keyring_search_aux(keyring, type, description, type->match);
+	if (!type->match)
+		return ERR_PTR(-ENOKEY);
+
+	return keyring_search_aux(keyring, current,
+				  type, description, type->match);
 
 } /* end keyring_search() */
 
@@ -496,7 +501,8 @@ struct key *__keyring_search_one(struct key *keyring,
 			key = klist->keys[loop];
 
 			if (key->type == ktype &&
-			    key->type->match(key, description) &&
+			    (!key->type->match ||
+			     key->type->match(key, description)) &&
 			    key_permission(key, perm) &&
 			    !test_bit(KEY_FLAG_REVOKED, &key->flags)
 			    )
@@ -515,6 +521,51 @@ struct key *__keyring_search_one(struct key *keyring,
 
 } /* end __keyring_search_one() */
 
+/*****************************************************************************/
+/*
+ * search for an instantiation authorisation key matching a target key
+ * - the RCU read lock must be held by the caller
+ * - a target_id of zero specifies any valid token
+ */
+struct key *keyring_search_instkey(struct key *keyring,
+				   key_serial_t target_id)
+{
+	struct request_key_auth *rka;
+	struct keyring_list *klist;
+	struct key *instkey;
+	int loop;
+
+	klist = rcu_dereference(keyring->payload.subscriptions);
+	if (klist) {
+		for (loop = 0; loop < klist->nkeys; loop++) {
+			instkey = klist->keys[loop];
+
+			if (instkey->type != &key_type_request_key_auth)
+				continue;
+
+			rka = instkey->payload.data;
+			if (target_id && rka->target_key->serial != target_id)
+				continue;
+
+			/* the auth key is revoked during instantiation */
+			if (!test_bit(KEY_FLAG_REVOKED, &instkey->flags))
+				goto found;
+
+			instkey = ERR_PTR(-EKEYREVOKED);
+			goto error;
+		}
+	}
+
+	instkey = ERR_PTR(-EACCES);
+	goto error;
+
+found:
+	atomic_inc(&instkey->usage);
+error:
+	return instkey;
+
+} /* end keyring_search_instkey() */
+
 /*****************************************************************************/
 /*
  * find a keyring with the specified name
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 972e30172687..34db087bbcc7 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -165,7 +165,7 @@ int install_thread_keyring(struct task_struct *tsk)
 /*
  * make sure a process keyring is installed
  */
-static int install_process_keyring(struct task_struct *tsk)
+int install_process_keyring(struct task_struct *tsk)
 {
 	unsigned long flags;
 	struct key *keyring;
@@ -376,12 +376,13 @@ void key_fsgid_changed(struct task_struct *tsk)
  * - we return -EAGAIN if we didn't find any matching key
  * - we return -ENOKEY if we found only negative matching keys
  */
-struct key *search_process_keyrings_aux(struct key_type *type,
-					const void *description,
-					key_match_func_t match)
+struct key *search_process_keyrings(struct key_type *type,
+				    const void *description,
+				    key_match_func_t match,
+				    struct task_struct *context)
 {
-	struct task_struct *tsk = current;
-	struct key *key, *ret, *err;
+	struct request_key_auth *rka;
+	struct key *key, *ret, *err, *instkey;
 
 	/* we want to return -EAGAIN or -ENOKEY if any of the keyrings were
 	 * searchable, but we failed to find a key or we found a negative key;
@@ -395,9 +396,9 @@ struct key *search_process_keyrings_aux(struct key_type *type,
 	err = ERR_PTR(-EAGAIN);
 
 	/* search the thread keyring first */
-	if (tsk->thread_keyring) {
-		key = keyring_search_aux(tsk->thread_keyring, type,
-					 description, match);
+	if (context->thread_keyring) {
+		key = keyring_search_aux(context->thread_keyring,
+					 context, type, description, match);
 		if (!IS_ERR(key))
 			goto found;
 
@@ -415,9 +416,9 @@ struct key *search_process_keyrings_aux(struct key_type *type,
 	}
 
 	/* search the process keyring second */
-	if (tsk->signal->process_keyring) {
-		key = keyring_search_aux(tsk->signal->process_keyring,
-					 type, description, match);
+	if (context->signal->process_keyring) {
+		key = keyring_search_aux(context->signal->process_keyring,
+					 context, type, description, match);
 		if (!IS_ERR(key))
 			goto found;
 
@@ -434,53 +435,93 @@ struct key *search_process_keyrings_aux(struct key_type *type,
 		}
 	}
 
-	/* search the session keyring last */
-	if (tsk->signal->session_keyring) {
+	/* search the session keyring */
+	if (context->signal->session_keyring) {
 		rcu_read_lock();
 		key = keyring_search_aux(
-			rcu_dereference(tsk->signal->session_keyring),
-			type, description, match);
+			rcu_dereference(context->signal->session_keyring),
+			context, type, description, match);
 		rcu_read_unlock();
+
+		if (!IS_ERR(key))
+			goto found;
+
+		switch (PTR_ERR(key)) {
+		case -EAGAIN: /* no key */
+			if (ret)
+				break;
+		case -ENOKEY: /* negative key */
+			ret = key;
+			break;
+		default:
+			err = key;
+			break;
+		}
+
+		/* if this process has a session keyring and that has an
+		 * instantiation authorisation key in the bottom level, then we
+		 * also search the keyrings of the process mentioned there */
+		if (context != current)
+			goto no_key;
+
+		rcu_read_lock();
+		instkey = __keyring_search_one(
+			rcu_dereference(context->signal->session_keyring),
+			&key_type_request_key_auth, NULL, 0);
+		rcu_read_unlock();
+
+		if (IS_ERR(instkey))
+			goto no_key;
+
+		rka = instkey->payload.data;
+
+		key = search_process_keyrings(type, description, match,
+					      rka->context);
+		key_put(instkey);
+
+		if (!IS_ERR(key))
+			goto found;
+
+		switch (PTR_ERR(key)) {
+		case -EAGAIN: /* no key */
+			if (ret)
+				break;
+		case -ENOKEY: /* negative key */
+			ret = key;
+			break;
+		default:
+			err = key;
+			break;
+		}
 	}
+	/* or search the user-session keyring */
 	else {
-		key = keyring_search_aux(tsk->user->session_keyring,
-					 type, description, match);
-	}
-
-	if (!IS_ERR(key))
-		goto found;
+		key = keyring_search_aux(context->user->session_keyring,
+					 context, type, description, match);
+		if (!IS_ERR(key))
+			goto found;
 
-	switch (PTR_ERR(key)) {
-	case -EAGAIN: /* no key */
-		if (ret)
+		switch (PTR_ERR(key)) {
+		case -EAGAIN: /* no key */
+			if (ret)
+				break;
+		case -ENOKEY: /* negative key */
+			ret = key;
 			break;
-	case -ENOKEY: /* negative key */
-		ret = key;
-		break;
-	default:
-		err = key;
-		break;
+		default:
+			err = key;
+			break;
+		}
 	}
 
+
+no_key:
 	/* no key - decide on the error we're going to go for */
 	key = ret ? ret : err;
 
- found:
+found:
 	return key;
 
-} /* end search_process_keyrings_aux() */
-
-/*****************************************************************************/
-/*
- * search the process keyrings for the first matching key
- * - we return -EAGAIN if we didn't find any matching key
- * - we return -ENOKEY if we found only negative matching keys
- */
-struct key *search_process_keyrings(struct key_type *type,
-				    const char *description)
-{
-	return search_process_keyrings_aux(type, description, type->match);
-
 } /* end search_process_keyrings() */
 
 /*****************************************************************************/
@@ -489,72 +530,73 @@ struct key *search_process_keyrings(struct key_type *type,
  * - don't create special keyrings unless so requested
  * - partially constructed keys aren't found unless requested
  */
-struct key *lookup_user_key(key_serial_t id, int create, int partial,
-			    key_perm_t perm)
+struct key *lookup_user_key(struct task_struct *context, key_serial_t id,
+			    int create, int partial, key_perm_t perm)
 {
-	struct task_struct *tsk = current;
-	unsigned long flags;
 	struct key *key;
 	int ret;
 
+	if (!context)
+		context = current;
+
 	key = ERR_PTR(-ENOKEY);
 
 	switch (id) {
 	case KEY_SPEC_THREAD_KEYRING:
-		if (!tsk->thread_keyring) {
+		if (!context->thread_keyring) {
 			if (!create)
 				goto error;
 
-			ret = install_thread_keyring(tsk);
+			ret = install_thread_keyring(context);
 			if (ret < 0) {
 				key = ERR_PTR(ret);
 				goto error;
 			}
 		}
 
-		key = tsk->thread_keyring;
+		key = context->thread_keyring;
 		atomic_inc(&key->usage);
 		break;
 
 	case KEY_SPEC_PROCESS_KEYRING:
-		if (!tsk->signal->process_keyring) {
+		if (!context->signal->process_keyring) {
 			if (!create)
 				goto error;
 
-			ret = install_process_keyring(tsk);
+			ret = install_process_keyring(context);
 			if (ret < 0) {
 				key = ERR_PTR(ret);
 				goto error;
 			}
 		}
 
-		key = tsk->signal->process_keyring;
+		key = context->signal->process_keyring;
 		atomic_inc(&key->usage);
 		break;
 
 	case KEY_SPEC_SESSION_KEYRING:
-		if (!tsk->signal->session_keyring) {
+		if (!context->signal->session_keyring) {
 			/* always install a session keyring upon access if one
 			 * doesn't exist yet */
 			ret = install_session_keyring(
-			       tsk, tsk->user->session_keyring);
+			       context, context->user->session_keyring);
 			if (ret < 0)
 				goto error;
 		}
 
-		spin_lock_irqsave(&tsk->sighand->siglock, flags);
-		key = tsk->signal->session_keyring;
+		rcu_read_lock();
+		key = rcu_dereference(context->signal->session_keyring);
 		atomic_inc(&key->usage);
-		spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+		rcu_read_unlock();
 		break;
 
 	case KEY_SPEC_USER_KEYRING:
-		key = tsk->user->uid_keyring;
+		key = context->user->uid_keyring;
 		atomic_inc(&key->usage);
 		break;
 
 	case KEY_SPEC_USER_SESSION_KEYRING:
-		key = tsk->user->session_keyring;
+		key = context->user->session_keyring;
 		atomic_inc(&key->usage);
 		break;
 
@@ -574,7 +616,7 @@ struct key *lookup_user_key(key_serial_t id, int create, int partial,
 		break;
 	}
 
-	/* check the status and permissions */
+	/* check the status */
 	if (perm) {
 		ret = key_validate(key);
 		if (ret < 0)
@@ -585,8 +627,10 @@ struct key *lookup_user_key(key_serial_t id, int create, int partial,
 	if (!partial && !test_bit(KEY_FLAG_INSTANTIATED, &key->flags))
 		goto invalid_key;
 
+	/* check the permissions */
 	ret = -EACCES;
-	if (!key_permission(key, perm))
+
+	if (!key_task_permission(key, context, perm))
 		goto invalid_key;
 
  error:
@@ -609,7 +653,6 @@ struct key *lookup_user_key(key_serial_t id, int create, int partial,
 long join_session_keyring(const char *name)
 {
 	struct task_struct *tsk = current;
-	unsigned long flags;
 	struct key *keyring;
 	long ret;
 
@@ -619,9 +662,9 @@ long join_session_keyring(const char *name)
 		if (ret < 0)
 			goto error;
 
-		spin_lock_irqsave(&tsk->sighand->siglock, flags);
-		ret = tsk->signal->session_keyring->serial;
-		spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+		rcu_read_lock();
+		ret = rcu_dereference(tsk->signal->session_keyring)->serial;
+		rcu_read_unlock();
 		goto error;
 	}
 
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 54aa7b70e63b..dfcd983af1fd 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -1,6 +1,6 @@
 /* request_key.c: request a key from userspace
  *
- * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2004-5 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -13,6 +13,7 @@
 #include <linux/sched.h>
 #include <linux/kmod.h>
 #include <linux/err.h>
+#include <linux/keyctl.h>
 #include "internal.h"
 
 struct key_construction {
@@ -27,18 +28,26 @@ DECLARE_WAIT_QUEUE_HEAD(request_key_conswq);
 /*
  * request userspace finish the construction of a key
  * - execute "/sbin/request-key <op> <key> <uid> <gid> <keyring> <keyring> <keyring> <info>"
- * - if callout_info is an empty string, it'll be rendered as a "-" instead
  */
 static int call_request_key(struct key *key,
 			    const char *op,
 			    const char *callout_info)
 {
 	struct task_struct *tsk = current;
-	unsigned long flags;
 	key_serial_t prkey, sskey;
+	struct key *session_keyring, *rkakey;
 	char *argv[10], *envp[3], uid_str[12], gid_str[12];
 	char key_str[12], keyring_str[3][12];
-	int i;
+	int ret, i;
+
+	kenter("{%d},%s,%s", key->serial, op, callout_info);
+
+	/* generate a new session keyring with an auth key in it */
+	session_keyring = request_key_auth_new(key, &rkakey);
+	if (IS_ERR(session_keyring)) {
+		ret = PTR_ERR(session_keyring);
+		goto error;
+	}
 
 	/* record the UID and GID */
 	sprintf(uid_str, "%d", current->fsuid);
@@ -55,17 +64,17 @@ static int call_request_key(struct key *key,
 	if (tsk->signal->process_keyring)
 		prkey = tsk->signal->process_keyring->serial;
 
-	sskey = 0;
-	spin_lock_irqsave(&tsk->sighand->siglock, flags);
-	if (tsk->signal->session_keyring)
-		sskey = tsk->signal->session_keyring->serial;
-	spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
-
+	sprintf(keyring_str[1], "%d", prkey);
 
-	if (!sskey)
+	if (tsk->signal->session_keyring) {
+		rcu_read_lock();
+		sskey = rcu_dereference(tsk->signal->session_keyring)->serial;
+		rcu_read_unlock();
+	}
+	else {
 		sskey = tsk->user->session_keyring->serial;
+	}
 
-	sprintf(keyring_str[1], "%d", prkey);
 	sprintf(keyring_str[2], "%d", sskey);
 
 	/* set up a minimal environment */
@@ -84,11 +93,20 @@ static int call_request_key(struct key *key,
 	argv[i++] = keyring_str[0];
 	argv[i++] = keyring_str[1];
 	argv[i++] = keyring_str[2];
-	argv[i++] = callout_info[0] ? (char *) callout_info : "-";
+	argv[i++] = (char *) callout_info;
 	argv[i] = NULL;
 
 	/* do it */
-	return call_usermodehelper_keys(argv[0], argv, envp, NULL, 1);
+	ret = call_usermodehelper_keys(argv[0], argv, envp, session_keyring, 1);
+
+	/* dispose of the special keys */
+	key_revoke(rkakey);
+	key_put(rkakey);
+	key_put(session_keyring);
+
+ error:
+	kleave(" = %d", ret);
+	return ret;
 
 } /* end call_request_key() */
 
@@ -107,6 +125,8 @@ static struct key *__request_key_construction(struct key_type *type,
 	struct key *key;
 	int ret, negated;
 
+	kenter("%s,%s,%s", type->name, description, callout_info);
+
 	/* create a key and add it to the queue */
 	key = key_alloc(type, description,
 			current->fsuid, current->fsgid, KEY_USR_ALL, 0);
@@ -143,6 +163,7 @@ static struct key *__request_key_construction(struct key_type *type,
 	}
 
  out:
+	kleave(" = %p", key);
 	return key;
 
  request_failed:
@@ -216,6 +237,9 @@ static struct key *request_key_construction(struct key_type *type,
 
 	DECLARE_WAITQUEUE(myself, current);
 
+	kenter("%s,%s,{%d},%s",
+	       type->name, description, user->uid, callout_info);
+
 	/* see if there's such a key under construction already */
 	down_write(&key_construction_sem);
 
@@ -232,6 +256,7 @@ static struct key *request_key_construction(struct key_type *type,
 	/* see about getting userspace to construct the key */
 	key = __request_key_construction(type, description, callout_info);
  error:
+	kleave(" = %p", key);
 	return key;
 
 	/* someone else has the same key under construction
@@ -245,9 +270,11 @@ static struct key *request_key_construction(struct key_type *type,
 	add_wait_queue(&request_key_conswq, &myself);
 
 	for (;;) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
+		set_current_state(TASK_INTERRUPTIBLE);
 		if (!test_bit(KEY_FLAG_USER_CONSTRUCT, &ckey->flags))
 			break;
+		if (signal_pending(current))
+			break;
 		schedule();
 	}
 
@@ -265,23 +292,85 @@ static struct key *request_key_construction(struct key_type *type,
 
 } /* end request_key_construction() */
 
+/*****************************************************************************/
+/*
+ * link a freshly minted key to an appropriate destination keyring
+ */
+static void request_key_link(struct key *key, struct key *dest_keyring)
+{
+	struct task_struct *tsk = current;
+	struct key *drop = NULL;
+
+	kenter("{%d},%p", key->serial, dest_keyring);
+
+	/* find the appropriate keyring */
+	if (!dest_keyring) {
+		switch (tsk->jit_keyring) {
+		case KEY_REQKEY_DEFL_DEFAULT:
+		case KEY_REQKEY_DEFL_THREAD_KEYRING:
+			dest_keyring = tsk->thread_keyring;
+			if (dest_keyring)
+				break;
+
+		case KEY_REQKEY_DEFL_PROCESS_KEYRING:
+			dest_keyring = tsk->signal->process_keyring;
+			if (dest_keyring)
+				break;
+
+		case KEY_REQKEY_DEFL_SESSION_KEYRING:
+			rcu_read_lock();
+			dest_keyring = key_get(
+				rcu_dereference(tsk->signal->session_keyring));
+			rcu_read_unlock();
+			drop = dest_keyring;
+
+			if (dest_keyring)
+				break;
+
+		case KEY_REQKEY_DEFL_USER_SESSION_KEYRING:
+			dest_keyring = current->user->session_keyring;
+			break;
+
+		case KEY_REQKEY_DEFL_USER_KEYRING:
+			dest_keyring = current->user->uid_keyring;
+			break;
+
+		case KEY_REQKEY_DEFL_GROUP_KEYRING:
+		default:
+			BUG();
+		}
+	}
+
+	/* and attach the key to it */
+	key_link(dest_keyring, key);
+
+	key_put(drop);
+
+	kleave("");
+
+} /* end request_key_link() */
+
 /*****************************************************************************/
 /*
  * request a key
  * - search the process's keyrings
  * - check the list of keys being created or updated
- * - call out to userspace for a key if requested (supplementary info can be
- *   passed)
+ * - call out to userspace for a key if supplementary info was provided
+ * - cache the key in an appropriate keyring
  */
-struct key *request_key(struct key_type *type,
-			const char *description,
-			const char *callout_info)
+struct key *request_key_and_link(struct key_type *type,
+				 const char *description,
+				 const char *callout_info,
+				 struct key *dest_keyring)
 {
 	struct key_user *user;
 	struct key *key;
 
+	kenter("%s,%s,%s,%p",
+	       type->name, description, callout_info, dest_keyring);
+
 	/* search all the process keyrings for a key */
-	key = search_process_keyrings_aux(type, description, type->match);
+	key = search_process_keyrings(type, description, type->match, current);
 
 	if (PTR_ERR(key) == -EAGAIN) {
 		/* the search failed, but the keyrings were searchable, so we
@@ -292,12 +381,13 @@ struct key *request_key(struct key_type *type,
 
 		/* - get hold of the user's construction queue */
 		user = key_user_lookup(current->fsuid);
-		if (!user) {
-			key = ERR_PTR(-ENOMEM);
-			goto error;
-		}
+		if (!user)
+			goto nomem;
+
+		do {
+			if (signal_pending(current))
+				goto interrupted;
 
-		for (;;) {
 			/* ask userspace (returns NULL if it waited on a key
 			 * being constructed) */
 			key = request_key_construction(type, description,
@@ -307,18 +397,46 @@ struct key *request_key(struct key_type *type,
 
 			/* someone else made the key we want, so we need to
 			 * search again as it might now be available to us */
-			key = search_process_keyrings_aux(type, description,
-							  type->match);
-			if (PTR_ERR(key) != -EAGAIN)
-				break;
-		}
+			key = search_process_keyrings(type, description,
+						      type->match, current);
+
+		} while (PTR_ERR(key) == -EAGAIN);
 
 		key_user_put(user);
+
+		/* link the new key into the appropriate keyring */
+		if (!IS_ERR(key))
+			request_key_link(key, dest_keyring);
 	}
 
- error:
+error:
+	kleave(" = %p", key);
 	return key;
 
+nomem:
+	key = ERR_PTR(-ENOMEM);
+	goto error;
+
+interrupted:
+	key_user_put(user);
+	key = ERR_PTR(-EINTR);
+	goto error;
+
+} /* end request_key_and_link() */
+
+/*****************************************************************************/
+/*
+ * request a key
+ * - search the process's keyrings
+ * - check the list of keys being created or updated
+ * - call out to userspace for a key if supplementary info was provided
+ */
+struct key *request_key(struct key_type *type,
+			const char *description,
+			const char *callout_info)
+{
+	return request_key_and_link(type, description, callout_info, NULL);
+
 } /* end request_key() */
 
 EXPORT_SYMBOL(request_key);
diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c
new file mode 100644
index 000000000000..f22264632229
--- /dev/null
+++ b/security/keys/request_key_auth.c
@@ -0,0 +1,180 @@
+/* request_key_auth.c: request key authorisation controlling key def
+ *
+ * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <linux/seq_file.h>
+#include "internal.h"
+
+static int request_key_auth_instantiate(struct key *, const void *, size_t);
+static void request_key_auth_describe(const struct key *, struct seq_file *);
+static void request_key_auth_destroy(struct key *);
+
+/*
+ * the request-key authorisation key type definition
+ */
+struct key_type key_type_request_key_auth = {
+	.name		= ".request_key_auth",
+	.def_datalen	= sizeof(struct request_key_auth),
+	.instantiate	= request_key_auth_instantiate,
+	.describe	= request_key_auth_describe,
+	.destroy	= request_key_auth_destroy,
+};
+
+/*****************************************************************************/
+/*
+ * instantiate a request-key authorisation record
+ */
+static int request_key_auth_instantiate(struct key *key,
+					const void *data,
+					size_t datalen)
+{
+	struct request_key_auth *rka, *irka;
+	struct key *instkey;
+	int ret;
+
+	ret = -ENOMEM;
+	rka = kmalloc(sizeof(*rka), GFP_KERNEL);
+	if (rka) {
+		/* see if the calling process is already servicing the key
+		 * request of another process */
+		instkey = key_get_instantiation_authkey(0);
+		if (!IS_ERR(instkey)) {
+			/* it is - use that instantiation context here too */
+			irka = instkey->payload.data;
+			rka->context = irka->context;
+			rka->pid = irka->pid;
+			key_put(instkey);
+		}
+		else {
+			/* it isn't - use this process as the context */
+			rka->context = current;
+			rka->pid = current->pid;
+		}
+
+		rka->target_key = key_get((struct key *) data);
+		key->payload.data = rka;
+		ret = 0;
+	}
+
+	return ret;
+
+} /* end request_key_auth_instantiate() */
+
+/*****************************************************************************/
+/*
+ * describe an authorisation key: emits "key:<description> pid:<pid>"
+ */
+static void request_key_auth_describe(const struct key *key,
+				      struct seq_file *m)
+{
+	struct request_key_auth *rka = key->payload.data;
+
+	seq_puts(m, "key:");
+	seq_puts(m, key->description);
+	seq_printf(m, " pid:%d", rka->pid);
+
+} /* end request_key_auth_describe() */
+
+/*****************************************************************************/
+/*
+ * destroy an instantiation authorisation token key
+ */
+static void request_key_auth_destroy(struct key *key)
+{
+	struct request_key_auth *rka = key->payload.data;
+
+	kenter("{%d}", key->serial);
+
+	key_put(rka->target_key);
+
+} /* end request_key_auth_destroy() */
+
+/*****************************************************************************/
+/*
+ * create a session keyring to be used for the invocation of /sbin/request-key
+ * and stick an authorisation token in it
+ */
+struct key *request_key_auth_new(struct key *target, struct key **_rkakey)
+{
+	struct key *keyring, *rkakey = NULL;
+	char desc[20];
+	int ret;
+
+	kenter("%d,", target->serial);
+
+	/* allocate a new session keyring */
+	sprintf(desc, "_req.%u", target->serial);
+
+	keyring = keyring_alloc(desc, current->fsuid, current->fsgid, 1, NULL);
+	if (IS_ERR(keyring)) {
+		kleave("= %ld", PTR_ERR(keyring));
+		return keyring;
+	}
+
+	/* allocate the auth key */
+	sprintf(desc, "%x", target->serial);
+
+	rkakey = key_alloc(&key_type_request_key_auth, desc,
+			   current->fsuid, current->fsgid,
+			   KEY_USR_VIEW, 1);
+	if (IS_ERR(rkakey)) {
+		key_put(keyring);
+		kleave("= %ld", PTR_ERR(rkakey));
+		return rkakey;
+	}
+
+	/* construct and attach to the keyring */
+	ret = key_instantiate_and_link(rkakey, target, 0, keyring, NULL);
+	if (ret < 0) {
+		key_revoke(rkakey);
+		key_put(rkakey);
+		key_put(keyring);
+		kleave("= %d", ret);
+		return ERR_PTR(ret);
+	}
+
+	*_rkakey = rkakey;
+	kleave(" = {%d} ({%d})", keyring->serial, rkakey->serial);
+	return keyring;
+
+} /* end request_key_auth_new() */
+
+/*****************************************************************************/
+/*
+ * get the authorisation key for instantiation of a specific key if attached to
+ * the current process's keyrings
+ * - this key is inserted into a keyring and that is set as /sbin/request-key's
+ *   session keyring
+ * - a target_id of zero specifies any valid token
+ */
+struct key *key_get_instantiation_authkey(key_serial_t target_id)
+{
+	struct task_struct *tsk = current;
+	struct key *instkey;
+
+	/* we must have our own personal session keyring */
+	if (!tsk->signal->session_keyring)
+		return ERR_PTR(-EACCES);
+
+	/* and it must contain a suitable request authorisation key
+	 * - lock RCU against session keyring changing
+	 */
+	rcu_read_lock();
+
+	instkey = keyring_search_instkey(
+		rcu_dereference(tsk->signal->session_keyring), target_id);
+
+	rcu_read_unlock();
+	return instkey;
+
+} /* end key_get_instantiation_authkey() */
-- 
cgit v1.2.3-59-g8ed1b


From 92198f7eaa5df3479341dd8fa20c2c81aa3b1e25 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 23 Jun 2005 22:00:59 -0700
Subject: [PATCH] pass iocb to dio_iodone_t

XFS will have to look at iocb->private to fix aio+dio.  No other filesystem
is using the blockdev_direct_IO* end_io callback.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/direct-io.c              | 2 +-
 fs/xfs/linux-2.6/xfs_aops.c | 3 ++-
 include/linux/fs.h          | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 1d55e7e67342..0d06097bc995 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -215,7 +215,7 @@ static struct page *dio_get_page(struct dio *dio)
 static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes)
 {
 	if (dio->end_io && dio->result)
-		dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private);
+		dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private);
 	if (dio->lock_type == DIO_LOCKING)
 		up_read(&dio->inode->i_alloc_sem);
 }
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 93ce257cd149..a3a4b5aaf5d9 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -149,11 +149,12 @@ linvfs_unwritten_convert(
  */
 STATIC void
 linvfs_unwritten_convert_direct(
-	struct inode	*inode,
+	struct kiocb	*iocb,
 	loff_t		offset,
 	ssize_t		size,
 	void		*private)
 {
+	struct inode	*inode = iocb->ki_filp->f_dentry->d_inode;
 	ASSERT(!private || inode == (struct inode *)private);
 
 	/* private indicates an unwritten extent lay beneath this IO */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 517bf4966bf5..83857d8070d3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -220,6 +220,7 @@ extern int dir_notify_enable;
 
 struct iovec;
 struct nameidata;
+struct kiocb;
 struct pipe_inode_info;
 struct poll_table_struct;
 struct kstatfs;
@@ -240,7 +241,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
 typedef int (get_blocks_t)(struct inode *inode, sector_t iblock,
 			unsigned long max_blocks,
 			struct buffer_head *bh_result, int create);
-typedef void (dio_iodone_t)(struct inode *inode, loff_t offset,
+typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 			ssize_t bytes, void *private);
 
 /*
@@ -302,7 +303,6 @@ struct iattr {
 struct page;
 struct address_space;
 struct writeback_control;
-struct kiocb;
 
 struct address_space_operations {
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
-- 
cgit v1.2.3-59-g8ed1b


From 4e5117ba0af4582b6ec9164874f719d7f3f1eb2b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 23 Jun 2005 22:01:03 -0700
Subject: [PATCH] quota: improve credits estimates

Improve the estimates of the number of credits needed for a quota
transaction.  We now distinguish between blocks that might need to be
allocated and blocks that only need to be rewritten.  We also distinguish
between deleting a quota structure and creating a new one.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/dqblk_v1.h | 6 ++++++
 include/linux/dqblk_v2.h | 6 ++++++
 include/linux/quota.h    | 7 +++++--
 3 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dqblk_v1.h b/include/linux/dqblk_v1.h
index 42fbf4797156..57f1250d5a52 100644
--- a/include/linux/dqblk_v1.h
+++ b/include/linux/dqblk_v1.h
@@ -11,6 +11,12 @@
 /* Root squash turned on */
 #define V1_DQF_RSQUASH 1
 
+/* Numbers of blocks needed for updates */
+#define V1_INIT_ALLOC 1
+#define V1_INIT_REWRITE 1
+#define V1_DEL_ALLOC 0
+#define V1_DEL_REWRITE 2
+
 /* Special information about quotafile */
 struct v1_mem_dqinfo {
 };
diff --git a/include/linux/dqblk_v2.h b/include/linux/dqblk_v2.h
index 4a6c5f6867bb..4f853322cb7f 100644
--- a/include/linux/dqblk_v2.h
+++ b/include/linux/dqblk_v2.h
@@ -10,6 +10,12 @@
 /* id numbers of quota format */
 #define QFMT_VFS_V0 2
 
+/* Numbers of blocks needed for updates */
+#define V2_INIT_ALLOC 4
+#define V2_INIT_REWRITE 2
+#define V2_DEL_ALLOC 0
+#define V2_DEL_REWRITE 6
+
 /* Inmemory copy of version specific information */
 struct v2_mem_dqinfo {
 	unsigned int dqi_blocks;
diff --git a/include/linux/quota.h b/include/linux/quota.h
index ac5b90f4f256..700ead45084f 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -138,8 +138,11 @@ struct if_dqinfo {
 #include <linux/dqblk_v2.h>
 
 /* Maximal numbers of writes for quota operation (insert/delete/update)
- * (over all formats) - info block, 4 pointer blocks, data block */
-#define DQUOT_MAX_WRITES	6
+ * (over VFS all formats) */
+#define DQUOT_INIT_ALLOC max(V1_INIT_ALLOC, V2_INIT_ALLOC)
+#define DQUOT_INIT_REWRITE max(V1_INIT_REWRITE, V2_INIT_REWRITE)
+#define DQUOT_DEL_ALLOC max(V1_DEL_ALLOC, V2_DEL_ALLOC)
+#define DQUOT_DEL_REWRITE max(V1_DEL_REWRITE, V2_DEL_REWRITE)
 
 /*
  * Data for one user/group kept in memory
-- 
cgit v1.2.3-59-g8ed1b


From 1f54587bea84a35125c95e19b98c2f464c50871b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 23 Jun 2005 22:01:04 -0700
Subject: [PATCH] quota: ext3: Improve quota credit estimates

Use the improved credit estimates for quota operations.  Also reserve space
for a quota operation in a transaction only if the filesystem was mounted
with some quota option.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/acl.c            |  5 +++--
 fs/ext3/inode.c          |  7 ++++---
 fs/ext3/namei.c          | 25 +++++++++++++------------
 fs/ext3/super.c          | 37 +++++++++++++++++++++++++++----------
 fs/ext3/xattr.c          |  2 +-
 include/linux/ext3_fs.h  |  1 +
 include/linux/ext3_jbd.h | 19 +++++++++++--------
 7 files changed, 60 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c
index 133f5aa581bb..3ac38266fc9e 100644
--- a/fs/ext3/acl.c
+++ b/fs/ext3/acl.c
@@ -393,7 +393,8 @@ ext3_acl_chmod(struct inode *inode)
 		int retries = 0;
 
 	retry:
-		handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS);
+		handle = ext3_journal_start(inode,
+				EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
 		if (IS_ERR(handle)) {
 			error = PTR_ERR(handle);
 			ext3_std_error(inode->i_sb, error);
@@ -503,7 +504,7 @@ ext3_xattr_set_acl(struct inode *inode, int type, const void *value,
 		acl = NULL;
 
 retry:
-	handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS);
+	handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	error = ext3_set_acl(handle, inode, type, acl);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 0d5fa73b18dc..0b2db4f618cb 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -128,7 +128,7 @@ static unsigned long blocks_for_truncate(struct inode *inode)
 	if (needed > EXT3_MAX_TRANS_DATA) 
 		needed = EXT3_MAX_TRANS_DATA;
 
-	return EXT3_DATA_TRANS_BLOCKS + needed;
+	return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
 }
 
 /* 
@@ -2763,7 +2763,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr)
 
 		/* (user+group)*(old+new) structure, inode write (sb,
 		 * inode block, ? - but truncate inode update has it) */
-		handle = ext3_journal_start(inode, 4*EXT3_QUOTA_INIT_BLOCKS+3);
+		handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+
+					EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
 		if (IS_ERR(handle)) {
 			error = PTR_ERR(handle);
 			goto err_out;
@@ -2861,7 +2862,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode)
 #ifdef CONFIG_QUOTA
 	/* We know that structure was already allocated during DQUOT_INIT so
 	 * we will be updating only the data blocks + inodes */
-	ret += 2*EXT3_QUOTA_TRANS_BLOCKS;
+	ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
 #endif
 
 	return ret;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 60e44e6dd7a6..50378d8ff84b 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1645,9 +1645,9 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode,
 	int err, retries = 0;
 
 retry:
-	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
+	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-					2*EXT3_QUOTA_INIT_BLOCKS);
+					2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -1679,9 +1679,9 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry,
 		return -EINVAL;
 
 retry:
-	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
+	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
 			 		EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-					2*EXT3_QUOTA_INIT_BLOCKS);
+					2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -1715,9 +1715,9 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode)
 		return -EMLINK;
 
 retry:
-	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
+	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 +
-					2*EXT3_QUOTA_INIT_BLOCKS);
+					2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -2006,7 +2006,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry)
 	/* Initialize quotas before so that eventual writes go in
 	 * separate transaction */
 	DQUOT_INIT(dentry->d_inode);
-	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -2065,7 +2065,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry)
 	/* Initialize quotas before so that eventual writes go
 	 * in separate transaction */
 	DQUOT_INIT(dentry->d_inode);
-	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS);
+	handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -2120,9 +2120,9 @@ static int ext3_symlink (struct inode * dir,
 		return -ENAMETOOLONG;
 
 retry:
-	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
+	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
 			 		EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 +
-					2*EXT3_QUOTA_INIT_BLOCKS);
+					2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
@@ -2174,7 +2174,7 @@ static int ext3_link (struct dentry * old_dentry,
 		return -EMLINK;
 
 retry:
-	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS +
+	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT3_INDEX_EXTRA_TRANS_BLOCKS);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
@@ -2216,7 +2216,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry,
 	 * in separate transaction */
 	if (new_dentry->d_inode)
 		DQUOT_INIT(new_dentry->d_inode);
-	handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS +
+	handle = ext3_journal_start(old_dir, 2 *
+					EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) +
 			 		EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2);
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9630fbfdc24a..b4b3e8a39131 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -589,7 +589,7 @@ enum {
 	Opt_commit, Opt_journal_update, Opt_journal_inum,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
-	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0,
+	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
 };
 
@@ -634,10 +634,10 @@ static match_table_t tokens = {
 	{Opt_grpjquota, "grpjquota=%s"},
 	{Opt_jqfmt_vfsold, "jqfmt=vfsold"},
 	{Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
-	{Opt_ignore, "grpquota"},
-	{Opt_ignore, "noquota"},
-	{Opt_ignore, "quota"},
-	{Opt_ignore, "usrquota"},
+	{Opt_quota, "grpquota"},
+	{Opt_noquota, "noquota"},
+	{Opt_quota, "quota"},
+	{Opt_quota, "usrquota"},
 	{Opt_barrier, "barrier=%u"},
 	{Opt_err, NULL},
 	{Opt_resize, "resize"},
@@ -876,6 +876,7 @@ set_qf_name:
 				sbi->s_qf_names[qtype] = NULL;
 				return 0;
 			}
+			set_opt(sbi->s_mount_opt, QUOTA);
 			break;
 		case Opt_offusrjquota:
 			qtype = USRQUOTA;
@@ -898,6 +899,17 @@ clear_qf_name:
 		case Opt_jqfmt_vfsv0:
 			sbi->s_jquota_fmt = QFMT_VFS_V0;
 			break;
+		case Opt_quota:
+			set_opt(sbi->s_mount_opt, QUOTA);
+			break;
+		case Opt_noquota:
+			if (sb_any_quota_enabled(sb)) {
+				printk(KERN_ERR "EXT3-fs: Cannot change quota "
+					"options when quota turned on.\n");
+				return 0;
+			}
+			clear_opt(sbi->s_mount_opt, QUOTA);
+			break;
 #else
 		case Opt_usrjquota:
 		case Opt_grpjquota:
@@ -909,6 +921,9 @@ clear_qf_name:
 				"EXT3-fs: journalled quota options not "
 				"supported.\n");
 			break;
+		case Opt_quota:
+		case Opt_noquota:
+			break;
 #endif
 		case Opt_abort:
 			set_opt(sbi->s_mount_opt, ABORT);
@@ -2238,7 +2253,7 @@ static int ext3_dquot_initialize(struct inode *inode, int type)
 	int ret, err;
 
 	/* We may create quota structure so we need to reserve enough blocks */
-	handle = ext3_journal_start(inode, 2*EXT3_QUOTA_INIT_BLOCKS);
+	handle = ext3_journal_start(inode, 2*EXT3_QUOTA_INIT_BLOCKS(inode->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	ret = dquot_initialize(inode, type);
@@ -2254,7 +2269,7 @@ static int ext3_dquot_drop(struct inode *inode)
 	int ret, err;
 
 	/* We may delete quota structure so we need to reserve enough blocks */
-	handle = ext3_journal_start(inode, 2*EXT3_QUOTA_INIT_BLOCKS);
+	handle = ext3_journal_start(inode, 2*EXT3_QUOTA_DEL_BLOCKS(inode->i_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	ret = dquot_drop(inode);
@@ -2272,7 +2287,7 @@ static int ext3_write_dquot(struct dquot *dquot)
 
 	inode = dquot_to_inode(dquot);
 	handle = ext3_journal_start(inode,
-					EXT3_QUOTA_TRANS_BLOCKS);
+					EXT3_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	ret = dquot_commit(dquot);
@@ -2288,7 +2303,7 @@ static int ext3_acquire_dquot(struct dquot *dquot)
 	handle_t *handle;
 
 	handle = ext3_journal_start(dquot_to_inode(dquot),
-					EXT3_QUOTA_INIT_BLOCKS);
+					EXT3_QUOTA_INIT_BLOCKS(dquot->dq_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	ret = dquot_acquire(dquot);
@@ -2304,7 +2319,7 @@ static int ext3_release_dquot(struct dquot *dquot)
 	handle_t *handle;
 
 	handle = ext3_journal_start(dquot_to_inode(dquot),
-					EXT3_QUOTA_INIT_BLOCKS);
+					EXT3_QUOTA_DEL_BLOCKS(dquot->dq_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	ret = dquot_release(dquot);
@@ -2361,6 +2376,8 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
 	int err;
 	struct nameidata nd;
 
+	if (!test_opt(sb, QUOTA))
+		return -EINVAL;
 	/* Not journalling quota? */
 	if (!EXT3_SB(sb)->s_qf_names[USRQUOTA] &&
 	    !EXT3_SB(sb)->s_qf_names[GRPQUOTA])
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 4cbc6d0212d3..3f9dfa643b19 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -1044,7 +1044,7 @@ ext3_xattr_set(struct inode *inode, int name_index, const char *name,
 	int error, retries = 0;
 
 retry:
-	handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS);
+	handle = ext3_journal_start(inode, EXT3_DATA_TRANS_BLOCKS(inode->i_sb));
 	if (IS_ERR(handle)) {
 		error = PTR_ERR(handle);
 	} else {
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 74ad31781e3e..4b6e1ab216a5 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -358,6 +358,7 @@ struct ext3_inode {
 #define EXT3_MOUNT_RESERVATION		0x10000	/* Preallocation */
 #define EXT3_MOUNT_BARRIER		0x20000 /* Use block barriers */
 #define EXT3_MOUNT_NOBH			0x40000 /* No bufferheads */
+#define EXT3_MOUNT_QUOTA		0x80000 /* Some quota option set */
 
 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
diff --git a/include/linux/ext3_jbd.h b/include/linux/ext3_jbd.h
index e8292af9033b..c8307c02dd07 100644
--- a/include/linux/ext3_jbd.h
+++ b/include/linux/ext3_jbd.h
@@ -42,15 +42,15 @@
  * superblock only gets updated once, of course, so don't bother
  * counting that again for the quota updates. */
 
-#define EXT3_DATA_TRANS_BLOCKS		(EXT3_SINGLEDATA_TRANS_BLOCKS + \
+#define EXT3_DATA_TRANS_BLOCKS(sb)	(EXT3_SINGLEDATA_TRANS_BLOCKS + \
 					 EXT3_XATTR_TRANS_BLOCKS - 2 + \
-					 2*EXT3_QUOTA_TRANS_BLOCKS)
+					 2*EXT3_QUOTA_TRANS_BLOCKS(sb))
 
 /* Delete operations potentially hit one directory's namespace plus an
  * entire inode, plus arbitrary amounts of bitmap/indirection data.  Be
  * generous.  We can grow the delete transaction later if necessary. */
 
-#define EXT3_DELETE_TRANS_BLOCKS	(2 * EXT3_DATA_TRANS_BLOCKS + 64)
+#define EXT3_DELETE_TRANS_BLOCKS(sb)	(2 * EXT3_DATA_TRANS_BLOCKS(sb) + 64)
 
 /* Define an arbitrary limit for the amount of data we will anticipate
  * writing to any given transaction.  For unbounded transactions such as
@@ -74,14 +74,17 @@
 #ifdef CONFIG_QUOTA
 /* Amount of blocks needed for quota update - we know that the structure was
  * allocated so we need to update only inode+data */
-#define EXT3_QUOTA_TRANS_BLOCKS 2
+#define EXT3_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
 /* Amount of blocks needed for quota insert/delete - we do some block writes
  * but inode, sb and group updates are done only once */
-#define EXT3_QUOTA_INIT_BLOCKS (DQUOT_MAX_WRITES*\
-				(EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3)
+#define EXT3_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
+		(EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0)
+#define EXT3_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
+		(EXT3_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0)
 #else
-#define EXT3_QUOTA_TRANS_BLOCKS 0
-#define EXT3_QUOTA_INIT_BLOCKS 0
+#define EXT3_QUOTA_TRANS_BLOCKS(sb) 0
+#define EXT3_QUOTA_INIT_BLOCKS(sb) 0
+#define EXT3_QUOTA_DEL_BLOCKS(sb) 0
 #endif
 
 int
-- 
cgit v1.2.3-59-g8ed1b


From 556a2a45bce1740f035befaa7201e4ad836c7257 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 23 Jun 2005 22:01:06 -0700
Subject: [PATCH] quota: reiserfs: improve quota credit estimates

Use the improved credit estimates for quota operations.  Also reserve space
for a quota operation in a transaction only if the filesystem was mounted
with some quota option.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/file.c             |  4 ++--
 fs/reiserfs/inode.c            | 11 ++++++-----
 fs/reiserfs/namei.c            | 25 ++++++++++++++-----------
 fs/reiserfs/super.c            | 35 +++++++++++++++++++++++------------
 include/linux/reiserfs_fs.h    | 15 +++++++++++----
 include/linux/reiserfs_fs_sb.h |  2 ++
 6 files changed, 58 insertions(+), 34 deletions(-)

(limited to 'include/linux')

diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 2230afff1870..12e91209544e 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -201,7 +201,7 @@ static int reiserfs_allocate_blocks_for_region(
     /* If we came here, it means we absolutely need to open a transaction,
        since we need to allocate some blocks */
     reiserfs_write_lock(inode->i_sb); // Journaling stuff and we need that.
-    res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS); // Wish I know if this number enough
+    res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb)); // Wish I know if this number enough
     if (res)
         goto error_exit;
     reiserfs_update_inode_transaction(inode) ;
@@ -576,7 +576,7 @@ error_exit:
         int err;
         // update any changes we made to blk count
         reiserfs_update_sd(th, inode);
-        err = journal_end(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS);
+        err = journal_end(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));
         if (err)
             res = err;
     }
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 073425e6e0a9..0d5817f81972 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -28,7 +28,7 @@ static int reiserfs_prepare_write(struct file *f, struct page *page,
 void reiserfs_delete_inode (struct inode * inode)
 {
     /* We need blocks for transaction + (user+group) quota update (possibly delete) */
-    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * REISERFS_QUOTA_INIT_BLOCKS;
+    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
     struct reiserfs_transaction_handle th ;
   
     reiserfs_write_lock(inode->i_sb);
@@ -591,7 +591,7 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
        XXX in practically impossible worst case direct2indirect()
        can incur (much) more than 3 balancings.
        quota update for user, group */
-    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS;
+    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
     int version;
     int dangle = 1;
     loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ;
@@ -2796,14 +2796,15 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) {
 
                 if (!error) {
 		    struct reiserfs_transaction_handle th;
+		    int jbegin_count = 2*(REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb)+REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb))+2;
 
 		    /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
-		    error = journal_begin(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2);
+		    error = journal_begin(&th, inode->i_sb, jbegin_count);
  		    if (error)
  			goto out;
                     error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
 		    if (error) {
-			journal_end(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2);
+			journal_end(&th, inode->i_sb, jbegin_count);
 			goto out;
 		    }
 		    /* Update corresponding info in inode so that everything is in
@@ -2813,7 +2814,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) {
 		    if (attr->ia_valid & ATTR_GID)
 			inode->i_gid = attr->ia_gid;
 		    mark_inode_dirty(inode);
-		    error = journal_end(&th, inode->i_sb, 4*REISERFS_QUOTA_INIT_BLOCKS+2);
+		    error = journal_end(&th, inode->i_sb, jbegin_count);
 		}
         }
         if (!error)
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 7d4dc5f5aa8b..4a333255f27a 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -586,7 +586,7 @@ static int reiserfs_create (struct inode * dir, struct dentry *dentry, int mode,
     int retval;
     struct inode * inode;
     /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
-    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS);
+    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb)+REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
     struct reiserfs_transaction_handle th ;
     int locked;
 
@@ -653,7 +653,7 @@ static int reiserfs_mknod (struct inode * dir, struct dentry *dentry, int mode,
     struct inode * inode;
     struct reiserfs_transaction_handle th ;
     /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
-    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS);
+    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb)+REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
     int locked;
 
     if (!new_valid_dev(rdev))
@@ -727,7 +727,7 @@ static int reiserfs_mkdir (struct inode * dir, struct dentry *dentry, int mode)
     struct inode * inode;
     struct reiserfs_transaction_handle th ;
     /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
-    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS);
+    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb)+REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
     int locked;
 
 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
@@ -829,8 +829,10 @@ static int reiserfs_rmdir (struct inode * dir, struct dentry *dentry)
 
 
     /* we will be doing 2 balancings and update 2 stat data, we change quotas
-     * of the owner of the directory and of the owner of the parent directory */
-    jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS);
+     * of the owner of the directory and of the owner of the parent directory.
+     * The quota structure is possibly deleted only on last iput => outside
+     * of this transaction */
+    jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
 
     reiserfs_write_lock(dir->i_sb);
     retval = journal_begin(&th, dir->i_sb, jbegin_count) ;
@@ -913,9 +915,10 @@ static int reiserfs_unlink (struct inode * dir, struct dentry *dentry)
     inode = dentry->d_inode;
 
     /* in this transaction we can be doing at max two balancings and update
-       two stat datas, we change quotas of the owner of the directory and of
-       the owner of the parent directory */
-    jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS);
+     * two stat datas, we change quotas of the owner of the directory and of
+     * the owner of the parent directory. The quota structure is possibly
+     * deleted only on iput => outside of this transaction */
+    jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
 
     reiserfs_write_lock(dir->i_sb);
     retval = journal_begin(&th, dir->i_sb, jbegin_count) ;
@@ -1000,7 +1003,7 @@ static int reiserfs_symlink (struct inode * parent_dir,
     struct reiserfs_transaction_handle th ;
     int mode = S_IFLNK | S_IRWXUGO;
     /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
-    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS+REISERFS_QUOTA_TRANS_BLOCKS);
+    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb)+REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
 
     if (!(inode = new_inode(parent_dir->i_sb))) {
 	return -ENOMEM ;
@@ -1076,7 +1079,7 @@ static int reiserfs_link (struct dentry * old_dentry, struct inode * dir, struct
     struct inode *inode = old_dentry->d_inode;
     struct reiserfs_transaction_handle th ;
     /* We need blocks for transaction + update of quotas for the owners of the directory */
-    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * REISERFS_QUOTA_TRANS_BLOCKS;
+    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
 
     reiserfs_write_lock(dir->i_sb);
     if (inode->i_nlink >= REISERFS_LINK_MAX) {
@@ -1196,7 +1199,7 @@ static int reiserfs_rename (struct inode * old_dir, struct dentry *old_dentry,
        pointed initially and (5) maybe block containing ".." of
        renamed directory
        quota updates: two parent directories */
-    jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 5 + 4 * REISERFS_QUOTA_TRANS_BLOCKS;
+    jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 5 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
 
     old_inode = old_dentry->d_inode;
     new_dentry_inode = new_dentry->d_inode;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 031577fb41a1..660aefca1fd2 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -866,8 +866,9 @@ static int reiserfs_parse_options (struct super_block * s, char * options, /* st
 	{"jdev",	.arg_required = 'j', .values = NULL},
 	{"nolargeio",	.arg_required = 'w', .values = NULL},
 	{"commit",	.arg_required = 'c', .values = NULL},
-	{"usrquota",},
-	{"grpquota",},
+	{"usrquota",	.setmask = 1<<REISERFS_QUOTA},
+	{"grpquota",	.setmask = 1<<REISERFS_QUOTA},
+	{"noquota",	.clrmask = 1<<REISERFS_QUOTA},
 	{"errors", 	.arg_required = 'e', .values = error_actions},
 	{"usrjquota",	.arg_required = 'u'|(1<<REISERFS_OPT_ALLOWEMPTY), .values = NULL},
 	{"grpjquota",	.arg_required = 'g'|(1<<REISERFS_OPT_ALLOWEMPTY), .values = NULL},
@@ -964,6 +965,7 @@ static int reiserfs_parse_options (struct super_block * s, char * options, /* st
 		    return 0;
 		}
 		strcpy(REISERFS_SB(s)->s_qf_names[qtype], arg);
+		*mount_options |= 1<<REISERFS_QUOTA;
 	    }
 	    else {
 		if (REISERFS_SB(s)->s_qf_names[qtype]) {
@@ -995,7 +997,13 @@ static int reiserfs_parse_options (struct super_block * s, char * options, /* st
 	reiserfs_warning(s, "reiserfs_parse_options: journalled quota format not specified.");
 	return 0;
     }
+    /* This checking is not precise wrt the quota type but for our purposes it is sufficient */
+    if (!(*mount_options & (1<<REISERFS_QUOTA)) && sb_any_quota_enabled(s)) {
+	reiserfs_warning(s, "reiserfs_parse_options: quota options must be present when quota is turned on.");
+	return 0;
+    }
 #endif
+
     return 1;
 }
 
@@ -1105,6 +1113,7 @@ static int reiserfs_remount (struct super_block * s, int * mount_flags, char * a
   safe_mask |= 1 << REISERFS_ERROR_RO;
   safe_mask |= 1 << REISERFS_ERROR_CONTINUE;
   safe_mask |= 1 << REISERFS_ERROR_PANIC;
+  safe_mask |= 1 << REISERFS_QUOTA;
 
   /* Update the bitmask, taking care to keep
    * the bits we're not allowed to change here */
@@ -1845,11 +1854,11 @@ static int reiserfs_dquot_initialize(struct inode *inode, int type)
 
     /* We may create quota structure so we need to reserve enough blocks */
     reiserfs_write_lock(inode->i_sb);
-    ret = journal_begin(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS);
+    ret = journal_begin(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb));
     if (ret)
 	goto out;
     ret = dquot_initialize(inode, type);
-    err = journal_end(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS);
+    err = journal_end(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb));
     if (!ret && err)
 	ret = err;
 out:
@@ -1864,11 +1873,11 @@ static int reiserfs_dquot_drop(struct inode *inode)
 
     /* We may delete quota structure so we need to reserve enough blocks */
     reiserfs_write_lock(inode->i_sb);
-    ret = journal_begin(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS);
+    ret = journal_begin(&th, inode->i_sb, 2*REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb));
     if (ret)
  	goto out;
     ret = dquot_drop(inode);
-    err = journal_end(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS);
+    err = journal_end(&th, inode->i_sb, 2*REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb));
     if (!ret && err)
 	ret = err;
 out:
@@ -1882,11 +1891,11 @@ static int reiserfs_write_dquot(struct dquot *dquot)
     int ret, err;
 
     reiserfs_write_lock(dquot->dq_sb);
-    ret = journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS);
+    ret = journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
     if (ret)
 	goto out;
     ret = dquot_commit(dquot);
-    err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS);
+    err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
     if (!ret && err)
 	ret = err;
 out:
@@ -1900,11 +1909,11 @@ static int reiserfs_acquire_dquot(struct dquot *dquot)
     int ret, err;
 
     reiserfs_write_lock(dquot->dq_sb);
-    ret = journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS);
+    ret = journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
     if (ret)
 	goto out;
     ret = dquot_acquire(dquot);
-    err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS);
+    err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
     if (!ret && err)
 	ret = err;
 out:
@@ -1918,11 +1927,11 @@ static int reiserfs_release_dquot(struct dquot *dquot)
     int ret, err;
 
     reiserfs_write_lock(dquot->dq_sb);
-    ret = journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS);
+    ret = journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
     if (ret)
  	goto out;
     ret = dquot_release(dquot);
-    err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS);
+    err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
     if (!ret && err)
 	ret = err;
 out:
@@ -1978,6 +1987,8 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, ch
     int err;
     struct nameidata nd;
 
+    if (!(REISERFS_SB(sb)->s_mount_opt & (1<<REISERFS_QUOTA)))
+	return -EINVAL;
     err = path_lookup(path, LOOKUP_FOLLOW, &nd);
     if (err)
         return err;
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 32148625fc2f..4c7c5689ad93 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -1644,11 +1644,18 @@ struct reiserfs_journal_header {
 #define JOURNAL_MAX_TRANS_AGE 30
 #define JOURNAL_PER_BALANCE_CNT (3 * (MAX_HEIGHT-2) + 9)
 #ifdef CONFIG_QUOTA
-#define REISERFS_QUOTA_TRANS_BLOCKS 2	/* We need to update data and inode (atime) */
-#define REISERFS_QUOTA_INIT_BLOCKS (DQUOT_MAX_WRITES*(JOURNAL_PER_BALANCE_CNT+2)+1)	/* 1 balancing, 1 bitmap, 1 data per write + stat data update */
+/* We need to update data and inode (atime) */
+#define REISERFS_QUOTA_TRANS_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & (1<<REISERFS_QUOTA) ? 2 : 0)
+/* 1 balancing, 1 bitmap, 1 data per write + stat data update */
+#define REISERFS_QUOTA_INIT_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & (1<<REISERFS_QUOTA) ? \
+(DQUOT_INIT_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_INIT_REWRITE+1) : 0)
+/* same as with INIT */
+#define REISERFS_QUOTA_DEL_BLOCKS(s) (REISERFS_SB(s)->s_mount_opt & (1<<REISERFS_QUOTA) ? \
+(DQUOT_DEL_ALLOC*(JOURNAL_PER_BALANCE_CNT+2)+DQUOT_DEL_REWRITE+1) : 0)
 #else
-#define REISERFS_QUOTA_TRANS_BLOCKS 0
-#define REISERFS_QUOTA_INIT_BLOCKS 0
+#define REISERFS_QUOTA_TRANS_BLOCKS(s) 0
+#define REISERFS_QUOTA_INIT_BLOCKS(s) 0
+#define REISERFS_QUOTA_DEL_BLOCKS(s) 0
 #endif
 
 /* both of these can be as low as 1, or as high as you want.  The min is the
diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h
index 37a3a7afbec7..31c709d0fe18 100644
--- a/include/linux/reiserfs_fs_sb.h
+++ b/include/linux/reiserfs_fs_sb.h
@@ -467,6 +467,8 @@ enum reiserfs_mount_options {
     REISERFS_ERROR_RO,
     REISERFS_ERROR_CONTINUE,
 
+    REISERFS_QUOTA,		/* Some quota option specified */
+
     REISERFS_TEST1,
     REISERFS_TEST2,
     REISERFS_TEST3,
-- 
cgit v1.2.3-59-g8ed1b


From 3b6259432dee81f928c22c48c080d5f6325ed92e Mon Sep 17 00:00:00 2001
From: Corey Minyard <minyard@acm.org>
Date: Thu, 23 Jun 2005 22:01:42 -0700
Subject: [PATCH] ipmi: add power cycle capability

This patch adds "power cycle" functionality to the IPMI power off module
ipmi_poweroff.  It also contains changes to support procfs control of the
feature.

The power cycle action is considered an optional chassis control in the IPMI
specification.  However, it is definitely useful when the hardware supports
it.  A power cycle is usually required in order to reset a firmware in a bad
state.  This action is critical to allow remote management of servers.

The implementation adds power cycle as optional to the ipmi_poweroff module.
It can be modified dynamically through the proc entry mentioned above.  During
a power down, if power cycle is enabled, the power cycle command is sent to the
BMC firmware.  If it fails, either due to non-support or some error, the driver
retries by sending the power off command instead.

Signed-off-by: Christopher A. Poblete <Chris_Poblete@dell.com>
Signed-off-by: Corey Minyard <minyard@acm.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/IPMI.txt              |  20 +++++++
 drivers/char/ipmi/ipmi_msghandler.c |  29 +++++++++-
 drivers/char/ipmi/ipmi_poweroff.c   | 112 +++++++++++++++++++++++++++++++++---
 include/linux/ipmi.h                |   5 ++
 4 files changed, 155 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt
index 2f99fe6299ab..84d3d4d10c17 100644
--- a/Documentation/IPMI.txt
+++ b/Documentation/IPMI.txt
@@ -594,3 +594,23 @@ an event generator, the event receiver from the local management
 controller will be queried and the events sent to the SEL on that
 device.  Otherwise, the events go nowhere since there is nowhere to
 send them.
+
+
+Poweroff
+--------
+
+If the poweroff capability is selected, the IPMI driver will install
+a shutdown function into the standard poweroff function pointer.  This
+is in the ipmi_poweroff module.  When the system requests a powerdown,
+it will send the proper IPMI commands to do this.  This is supported on
+several platforms.
+
+There is a module parameter named "poweroff_control" that may either be zero
+(do a power down) or 2 (do a power cycle, power the system off, then power
+it on in a few seconds).  Setting ipmi_poweroff.poweroff_control=x will do
+the same thing on the kernel command line.  The parameter is also available
+via the proc filesystem in /proc/ipmi/poweroff_control.  Note that if the
+system does not support power cycling, it will always do the power off.
+
+Note that if you have ACPI enabled, the system will prefer using ACPI to
+power off.
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index ed75e96d0035..1813d0d198f1 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -54,7 +54,9 @@ static int ipmi_init_msghandler(void);
 
 static int initialized = 0;
 
-static struct proc_dir_entry *proc_ipmi_root = NULL;
+#ifdef CONFIG_PROC_FS
+struct proc_dir_entry *proc_ipmi_root = NULL;
+#endif /* CONFIG_PROC_FS */
 
 #define MAX_EVENTS_IN_QUEUE	25
 
@@ -124,11 +126,13 @@ struct ipmi_channel
 	unsigned char protocol;
 };
 
+#ifdef CONFIG_PROC_FS
 struct ipmi_proc_entry
 {
 	char                   *name;
 	struct ipmi_proc_entry *next;
 };
+#endif
 
 #define IPMI_IPMB_NUM_SEQ	64
 #define IPMI_MAX_CHANNELS       8
@@ -156,10 +160,13 @@ struct ipmi_smi
 	struct ipmi_smi_handlers *handlers;
 	void                     *send_info;
 
+#ifdef CONFIG_PROC_FS
 	/* A list of proc entries for this interface.  This does not
 	   need a lock, only one thread creates it and only one thread
 	   destroys it. */
+	spinlock_t             proc_entry_lock;
 	struct ipmi_proc_entry *proc_entries;
+#endif
 
 	/* A table of sequence numbers for this interface.  We use the
            sequence numbers for IPMB messages that go out of the
@@ -1470,8 +1477,9 @@ int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name,
 			    read_proc_t *read_proc, write_proc_t *write_proc,
 			    void *data, struct module *owner)
 {
-	struct proc_dir_entry  *file;
 	int                    rv = 0;
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry  *file;
 	struct ipmi_proc_entry *entry;
 
 	/* Create a list element. */
@@ -1497,10 +1505,13 @@ int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name,
 		file->write_proc = write_proc;
 		file->owner = owner;
 
+		spin_lock(&smi->proc_entry_lock);
 		/* Stick it on the list. */
 		entry->next = smi->proc_entries;
 		smi->proc_entries = entry;
+		spin_unlock(&smi->proc_entry_lock);
 	}
+#endif /* CONFIG_PROC_FS */
 
 	return rv;
 }
@@ -1509,6 +1520,7 @@ static int add_proc_entries(ipmi_smi_t smi, int num)
 {
 	int rv = 0;
 
+#ifdef CONFIG_PROC_FS
 	sprintf(smi->proc_dir_name, "%d", num);
 	smi->proc_dir = proc_mkdir(smi->proc_dir_name, proc_ipmi_root);
 	if (!smi->proc_dir)
@@ -1531,14 +1543,17 @@ static int add_proc_entries(ipmi_smi_t smi, int num)
 		rv = ipmi_smi_add_proc_entry(smi, "version",
 					     version_file_read_proc, NULL,
 					     smi, THIS_MODULE);
+#endif /* CONFIG_PROC_FS */
 
 	return rv;
 }
 
 static void remove_proc_entries(ipmi_smi_t smi)
 {
+#ifdef CONFIG_PROC_FS
 	struct ipmi_proc_entry *entry;
 
+	spin_lock(&smi->proc_entry_lock);
 	while (smi->proc_entries) {
 		entry = smi->proc_entries;
 		smi->proc_entries = entry->next;
@@ -1547,7 +1562,9 @@ static void remove_proc_entries(ipmi_smi_t smi)
 		kfree(entry->name);
 		kfree(entry);
 	}
+	spin_unlock(&smi->proc_entry_lock);
 	remove_proc_entry(smi->proc_dir_name, proc_ipmi_root);
+#endif /* CONFIG_PROC_FS */
 }
 
 static int
@@ -1694,6 +1711,9 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 				new_intf->seq_table[j].seqid = 0;
 			}
 			new_intf->curr_seq = 0;
+#ifdef CONFIG_PROC_FS
+			spin_lock_init(&(new_intf->proc_entry_lock));
+#endif
 			spin_lock_init(&(new_intf->waiting_msgs_lock));
 			INIT_LIST_HEAD(&(new_intf->waiting_msgs));
 			spin_lock_init(&(new_intf->events_lock));
@@ -3085,6 +3105,7 @@ static int ipmi_init_msghandler(void)
 		ipmi_interfaces[i] = NULL;
 	}
 
+#ifdef CONFIG_PROC_FS
 	proc_ipmi_root = proc_mkdir("ipmi", NULL);
 	if (!proc_ipmi_root) {
 	    printk(KERN_ERR PFX "Unable to create IPMI proc dir");
@@ -3092,6 +3113,7 @@ static int ipmi_init_msghandler(void)
 	}
 
 	proc_ipmi_root->owner = THIS_MODULE;
+#endif /* CONFIG_PROC_FS */
 
 	init_timer(&ipmi_timer);
 	ipmi_timer.data = 0;
@@ -3129,7 +3151,9 @@ static __exit void cleanup_ipmi(void)
 	atomic_inc(&stop_operation);
 	del_timer_sync(&ipmi_timer);
 
+#ifdef CONFIG_PROC_FS
 	remove_proc_entry(proc_ipmi_root->name, &proc_root);
+#endif /* CONFIG_PROC_FS */
 
 	initialized = 0;
 
@@ -3170,4 +3194,5 @@ EXPORT_SYMBOL(ipmi_get_my_address);
 EXPORT_SYMBOL(ipmi_set_my_LUN);
 EXPORT_SYMBOL(ipmi_get_my_LUN);
 EXPORT_SYMBOL(ipmi_smi_add_proc_entry);
+EXPORT_SYMBOL(proc_ipmi_root);
 EXPORT_SYMBOL(ipmi_user_set_run_to_completion);
diff --git a/drivers/char/ipmi/ipmi_poweroff.c b/drivers/char/ipmi/ipmi_poweroff.c
index cb5cdc6f14bf..61329b55c4a9 100644
--- a/drivers/char/ipmi/ipmi_poweroff.c
+++ b/drivers/char/ipmi/ipmi_poweroff.c
@@ -34,6 +34,8 @@
 #include <asm/semaphore.h>
 #include <linux/kdev_t.h>
 #include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/proc_fs.h>
 #include <linux/string.h>
 #include <linux/ipmi.h>
 #include <linux/ipmi_smi.h>
@@ -44,6 +46,18 @@
 /* Where to we insert our poweroff function? */
 extern void (*pm_power_off)(void);
 
+/* Definitions for controlling power off (if the system supports it).  It
+ * conveniently matches the IPMI chassis control values. */
+#define IPMI_CHASSIS_POWER_DOWN		0	/* power down, the default. */
+#define IPMI_CHASSIS_POWER_CYCLE	0x02	/* power cycle */
+
+/* the IPMI data command */
+static int poweroff_control = IPMI_CHASSIS_POWER_DOWN;
+
+/* parameter definition to allow user to flag power cycle */
+module_param(poweroff_control, int, IPMI_CHASSIS_POWER_DOWN);
+MODULE_PARM_DESC(poweroff_control, " Set to 2 to enable power cycle instead of power down. Power cycle is contingent on hardware support, otherwise it defaults back to power down.");
+
 /* Stuff from the get device id command. */
 static unsigned int mfg_id;
 static unsigned int prod_id;
@@ -349,26 +363,38 @@ static void ipmi_poweroff_chassis (ipmi_user_t user)
         smi_addr.channel = IPMI_BMC_CHANNEL;
         smi_addr.lun = 0;
 
-	printk(KERN_INFO PFX "Powering down via IPMI chassis control command\n");
+ powercyclefailed:
+	printk(KERN_INFO PFX "Powering %s via IPMI chassis control command\n",
+		((poweroff_control != IPMI_CHASSIS_POWER_CYCLE) ? "down" : "cycle"));
 
 	/*
 	 * Power down
 	 */
 	send_msg.netfn = IPMI_NETFN_CHASSIS_REQUEST;
 	send_msg.cmd = IPMI_CHASSIS_CONTROL_CMD;
-	data[0] = 0; /* Power down */
+	data[0] = poweroff_control;
 	send_msg.data = data;
 	send_msg.data_len = sizeof(data);
 	rv = ipmi_request_in_rc_mode(user,
 				     (struct ipmi_addr *) &smi_addr,
 				     &send_msg);
 	if (rv) {
-		printk(KERN_ERR PFX "Unable to send chassis powerdown message,"
-		       " IPMI error 0x%x\n", rv);
-		goto out;
+		switch (poweroff_control) {
+			case IPMI_CHASSIS_POWER_CYCLE:
+				/* power cycle failed, default to power down */
+				printk(KERN_ERR PFX "Unable to send chassis power " \
+					"cycle message, IPMI error 0x%x\n", rv);
+				poweroff_control = IPMI_CHASSIS_POWER_DOWN;
+				goto powercyclefailed;
+
+			case IPMI_CHASSIS_POWER_DOWN:
+			default:
+				printk(KERN_ERR PFX "Unable to send chassis power " \
+					"down message, IPMI error 0x%x\n", rv);
+				break;
+		}
 	}
 
- out:
 	return;
 }
 
@@ -430,7 +456,8 @@ static void ipmi_po_new_smi(int if_num)
 	if (ready)
 		return;
 
-	rv = ipmi_create_user(if_num, &ipmi_poweroff_handler, NULL, &ipmi_user);
+	rv = ipmi_create_user(if_num, &ipmi_poweroff_handler, NULL,
+			      &ipmi_user);
 	if (rv) {
 		printk(KERN_ERR PFX "could not create IPMI user, error %d\n",
 		       rv);
@@ -509,21 +536,84 @@ static struct ipmi_smi_watcher smi_watcher =
 };
 
 
+#ifdef CONFIG_PROC_FS
+/* displays properties to proc */
+static int proc_read_chassctrl(char *page, char **start, off_t off, int count,
+			       int *eof, void *data)
+{
+	return sprintf(page, "%d\t[ 0=powerdown 2=powercycle ]\n",
+			poweroff_control);
+}
+
+/* process property writes from proc */
+static int proc_write_chassctrl(struct file *file, const char *buffer,
+			        unsigned long count, void *data)
+{
+	int          rv = count;
+	unsigned int newval = 0;
+
+	sscanf(buffer, "%d", &newval);
+	switch (newval) {
+		case IPMI_CHASSIS_POWER_CYCLE:
+			printk(KERN_INFO PFX "power cycle is now enabled\n");
+			poweroff_control = newval;
+			break;
+
+		case IPMI_CHASSIS_POWER_DOWN:
+			poweroff_control = IPMI_CHASSIS_POWER_DOWN;
+			break;
+
+		default:
+			rv = -EINVAL;
+			break;
+	}
+
+	return rv;
+}
+#endif /* CONFIG_PROC_FS */
+
 /*
  * Startup and shutdown functions.
  */
 static int ipmi_poweroff_init (void)
 {
-	int rv;
+	int                   rv;
+	struct proc_dir_entry *file;
 
 	printk ("Copyright (C) 2004 MontaVista Software -"
 		" IPMI Powerdown via sys_reboot version "
 		IPMI_POWEROFF_VERSION ".\n");
 
+	switch (poweroff_control) {
+		case IPMI_CHASSIS_POWER_CYCLE:
+			printk(KERN_INFO PFX "Power cycle is enabled.\n");
+			break;
+
+		case IPMI_CHASSIS_POWER_DOWN:
+		default:
+			poweroff_control = IPMI_CHASSIS_POWER_DOWN;
+			break;
+	}
+
 	rv = ipmi_smi_watcher_register(&smi_watcher);
-	if (rv)
+	if (rv) {
 		printk(KERN_ERR PFX "Unable to register SMI watcher: %d\n", rv);
+		goto out_err;
+	}
+
+#ifdef CONFIG_PROC_FS
+	file = create_proc_entry("poweroff_control", 0, proc_ipmi_root);
+	if (!file) {
+		printk(KERN_ERR PFX "Unable to create proc power control\n");
+	} else {
+		file->nlink = 1;
+		file->read_proc = proc_read_chassctrl;
+		file->write_proc = proc_write_chassctrl;
+		file->owner = THIS_MODULE;
+	}
+#endif
 
+ out_err:
 	return rv;
 }
 
@@ -532,6 +622,10 @@ static __exit void ipmi_poweroff_cleanup(void)
 {
 	int rv;
 
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("poweroff_control", proc_ipmi_root);
+#endif
+
 	ipmi_smi_watcher_unregister(&smi_watcher);
 
 	if (ready) {
diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h
index 2ec265e1045f..596ca6130159 100644
--- a/include/linux/ipmi.h
+++ b/include/linux/ipmi.h
@@ -209,6 +209,11 @@ struct kernel_ipmi_msg
 #include <linux/list.h>
 #include <linux/module.h>
 
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+extern struct proc_dir_entry *proc_ipmi_root;
+#endif /* CONFIG_PROC_FS */
+
 /* Opaque type for a IPMI message user.  One of these is needed to
    send and receive messages. */
 typedef struct ipmi_user *ipmi_user_t;
-- 
cgit v1.2.3-59-g8ed1b


From a6df7da8f7ee99e6fd1995fad852bacb978a6447 Mon Sep 17 00:00:00 2001
From: Kylene Hall <kjhall@us.ibm.com>
Date: Thu, 23 Jun 2005 22:02:04 -0700
Subject: [PATCH] tpm: TPMs on additional LPC bus

Add support for TPMs on additional LPC buses.

Signed-off-by: Kylene Hall <kjhall@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/tpm/tpm_atmel.c | 1 +
 include/linux/pci_ids.h      | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/char/tpm/tpm_atmel.c b/drivers/char/tpm/tpm_atmel.c
index 68974577a6a6..13248400b9c3 100644
--- a/drivers/char/tpm/tpm_atmel.c
+++ b/drivers/char/tpm/tpm_atmel.c
@@ -205,6 +205,7 @@ static struct pci_device_id tpm_pci_tbl[] __devinitdata = {
 	{PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801DB_12)},
 	{PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801EB_0)},
 	{PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_8111_LPC)},
+	{PCI_DEVICE(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_CSB6LPC)},
 	{0,}
 };
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 63e89e47b8e9..bf608808a60c 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1568,6 +1568,7 @@
 #define PCI_DEVICE_ID_SERVERWORKS_OSB4USB 0x0220
 #define PCI_DEVICE_ID_SERVERWORKS_CSB5USB PCI_DEVICE_ID_SERVERWORKS_OSB4USB
 #define PCI_DEVICE_ID_SERVERWORKS_CSB6USB 0x0221
+#define PCI_DEVICE_ID_SERVERWORKS_CSB6LPC 0x0227
 #define PCI_DEVICE_ID_SERVERWORKS_GCLE    0x0225
 #define PCI_DEVICE_ID_SERVERWORKS_GCLE2   0x0227
 #define PCI_DEVICE_ID_SERVERWORKS_CSB5ISA 0x0230
-- 
cgit v1.2.3-59-g8ed1b


From 61fbfa8129c1771061a0e9f47747854293081c5b Mon Sep 17 00:00:00 2001
From: Markus Lidel <Markus.Lidel@shadowconnect.com>
Date: Thu, 23 Jun 2005 22:02:11 -0700
Subject: [PATCH] I2O: bugfixes and compatibility enhancements

Changes:

 - Fixed sysfs bug where user and parent links were added to the I2O
   device itself
 - Fixed bug when calculating TID for the event handler and cleaned up the
   workflow of i2o_driver_dispatch()
 - Fixed oops when no I2O device could be found for an event delivered to
   Exec-OSM
 - Fixed initialization of spinlock in Exec-OSM
 - Fixed memory leak in i2o_cfg_passthru32() and i2o_cfg_passthru()
 - Removed MTRR support
 - Added PCI ID of Promise SX6000 with firmware >= 1.20.x.x
 - Turn off caching for ioremapped memory of in_queue
 - Added initialization sequence for Promise controllers
 - Moved definition of u8 / u16 / u32 for raidutils before first use

Signed-off-by: Markus Lidel <Markus.Lidel@shadowconnect.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/message/i2o/device.c     | 10 +++--
 drivers/message/i2o/driver.c     | 89 +++++++++++++++++++-------------------
 drivers/message/i2o/exec-osm.c   |  9 ++--
 drivers/message/i2o/i2o_config.c | 48 +++++++++++++--------
 drivers/message/i2o/i2o_scsi.c   |  3 +-
 drivers/message/i2o/pci.c        | 93 ++++++++++++++--------------------------
 include/linux/i2o-dev.h          | 16 +++----
 include/linux/i2o.h              |  5 ---
 8 files changed, 124 insertions(+), 149 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
index eb907e87bc7b..280627ae6cf7 100644
--- a/drivers/message/i2o/device.c
+++ b/drivers/message/i2o/device.c
@@ -401,25 +401,27 @@ static int i2o_device_class_add(struct class_device *cd)
 
 	/* create user entries for this device */
 	tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.user_tid);
-	if (tmp)
+	if (tmp && (tmp != i2o_dev))
 		sysfs_create_link(&i2o_dev->device.kobj, &tmp->device.kobj,
 				  "user");
 
 	/* create user entries refering to this device */
 	list_for_each_entry(tmp, &c->devices, list)
-	    if (tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
+	    if ((tmp->lct_data.user_tid == i2o_dev->lct_data.tid)
+		&& (tmp != i2o_dev))
 		sysfs_create_link(&tmp->device.kobj,
 				  &i2o_dev->device.kobj, "user");
 
 	/* create parent entries for this device */
 	tmp = i2o_iop_find_device(i2o_dev->iop, i2o_dev->lct_data.parent_tid);
-	if (tmp)
+	if (tmp && (tmp != i2o_dev))
 		sysfs_create_link(&i2o_dev->device.kobj, &tmp->device.kobj,
 				  "parent");
 
 	/* create parent entries refering to this device */
 	list_for_each_entry(tmp, &c->devices, list)
-	    if (tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
+	    if ((tmp->lct_data.parent_tid == i2o_dev->lct_data.tid)
+		&& (tmp != i2o_dev))
 		sysfs_create_link(&tmp->device.kobj,
 				  &i2o_dev->device.kobj, "parent");
 
diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c
index 91f4edbb2a27..c71e68f70e7d 100644
--- a/drivers/message/i2o/driver.c
+++ b/drivers/message/i2o/driver.c
@@ -18,6 +18,8 @@
 #include <linux/rwsem.h>
 #include <linux/i2o.h>
 
+#define OSM_NAME	"core"
+
 /* max_drivers - Maximum I2O drivers (OSMs) which could be registered */
 unsigned int i2o_max_drivers = I2O_MAX_DRIVERS;
 module_param_named(max_drivers, i2o_max_drivers, uint, 0);
@@ -182,62 +184,59 @@ int i2o_driver_dispatch(struct i2o_controller *c, u32 m,
 	struct i2o_driver *drv;
 	u32 context = readl(&msg->u.s.icntxt);
 
-	if (likely(context < i2o_max_drivers)) {
-		spin_lock(&i2o_drivers_lock);
-		drv = i2o_drivers[context];
-		spin_unlock(&i2o_drivers_lock);
-
-		if (unlikely(!drv)) {
-			printk(KERN_WARNING "%s: Spurious reply to unknown "
-			       "driver %d\n", c->name, context);
-			return -EIO;
-		}
+	if (unlikely(context >= i2o_max_drivers)) {
+		printk(KERN_WARNING "%s: Spurious reply to unknown driver "
+		       "%d\n", c->name, readl(&msg->u.s.icntxt));
+		return -EIO;
+	}
 
-		if ((readl(&msg->u.head[1]) >> 24) == I2O_CMD_UTIL_EVT_REGISTER) {
-			struct i2o_device *dev, *tmp;
-			struct i2o_event *evt;
-			u16 size;
-			u16 tid;
+	spin_lock(&i2o_drivers_lock);
+	drv = i2o_drivers[context];
+	spin_unlock(&i2o_drivers_lock);
 
-			tid = readl(&msg->u.head[1]) & 0x1fff;
+	if (unlikely(!drv)) {
+		osm_warn("Spurious reply to unknown driver %d\n", context);
+		return -EIO;
+	}
 
-			pr_debug("%s: event received from device %d\n", c->name,
-				 tid);
+	if ((readl(&msg->u.head[1]) >> 24) == I2O_CMD_UTIL_EVT_REGISTER) {
+		struct i2o_device *dev, *tmp;
+		struct i2o_event *evt;
+		u16 size;
+		u16 tid = readl(&msg->u.head[1]) & 0xfff;
 
-			/* cut of header from message size (in 32-bit words) */
-			size = (readl(&msg->u.head[0]) >> 16) - 5;
+		osm_debug("event received from device %d\n", tid);
 
-			evt = kmalloc(size * 4 + sizeof(*evt), GFP_ATOMIC);
-			if (!evt)
-				return -ENOMEM;
-			memset(evt, 0, size * 4 + sizeof(*evt));
+		/* cut of header from message size (in 32-bit words) */
+		size = (readl(&msg->u.head[0]) >> 16) - 5;
 
-			evt->size = size;
-			memcpy_fromio(&evt->tcntxt, &msg->u.s.tcntxt,
-				      (size + 2) * 4);
+		evt = kmalloc(size * 4 + sizeof(*evt), GFP_ATOMIC | __GFP_ZERO);
+		if (!evt)
+			return -ENOMEM;
 
-			list_for_each_entry_safe(dev, tmp, &c->devices, list)
-			    if (dev->lct_data.tid == tid) {
-				evt->i2o_dev = dev;
-				break;
-			}
+		evt->size = size;
+		evt->tcntxt = readl(&msg->u.s.tcntxt);
+		evt->event_indicator = readl(&msg->body[0]);
+		memcpy_fromio(&evt->tcntxt, &msg->u.s.tcntxt, size * 4);
 
-			INIT_WORK(&evt->work, (void (*)(void *))drv->event,
-				  evt);
-			queue_work(drv->event_queue, &evt->work);
-			return 1;
+		list_for_each_entry_safe(dev, tmp, &c->devices, list)
+		    if (dev->lct_data.tid == tid) {
+			evt->i2o_dev = dev;
+			break;
 		}
 
-		if (likely(drv->reply))
-			return drv->reply(c, m, msg);
-		else
-			pr_debug("%s: Reply to driver %s, but no reply function"
-				 " defined!\n", c->name, drv->name);
+		INIT_WORK(&evt->work, (void (*)(void *))drv->event, evt);
+		queue_work(drv->event_queue, &evt->work);
+		return 1;
+	}
+
+	if (unlikely(!drv->reply)) {
+		pr_debug("%s: Reply to driver %s, but no reply function"
+			 " defined!\n", c->name, drv->name);
 		return -EIO;
-	} else
-		printk(KERN_WARNING "%s: Spurious reply to unknown driver "
-		       "%d\n", c->name, readl(&msg->u.s.icntxt));
-	return -EIO;
+	}
+
+	return drv->reply(c, m, msg);
 }
 
 /**
diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c
index 79c1cbfb8f44..1e28e886f1ca 100644
--- a/drivers/message/i2o/exec-osm.c
+++ b/drivers/message/i2o/exec-osm.c
@@ -204,12 +204,10 @@ static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
 				      struct i2o_message __iomem *msg)
 {
 	struct i2o_exec_wait *wait, *tmp;
-	static spinlock_t lock;
+	static spinlock_t lock = SPIN_LOCK_UNLOCKED;
 	int rc = 1;
 	u32 context;
 
-	spin_lock_init(&lock);
-
 	context = readl(&msg->u.s.tcntxt);
 
 	/*
@@ -381,8 +379,9 @@ static int i2o_exec_reply(struct i2o_controller *c, u32 m,
  */
 static void i2o_exec_event(struct i2o_event *evt)
 {
-	osm_info("Event received from device: %d\n",
-		 evt->i2o_dev->lct_data.tid);
+	if(likely(evt->i2o_dev))
+		osm_info("Event received from device: %d\n",
+			 evt->i2o_dev->lct_data.tid);
 	kfree(evt);
 };
 
diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c
index 1fb5cdf67f8f..46d373287a30 100644
--- a/drivers/message/i2o/i2o_config.c
+++ b/drivers/message/i2o/i2o_config.c
@@ -555,6 +555,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, unsigned long ar
 	u32 sg_offset = 0;
 	u32 sg_count = 0;
 	u32 i = 0;
+	u32 sg_index = 0;
 	i2o_status_block *sb;
 	struct i2o_message *msg;
 	u32 m;
@@ -634,8 +635,8 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, unsigned long ar
 		if (sg_count > SG_TABLESIZE) {
 			printk(KERN_DEBUG "%s:IOCTL SG List too large (%u)\n",
 			       c->name, sg_count);
-			kfree(reply);
-			return -EINVAL;
+			rcode = -EINVAL;
+			goto cleanup;
 		}
 
 		for (i = 0; i < sg_count; i++) {
@@ -651,7 +652,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, unsigned long ar
 				goto cleanup;
 			}
 			sg_size = sg[i].flag_count & 0xffffff;
-			p = &(sg_list[i]);
+			p = &(sg_list[sg_index++]);
 			/* Allocate memory for the transfer */
 			if (i2o_dma_alloc
 			    (&c->pdev->dev, p, sg_size,
@@ -660,7 +661,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, unsigned long ar
 				       "%s: Could not allocate SG buffer - size = %d buffer number %d of %d\n",
 				       c->name, sg_size, i, sg_count);
 				rcode = -ENOMEM;
-				goto cleanup;
+				goto sg_list_cleanup;
 			}
 			/* Copy in the user's SG buffer if necessary */
 			if (sg[i].
@@ -673,7 +674,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, unsigned long ar
 					       "%s: Could not copy SG buf %d FROM user\n",
 					       c->name, i);
 					rcode = -EFAULT;
-					goto cleanup;
+					goto sg_list_cleanup;
 				}
 			}
 			//TODO 64bit fix
@@ -683,10 +684,10 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, unsigned long ar
 
 	rcode = i2o_msg_post_wait(c, m, 60);
 	if (rcode)
-		goto cleanup;
+		goto sg_list_cleanup;
 
 	if (sg_offset) {
-		u32 msg[128];
+		u32 msg[MSG_FRAME_SIZE];
 		/* Copy back the Scatter Gather buffers back to user space */
 		u32 j;
 		// TODO 64bit fix
@@ -698,14 +699,14 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, unsigned long ar
 		// get user msg size in u32s
 		if (get_user(size, &user_msg[0])) {
 			rcode = -EFAULT;
-			goto cleanup;
+			goto sg_list_cleanup;
 		}
 		size = size >> 16;
 		size *= 4;
 		/* Copy in the user's I2O command */
 		if (copy_from_user(msg, user_msg, size)) {
 			rcode = -EFAULT;
-			goto cleanup;
+			goto sg_list_cleanup;
 		}
 		sg_count =
 		    (size - sg_offset * 4) / sizeof(struct sg_simple_element);
@@ -727,7 +728,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, unsigned long ar
 					       c->name, sg_list[j].virt,
 					       sg[j].addr_bus);
 					rcode = -EFAULT;
-					goto cleanup;
+					goto sg_list_cleanup;
 				}
 			}
 		}
@@ -741,6 +742,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, unsigned long ar
 			       "%s: Could not copy message context FROM user\n",
 			       c->name);
 			rcode = -EFAULT;
+			goto sg_list_cleanup;
 		}
 		if (copy_to_user(user_reply, reply, reply_size)) {
 			printk(KERN_WARNING
@@ -749,6 +751,10 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, unsigned long ar
 		}
 	}
 
+      sg_list_cleanup:
+	for (i = 0; i < sg_index; i++)
+		i2o_dma_free(&c->pdev->dev, &sg_list[i]);
+
       cleanup:
 	kfree(reply);
 	return rcode;
@@ -862,8 +868,8 @@ static int i2o_cfg_passthru(unsigned long arg)
 		if (sg_count > SG_TABLESIZE) {
 			printk(KERN_DEBUG "%s:IOCTL SG List too large (%u)\n",
 			       c->name, sg_count);
-			kfree(reply);
-			return -EINVAL;
+			rcode = -EINVAL;
+			goto cleanup;
 		}
 
 		for (i = 0; i < sg_count; i++) {
@@ -875,7 +881,7 @@ static int i2o_cfg_passthru(unsigned long arg)
 				       "%s:Bad SG element %d - not simple (%x)\n",
 				       c->name, i, sg[i].flag_count);
 				rcode = -EINVAL;
-				goto cleanup;
+				goto sg_list_cleanup;
 			}
 			sg_size = sg[i].flag_count & 0xffffff;
 			/* Allocate memory for the transfer */
@@ -885,7 +891,7 @@ static int i2o_cfg_passthru(unsigned long arg)
 				       "%s: Could not allocate SG buffer - size = %d buffer number %d of %d\n",
 				       c->name, sg_size, i, sg_count);
 				rcode = -ENOMEM;
-				goto cleanup;
+				goto sg_list_cleanup;
 			}
 			sg_list[sg_index++] = p;	// sglist indexed with input frame, not our internal frame.
 			/* Copy in the user's SG buffer if necessary */
@@ -899,7 +905,7 @@ static int i2o_cfg_passthru(unsigned long arg)
 					       "%s: Could not copy SG buf %d FROM user\n",
 					       c->name, i);
 					rcode = -EFAULT;
-					goto cleanup;
+					goto sg_list_cleanup;
 				}
 			}
 			//TODO 64bit fix
@@ -909,7 +915,7 @@ static int i2o_cfg_passthru(unsigned long arg)
 
 	rcode = i2o_msg_post_wait(c, m, 60);
 	if (rcode)
-		goto cleanup;
+		goto sg_list_cleanup;
 
 	if (sg_offset) {
 		u32 msg[128];
@@ -924,14 +930,14 @@ static int i2o_cfg_passthru(unsigned long arg)
 		// get user msg size in u32s
 		if (get_user(size, &user_msg[0])) {
 			rcode = -EFAULT;
-			goto cleanup;
+			goto sg_list_cleanup;
 		}
 		size = size >> 16;
 		size *= 4;
 		/* Copy in the user's I2O command */
 		if (copy_from_user(msg, user_msg, size)) {
 			rcode = -EFAULT;
-			goto cleanup;
+			goto sg_list_cleanup;
 		}
 		sg_count =
 		    (size - sg_offset * 4) / sizeof(struct sg_simple_element);
@@ -953,7 +959,7 @@ static int i2o_cfg_passthru(unsigned long arg)
 					       c->name, sg_list[j],
 					       sg[j].addr_bus);
 					rcode = -EFAULT;
-					goto cleanup;
+					goto sg_list_cleanup;
 				}
 			}
 		}
@@ -975,6 +981,10 @@ static int i2o_cfg_passthru(unsigned long arg)
 		}
 	}
 
+      sg_list_cleanup:
+	for (i = 0; i < sg_index; i++)
+		kfree(sg_list[i]);
+
       cleanup:
 	kfree(reply);
 	return rcode;
diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c
index 43f5875e0be5..af40f1c1ec77 100644
--- a/drivers/message/i2o/i2o_scsi.c
+++ b/drivers/message/i2o/i2o_scsi.c
@@ -103,7 +103,8 @@ static struct i2o_scsi_host *i2o_scsi_host_alloc(struct i2o_controller *c)
 
 	list_for_each_entry(i2o_dev, &c->devices, list)
 	    if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER_PORT) {
-		if (i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1) || (type == 1))	/* SCSI bus */
+		if (i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1)
+		   && (type == 0x01))	/* SCSI bus */
 			max_channel++;
 	}
 
diff --git a/drivers/message/i2o/pci.c b/drivers/message/i2o/pci.c
index e772752f056d..579a8b7a2120 100644
--- a/drivers/message/i2o/pci.c
+++ b/drivers/message/i2o/pci.c
@@ -31,10 +31,6 @@
 #include <linux/interrupt.h>
 #include <linux/i2o.h>
 
-#ifdef CONFIG_MTRR
-#include <asm/mtrr.h>
-#endif				// CONFIG_MTRR
-
 /* Module internal functions from other sources */
 extern struct i2o_controller *i2o_iop_alloc(void);
 extern void i2o_iop_free(struct i2o_controller *);
@@ -49,6 +45,8 @@ extern int i2o_driver_dispatch(struct i2o_controller *, u32,
 static struct pci_device_id __devinitdata i2o_pci_ids[] = {
 	{PCI_DEVICE_CLASS(PCI_CLASS_INTELLIGENT_I2O << 8, 0xffff00)},
 	{PCI_DEVICE(PCI_VENDOR_ID_DPT, 0xa511)},
+	{.vendor = PCI_VENDOR_ID_INTEL,.device = 0x1962,
+	 .subvendor = PCI_VENDOR_ID_PROMISE,.subdevice = PCI_ANY_ID},
 	{0}
 };
 
@@ -97,13 +95,6 @@ static void i2o_pci_free(struct i2o_controller *c)
 	i2o_dma_free(dev, &c->hrt);
 	i2o_dma_free(dev, &c->status);
 
-#ifdef CONFIG_MTRR
-	if (c->mtrr_reg0 >= 0)
-		mtrr_del(c->mtrr_reg0, 0, 0);
-	if (c->mtrr_reg1 >= 0)
-		mtrr_del(c->mtrr_reg1, 0, 0);
-#endif
-
 	if (c->raptor && c->in_queue.virt)
 		iounmap(c->in_queue.virt);
 
@@ -178,14 +169,15 @@ static int __devinit i2o_pci_alloc(struct i2o_controller *c)
 		       c->name, (unsigned long)c->base.phys,
 		       (unsigned long)c->base.len);
 
-	c->base.virt = ioremap(c->base.phys, c->base.len);
+	c->base.virt = ioremap_nocache(c->base.phys, c->base.len);
 	if (!c->base.virt) {
 		printk(KERN_ERR "%s: Unable to map controller.\n", c->name);
 		return -ENOMEM;
 	}
 
 	if (c->raptor) {
-		c->in_queue.virt = ioremap(c->in_queue.phys, c->in_queue.len);
+		c->in_queue.virt =
+		    ioremap_nocache(c->in_queue.phys, c->in_queue.len);
 		if (!c->in_queue.virt) {
 			printk(KERN_ERR "%s: Unable to map controller.\n",
 			       c->name);
@@ -199,40 +191,6 @@ static int __devinit i2o_pci_alloc(struct i2o_controller *c)
 	c->post_port = c->base.virt + 0x40;
 	c->reply_port = c->base.virt + 0x44;
 
-#ifdef CONFIG_MTRR
-	/* Enable Write Combining MTRR for IOP's memory region */
-	c->mtrr_reg0 = mtrr_add(c->in_queue.phys, c->in_queue.len,
-				MTRR_TYPE_WRCOMB, 1);
-	c->mtrr_reg1 = -1;
-
-	if (c->mtrr_reg0 < 0)
-		printk(KERN_WARNING "%s: could not enable write combining "
-		       "MTRR\n", c->name);
-	else
-		printk(KERN_INFO "%s: using write combining MTRR\n", c->name);
-
-	/*
-	 * If it is an INTEL i960 I/O processor then set the first 64K to
-	 * Uncacheable since the region contains the messaging unit which
-	 * shouldn't be cached.
-	 */
-	if ((pdev->vendor == PCI_VENDOR_ID_INTEL ||
-	     pdev->vendor == PCI_VENDOR_ID_DPT) && !c->raptor) {
-		printk(KERN_INFO "%s: MTRR workaround for Intel i960 processor"
-		       "\n", c->name);
-		c->mtrr_reg1 = mtrr_add(c->base.phys, 0x10000,
-					MTRR_TYPE_UNCACHABLE, 1);
-
-		if (c->mtrr_reg1 < 0) {
-			printk(KERN_WARNING "%s: Error in setting "
-			       "MTRR_TYPE_UNCACHABLE\n", c->name);
-			mtrr_del(c->mtrr_reg0, c->in_queue.phys,
-				 c->in_queue.len);
-			c->mtrr_reg0 = -1;
-		}
-	}
-#endif
-
 	if (i2o_dma_alloc(dev, &c->status, 8, GFP_KERNEL)) {
 		i2o_pci_free(c);
 		return -ENOMEM;
@@ -385,28 +343,25 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
 {
 	struct i2o_controller *c;
 	int rc;
+	struct pci_dev *i960 = NULL;
 
 	printk(KERN_INFO "i2o: Checking for PCI I2O controllers...\n");
 
 	if ((pdev->class & 0xff) > 1) {
-		printk(KERN_WARNING "i2o: I2O controller found but does not "
-		       "support I2O 1.5 (skipping).\n");
+		printk(KERN_WARNING "i2o: %s does not support I2O 1.5 "
+		       "(skipping).\n", pci_name(pdev));
 		return -ENODEV;
 	}
 
 	if ((rc = pci_enable_device(pdev))) {
-		printk(KERN_WARNING "i2o: I2O controller found but could not be"
-		       " enabled.\n");
+		printk(KERN_WARNING "i2o: couldn't enable device %s\n",
+		       pci_name(pdev));
 		return rc;
 	}
 
-	printk(KERN_INFO "i2o: I2O controller found on bus %d at %d.\n",
-	       pdev->bus->number, pdev->devfn);
-
 	if (pci_set_dma_mask(pdev, DMA_32BIT_MASK)) {
-		printk(KERN_WARNING "i2o: I2O controller on bus %d at %d: No "
-		       "suitable DMA available!\n", pdev->bus->number,
-		       pdev->devfn);
+		printk(KERN_WARNING "i2o: no suitable DMA found for %s\n",
+		       pci_name(pdev));
 		rc = -ENODEV;
 		goto disable;
 	}
@@ -415,11 +370,13 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
 
 	c = i2o_iop_alloc();
 	if (IS_ERR(c)) {
-		printk(KERN_ERR "i2o: memory for I2O controller could not be "
-		       "allocated\n");
+		printk(KERN_ERR "i2o: couldn't allocate memory for %s\n",
+		       pci_name(pdev));
 		rc = PTR_ERR(c);
 		goto disable;
-	}
+	} else
+		printk(KERN_INFO "%s: controller found (%s)\n", c->name,
+		       pci_name(pdev));
 
 	c->pdev = pdev;
 	c->device = pdev->dev;
@@ -432,9 +389,18 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
 	}
 
 	if (pdev->subsystem_vendor == PCI_VENDOR_ID_PROMISE) {
+		/*
+		 * Expose the chip behind the i960 for initialization, or it
+		 * will fail
+		 */
+		i960 =
+		    pci_find_slot(c->pdev->bus->number,
+				  PCI_DEVFN(PCI_SLOT(c->pdev->devfn), 0));
+
+		if (i960)
+			pci_write_config_word(i960, 0x42, 0);
+
 		c->promise = 1;
-		printk(KERN_INFO "%s: Promise workarounds activated.\n",
-		       c->name);
 	}
 
 	/* Cards that go bananas if you quiesce them before you reset them. */
@@ -459,6 +425,9 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
 	if ((rc = i2o_iop_add(c)))
 		goto uninstall;
 
+	if (i960)
+		pci_write_config_word(i960, 0x42, 0x03ff);
+
 	return 0;
 
       uninstall:
diff --git a/include/linux/i2o-dev.h b/include/linux/i2o-dev.h
index ef7f644dd873..3414325bdcfd 100644
--- a/include/linux/i2o-dev.h
+++ b/include/linux/i2o-dev.h
@@ -24,6 +24,14 @@
 #define MAX_I2O_CONTROLLERS	32
 
 //#include <linux/ioctl.h>
+#ifndef __KERNEL__
+
+typedef unsigned char u8;
+typedef unsigned short u16;
+typedef unsigned int u32;
+
+#endif				/* __KERNEL__ */
+
 
 /*
  * I2O Control IOCTLs and structures
@@ -126,14 +134,6 @@ struct i2o_evt_get {
 #define I2O_BUS_CARDBUS 7
 #define I2O_BUS_UNKNOWN 0x80
 
-#ifndef __KERNEL__
-
-typedef unsigned char u8;
-typedef unsigned short u16;
-typedef unsigned int u32;
-
-#endif				/* __KERNEL__ */
-
 typedef struct _i2o_pci_bus {
 	u8 PciFunctionNumber;
 	u8 PciDeviceNumber;
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index ea9a3ad4b67f..40e45a83d3fb 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -152,11 +152,6 @@ struct i2o_controller {
 	unsigned int raptor:1;		/* split bar */
 	unsigned int promise:1;		/* Promise controller */
 
-#ifdef CONFIG_MTRR
-	int mtrr_reg0;
-	int mtrr_reg1;
-#endif
-
 	struct list_head devices;	/* list of I2O devices */
 
 	struct notifier_block *event_notifer;	/* Events */
-- 
cgit v1.2.3-59-g8ed1b


From f88e119c4b824a5017456fa094950d0f4092d96c Mon Sep 17 00:00:00 2001
From: Markus Lidel <Markus.Lidel@shadowconnect.com>
Date: Thu, 23 Jun 2005 22:02:14 -0700
Subject: [PATCH] I2O: first code cleanup of sparse warnings and unused
 functions

Changes:

 - Removed unnecessary checking of NULL before calling kfree()
 - Make some functions static
 - Changed pr_debug() into osm_debug()
 - Use i2o_msg_in_to_virt() for getting a pointer to the message frame
 - Cleaned up some comments
 - Changed some le32_to_cpu() into readl() where necessary
 - Make error messages of OSM's look the same
 - Cleaned up error handling in i2o_block_end_request()
 - Removed unused error handling of failed messages in Block-OSM, which
   are not allowed by the I2O spec
 - Corrected the blocksize detection in i2o_block
 - Added hrt and lct sysfs-attribute to controller
 - Call done() function in SCSI-OSM after freeing DMA buffers
 - Removed unneeded variable for message size calculation in
   i2o_scsi_queuecommand()
 - Make some changes to remove sparse warnings
 - Reordered some functions
 - Cleaned up controller initialization
 - Replaced some magic numbers by defines
 - Removed unnecessary dma_sync_single_for_cpu() call on coherent DMA
 - Removed some unused fields in i2o_controller and removed some unused
   functions

Signed-off-by: Markus Lidel <Markus.Lidel@shadowconnect.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/message/i2o/device.c     |   9 +-
 drivers/message/i2o/driver.c     |  46 +++++----
 drivers/message/i2o/exec-osm.c   |  47 +++++----
 drivers/message/i2o/i2o_block.c  | 211 +++++++++++++--------------------------
 drivers/message/i2o/i2o_block.h  |   2 +-
 drivers/message/i2o/i2o_config.c | 118 +++++++++++++++++++++-
 drivers/message/i2o/i2o_scsi.c   |  31 +++---
 drivers/message/i2o/iop.c        |  87 ++++++++++------
 drivers/message/i2o/pci.c        |  67 ++++++-------
 include/linux/i2o.h              |  74 +++-----------
 10 files changed, 356 insertions(+), 336 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
index 280627ae6cf7..f1b7eb63d54b 100644
--- a/drivers/message/i2o/device.c
+++ b/drivers/message/i2o/device.c
@@ -282,8 +282,7 @@ int i2o_device_parse_lct(struct i2o_controller *c)
 
 	down(&c->lct_lock);
 
-	if (c->lct)
-		kfree(c->lct);
+	kfree(c->lct);
 
 	lct = c->dlct.virt;
 
@@ -447,8 +446,8 @@ static struct class_interface i2o_device_class_interface = {
  *	ResultCount, ErrorInfoSize, BlockStatus and BlockSize.
  */
 
-int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
-		   int oplen, void *reslist, int reslen)
+static int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
+			  int oplen, void *reslist, int reslen)
 {
 	struct i2o_message __iomem *msg;
 	u32 m;
@@ -540,7 +539,7 @@ int i2o_parm_field_get(struct i2o_device *i2o_dev, int group, int field,
 		opblk[4] = -1;
 
 	size = i2o_parm_issue(i2o_dev, I2O_CMD_UTIL_PARAMS_GET, opblk,
-			      sizeof(opblk), resblk, sizeof(resblk));
+			      sizeof(opblk), resblk, buflen + 8);
 
 	memcpy(buf, resblk + 8, buflen);	/* cut off header */
 
diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c
index c71e68f70e7d..bebdd509b5d8 100644
--- a/drivers/message/i2o/driver.c
+++ b/drivers/message/i2o/driver.c
@@ -18,7 +18,7 @@
 #include <linux/rwsem.h>
 #include <linux/i2o.h>
 
-#define OSM_NAME	"core"
+#define OSM_NAME	"i2o"
 
 /* max_drivers - Maximum I2O drivers (OSMs) which could be registered */
 unsigned int i2o_max_drivers = I2O_MAX_DRIVERS;
@@ -78,17 +78,16 @@ int i2o_driver_register(struct i2o_driver *drv)
 	int rc = 0;
 	unsigned long flags;
 
-	pr_debug("i2o: Register driver %s\n", drv->name);
+	osm_debug("Register driver %s\n", drv->name);
 
 	if (drv->event) {
 		drv->event_queue = create_workqueue(drv->name);
 		if (!drv->event_queue) {
-			printk(KERN_ERR "i2o: Could not initialize event queue "
-			       "for driver %s\n", drv->name);
+			osm_err("Could not initialize event queue for driver "
+				"%s\n", drv->name);
 			return -EFAULT;
 		}
-		pr_debug("i2o: Event queue initialized for driver %s\n",
-			 drv->name);
+		osm_debug("Event queue initialized for driver %s\n", drv->name);
 	} else
 		drv->event_queue = NULL;
 
@@ -99,8 +98,8 @@ int i2o_driver_register(struct i2o_driver *drv)
 
 	for (i = 0; i2o_drivers[i]; i++)
 		if (i >= i2o_max_drivers) {
-			printk(KERN_ERR "i2o: too many drivers registered, "
-			       "increase max_drivers\n");
+			osm_err("too many drivers registered, increase "
+				"max_drivers\n");
 			spin_unlock_irqrestore(&i2o_drivers_lock, flags);
 			return -EFAULT;
 		}
@@ -110,8 +109,7 @@ int i2o_driver_register(struct i2o_driver *drv)
 
 	spin_unlock_irqrestore(&i2o_drivers_lock, flags);
 
-	pr_debug("i2o: driver %s gets context id %d\n", drv->name,
-		 drv->context);
+	osm_debug("driver %s gets context id %d\n", drv->name, drv->context);
 
 	list_for_each_entry(c, &i2o_controllers, list) {
 		struct i2o_device *i2o_dev;
@@ -141,7 +139,7 @@ void i2o_driver_unregister(struct i2o_driver *drv)
 	struct i2o_controller *c;
 	unsigned long flags;
 
-	pr_debug("i2o: unregister driver %s\n", drv->name);
+	osm_debug("unregister driver %s\n", drv->name);
 
 	driver_unregister(&drv->driver);
 
@@ -161,7 +159,7 @@ void i2o_driver_unregister(struct i2o_driver *drv)
 	if (drv->event_queue) {
 		destroy_workqueue(drv->event_queue);
 		drv->event_queue = NULL;
-		pr_debug("i2o: event queue removed for %s\n", drv->name);
+		osm_debug("event queue removed for %s\n", drv->name);
 	}
 };
 
@@ -178,15 +176,15 @@ void i2o_driver_unregister(struct i2o_driver *drv)
  *	on success and if the message should be flushed afterwords. Returns
  *	negative error code on failure (the message will be flushed too).
  */
-int i2o_driver_dispatch(struct i2o_controller *c, u32 m,
-			struct i2o_message __iomem *msg)
+int i2o_driver_dispatch(struct i2o_controller *c, u32 m)
 {
 	struct i2o_driver *drv;
+	struct i2o_message __iomem *msg = i2o_msg_out_to_virt(c, m);
 	u32 context = readl(&msg->u.s.icntxt);
 
 	if (unlikely(context >= i2o_max_drivers)) {
-		printk(KERN_WARNING "%s: Spurious reply to unknown driver "
-		       "%d\n", c->name, readl(&msg->u.s.icntxt));
+		osm_warn("%s: Spurious reply to unknown driver %d\n", c->name,
+			 context);
 		return -EIO;
 	}
 
@@ -195,7 +193,8 @@ int i2o_driver_dispatch(struct i2o_controller *c, u32 m,
 	spin_unlock(&i2o_drivers_lock);
 
 	if (unlikely(!drv)) {
-		osm_warn("Spurious reply to unknown driver %d\n", context);
+		osm_warn("%s: Spurious reply to unknown driver %d\n", c->name,
+			 context);
 		return -EIO;
 	}
 
@@ -207,6 +206,9 @@ int i2o_driver_dispatch(struct i2o_controller *c, u32 m,
 
 		osm_debug("event received from device %d\n", tid);
 
+		if (!drv->event)
+			return -EIO;
+
 		/* cut of header from message size (in 32-bit words) */
 		size = (readl(&msg->u.head[0]) >> 16) - 5;
 
@@ -231,8 +233,8 @@ int i2o_driver_dispatch(struct i2o_controller *c, u32 m,
 	}
 
 	if (unlikely(!drv->reply)) {
-		pr_debug("%s: Reply to driver %s, but no reply function"
-			 " defined!\n", c->name, drv->name);
+		osm_debug("%s: Reply to driver %s, but no reply function"
+			  " defined!\n", c->name, drv->name);
 		return -EIO;
 	}
 
@@ -333,11 +335,11 @@ int __init i2o_driver_init(void)
 	if ((i2o_max_drivers < 2) || (i2o_max_drivers > 64) ||
 	    ((i2o_max_drivers ^ (i2o_max_drivers - 1)) !=
 	     (2 * i2o_max_drivers - 1))) {
-		printk(KERN_WARNING "i2o: max_drivers set to %d, but must be "
-		       ">=2 and <= 64 and a power of 2\n", i2o_max_drivers);
+		osm_warn("max_drivers set to %d, but must be >=2 and <= 64 and "
+			 "a power of 2\n", i2o_max_drivers);
 		i2o_max_drivers = I2O_MAX_DRIVERS;
 	}
-	printk(KERN_INFO "i2o: max drivers = %d\n", i2o_max_drivers);
+	osm_info("max drivers = %d\n", i2o_max_drivers);
 
 	i2o_drivers =
 	    kmalloc(i2o_max_drivers * sizeof(*i2o_drivers), GFP_KERNEL);
diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c
index 1e28e886f1ca..5581344fbba6 100644
--- a/drivers/message/i2o/exec-osm.c
+++ b/drivers/message/i2o/exec-osm.c
@@ -108,7 +108,8 @@ static void i2o_exec_wait_free(struct i2o_exec_wait *wait)
  *	buffer must not be freed. Instead the event completion will free them
  *	for you. In all other cases the buffer are your problem.
  *
- *	Returns 0 on success or negative error code on failure.
+ *	Returns 0 on success, negative error code on timeout or positive error
+ *	code from reply.
  */
 int i2o_msg_post_wait_mem(struct i2o_controller *c, u32 m, unsigned long
 			  timeout, struct i2o_dma *dma)
@@ -116,7 +117,7 @@ int i2o_msg_post_wait_mem(struct i2o_controller *c, u32 m, unsigned long
 	DECLARE_WAIT_QUEUE_HEAD(wq);
 	struct i2o_exec_wait *wait;
 	static u32 tcntxt = 0x80000000;
-	struct i2o_message __iomem *msg = c->in_queue.virt + m;
+	struct i2o_message __iomem *msg = i2o_msg_in_to_virt(c, m);
 	int rc = 0;
 
 	wait = i2o_exec_wait_alloc();
@@ -161,8 +162,7 @@ int i2o_msg_post_wait_mem(struct i2o_controller *c, u32 m, unsigned long
 	barrier();
 
 	if (wait->complete) {
-		if (readl(&wait->msg->body[0]) >> 24)
-			rc = readl(&wait->msg->body[0]) & 0xff;
+		rc = readl(&wait->msg->body[0]) >> 24;
 		i2o_flush_reply(c, wait->m);
 		i2o_exec_wait_free(wait);
 	} else {
@@ -187,6 +187,7 @@ int i2o_msg_post_wait_mem(struct i2o_controller *c, u32 m, unsigned long
  *	@c: I2O controller which answers
  *	@m: message id
  *	@msg: pointer to the I2O reply message
+ *	@context: transaction context of request
  *
  *	This function is called in interrupt context only. If the reply reached
  *	before the timeout, the i2o_exec_wait struct is filled with the message
@@ -201,14 +202,12 @@ int i2o_msg_post_wait_mem(struct i2o_controller *c, u32 m, unsigned long
  *	message must also be given back to the controller.
  */
 static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
-				      struct i2o_message __iomem *msg)
+				      struct i2o_message __iomem *msg,
+				      u32 context)
 {
 	struct i2o_exec_wait *wait, *tmp;
 	static spinlock_t lock = SPIN_LOCK_UNLOCKED;
 	int rc = 1;
-	u32 context;
-
-	context = readl(&msg->u.s.tcntxt);
 
 	/*
 	 * We need to search through the i2o_exec_wait_list to see if the given
@@ -251,7 +250,7 @@ static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
 
 	spin_unlock(&lock);
 
-	pr_debug("%s: Bogus reply in POST WAIT (tr-context: %08x)!\n", c->name,
+	osm_warn("%s: Bogus reply in POST WAIT (tr-context: %08x)!\n", c->name,
 		 context);
 
 	return -1;
@@ -321,29 +320,35 @@ static void i2o_exec_lct_modified(struct i2o_controller *c)
  *	code on failure and if the reply should be flushed.
  */
 static int i2o_exec_reply(struct i2o_controller *c, u32 m,
-			  struct i2o_message *msg)
+			  struct i2o_message __iomem *msg)
 {
-	if (le32_to_cpu(msg->u.head[0]) & MSG_FAIL) {	// Fail bit is set
-		struct i2o_message __iomem *pmsg;	/* preserved message */
+	u32 context;
+
+	if (readl(&msg->u.head[0]) & MSG_FAIL) {
+		/*
+		 * If Fail bit is set we must take the transaction context of
+		 * the preserved message to find the right request again.
+		 */
+		struct i2o_message __iomem *pmsg;
 		u32 pm;
 
-		pm = le32_to_cpu(msg->body[3]);
+		pm = readl(&msg->body[3]);
 
 		pmsg = i2o_msg_in_to_virt(c, pm);
 
 		i2o_report_status(KERN_INFO, "i2o_core", msg);
 
-		/* Release the preserved msg by resubmitting it as a NOP */
-		i2o_msg_nop(c, pm);
+		context = readl(&pmsg->u.s.tcntxt);
 
-		/* If reply to i2o_post_wait failed, return causes a timeout */
-		return -1;
-	}
+		/* Release the preserved msg */
+		i2o_msg_nop(c, pm);
+	} else
+		context = readl(&msg->u.s.tcntxt);
 
-	if (le32_to_cpu(msg->u.s.tcntxt) & 0x80000000)
-		return i2o_msg_post_wait_complete(c, m, msg);
+	if (context & 0x80000000)
+		return i2o_msg_post_wait_complete(c, m, msg, context);
 
-	if ((le32_to_cpu(msg->u.head[1]) >> 24) == I2O_CMD_LCT_NOTIFY) {
+	if ((readl(&msg->u.head[1]) >> 24) == I2O_CMD_LCT_NOTIFY) {
 		struct work_struct *work;
 
 		pr_debug("%s: LCT notify received\n", c->name);
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 4830b7759061..e69421e36ac5 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -104,7 +104,8 @@ static int i2o_block_remove(struct device *dev)
 	struct i2o_device *i2o_dev = to_i2o_device(dev);
 	struct i2o_block_device *i2o_blk_dev = dev_get_drvdata(dev);
 
-	osm_info("Device removed %s\n", i2o_blk_dev->gd->disk_name);
+	osm_info("device removed (TID: %03x): %s\n", i2o_dev->lct_data.tid,
+		 i2o_blk_dev->gd->disk_name);
 
 	i2o_event_register(i2o_dev, &i2o_block_driver, 0, 0);
 
@@ -400,71 +401,62 @@ static void i2o_block_delayed_request_fn(void *delayed_request)
 };
 
 /**
- *	i2o_block_reply - Block OSM reply handler.
- *	@c: I2O controller from which the message arrives
- *	@m: message id of reply
- *	qmsg: the actuall I2O message reply
+ *	i2o_block_end_request - Post-processing of completed commands
+ *	@req: request which should be completed
+ *	@uptodate: 1 for success, 0 for I/O error, < 0 for specific error
+ *	@nr_bytes: number of bytes to complete
  *
- *	This function gets all the message replies.
+ *	Mark the request as complete. The lock must not be held when entering.
  *
  */
-static int i2o_block_reply(struct i2o_controller *c, u32 m,
-			   struct i2o_message *msg)
+static void i2o_block_end_request(struct request *req, int uptodate,
+				  int nr_bytes)
 {
-	struct i2o_block_request *ireq;
-	struct request *req;
-	struct i2o_block_device *dev;
-	struct request_queue *q;
-	u8 st;
+	struct i2o_block_request *ireq = req->special;
+	struct i2o_block_device *dev = ireq->i2o_blk_dev;
+	request_queue_t *q = dev->gd->queue;
 	unsigned long flags;
 
-	/* FAILed message */
-	if (unlikely(le32_to_cpu(msg->u.head[0]) & (1 << 13))) {
-		struct i2o_message *pmsg;
-		u32 pm;
-
-		/*
-		 * FAILed message from controller
-		 * We increment the error count and abort it
-		 *
-		 * In theory this will never happen.  The I2O block class
-		 * specification states that block devices never return
-		 * FAILs but instead use the REQ status field...but
-		 * better be on the safe side since no one really follows
-		 * the spec to the book :)
-		 */
-		pm = le32_to_cpu(msg->body[3]);
-		pmsg = i2o_msg_in_to_virt(c, pm);
+	if (end_that_request_chunk(req, uptodate, nr_bytes)) {
+		int leftover = (req->hard_nr_sectors << 9);
 
-		req = i2o_cntxt_list_get(c, le32_to_cpu(pmsg->u.s.tcntxt));
-		if (unlikely(!req)) {
-			osm_err("NULL reply received!\n");
-			return -1;
-		}
+		if (blk_pc_request(req))
+			leftover = req->data_len;
 
-		ireq = req->special;
-		dev = ireq->i2o_blk_dev;
-		q = dev->gd->queue;
+		if (end_io_error(uptodate))
+			end_that_request_chunk(req, 0, leftover);
+	}
 
-		req->errors++;
+	add_disk_randomness(req->rq_disk);
 
-		spin_lock_irqsave(q->queue_lock, flags);
+	spin_lock_irqsave(q->queue_lock, flags);
 
-		while (end_that_request_chunk(req, !req->errors,
-					      le32_to_cpu(pmsg->body[1]))) ;
-		end_that_request_last(req);
+	end_that_request_last(req);
+	dev->open_queue_depth--;
+	list_del(&ireq->queue);
 
-		dev->open_queue_depth--;
-		list_del(&ireq->queue);
-		blk_start_queue(q);
+	blk_start_queue(q);
 
-		spin_unlock_irqrestore(q->queue_lock, flags);
+	spin_unlock_irqrestore(q->queue_lock, flags);
 
-		/* Now flush the message by making it a NOP */
-		i2o_msg_nop(c, pm);
+	i2o_block_sglist_free(ireq);
+	i2o_block_request_free(ireq);
+};
 
-		return -1;
-	}
+/**
+ *	i2o_block_reply - Block OSM reply handler.
+ *	@c: I2O controller from which the message arrives
+ *	@m: message id of reply
+ *	@msg: the actual I2O message reply
+ *
+ *	This function gets all the message replies.
+ *
+ */
+static int i2o_block_reply(struct i2o_controller *c, u32 m,
+			   struct i2o_message *msg)
+{
+	struct request *req;
+	int uptodate = 1;
 
 	req = i2o_cntxt_list_get(c, le32_to_cpu(msg->u.s.tcntxt));
 	if (unlikely(!req)) {
@@ -472,61 +464,13 @@ static int i2o_block_reply(struct i2o_controller *c, u32 m,
 		return -1;
 	}
 
-	ireq = req->special;
-	dev = ireq->i2o_blk_dev;
-	q = dev->gd->queue;
-
-	if (unlikely(!dev->i2o_dev)) {
-		/*
-		 * This is HACK, but Intel Integrated RAID allows user
-		 * to delete a volume that is claimed, locked, and in use
-		 * by the OS. We have to check for a reply from a
-		 * non-existent device and flag it as an error or the system
-		 * goes kaput...
-		 */
-		req->errors++;
-		osm_warn("Data transfer to deleted device!\n");
-		spin_lock_irqsave(q->queue_lock, flags);
-		while (end_that_request_chunk
-		       (req, !req->errors, le32_to_cpu(msg->body[1]))) ;
-		end_that_request_last(req);
-
-		dev->open_queue_depth--;
-		list_del(&ireq->queue);
-		blk_start_queue(q);
-
-		spin_unlock_irqrestore(q->queue_lock, flags);
-		return -1;
-	}
-
 	/*
 	 *      Lets see what is cooking. We stuffed the
 	 *      request in the context.
 	 */
 
-	st = le32_to_cpu(msg->body[0]) >> 24;
-
-	if (st != 0) {
-		int err;
-		char *bsa_errors[] = {
-			"Success",
-			"Media Error",
-			"Failure communicating to device",
-			"Device Failure",
-			"Device is not ready",
-			"Media not present",
-			"Media is locked by another user",
-			"Media has failed",
-			"Failure communicating to device",
-			"Device bus failure",
-			"Device is locked by another user",
-			"Device is write protected",
-			"Device has reset",
-			"Volume has changed, waiting for acknowledgement"
-		};
-
-		err = le32_to_cpu(msg->body[0]) & 0xffff;
-
+	if ((le32_to_cpu(msg->body[0]) >> 24) != 0) {
+		u32 status = le32_to_cpu(msg->body[0]);
 		/*
 		 *      Device not ready means two things. One is that the
 		 *      the thing went offline (but not a removal media)
@@ -539,40 +483,23 @@ static int i2o_block_reply(struct i2o_controller *c, u32 m,
 		 *      Don't stick a supertrak100 into cache aggressive modes
 		 */
 
-		osm_err("block-osm: /dev/%s error: %s", dev->gd->disk_name,
-			bsa_errors[le32_to_cpu(msg->body[0]) & 0xffff]);
-		if (le32_to_cpu(msg->body[0]) & 0x00ff0000)
-			printk(KERN_ERR " - DDM attempted %d retries",
-			       (le32_to_cpu(msg->body[0]) >> 16) & 0x00ff);
-		printk(KERN_ERR ".\n");
-		req->errors++;
-	} else
-		req->errors = 0;
-
-	if (!end_that_request_chunk
-	    (req, !req->errors, le32_to_cpu(msg->body[1]))) {
-		add_disk_randomness(req->rq_disk);
-		spin_lock_irqsave(q->queue_lock, flags);
+		osm_err("%03x error status: %02x, detailed status: %04x\n",
+			(le32_to_cpu(msg->u.head[1]) >> 12 & 0xfff),
+			status >> 24, status & 0xffff);
 
-		end_that_request_last(req);
+		req->errors++;
 
-		dev->open_queue_depth--;
-		list_del(&ireq->queue);
-		blk_start_queue(q);
+		uptodate = 0;
+	}
 
-		spin_unlock_irqrestore(q->queue_lock, flags);
-
-		i2o_block_sglist_free(ireq);
-		i2o_block_request_free(ireq);
-	} else
-		osm_err("still remaining chunks\n");
+	i2o_block_end_request(req, uptodate, le32_to_cpu(msg->body[1]));
 
 	return 1;
 };
 
 static void i2o_block_event(struct i2o_event *evt)
 {
-	osm_info("block-osm: event received\n");
+	osm_info("event received\n");
 	kfree(evt);
 };
 
@@ -875,9 +802,7 @@ static int i2o_block_transfer(struct request *req)
 		sg++;
 	}
 
-	writel(I2O_MESSAGE_SIZE
-	       (((unsigned long)mptr -
-		 (unsigned long)&msg->u.head[0]) >> 2) | SGL_OFFSET_8,
+	writel(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | SGL_OFFSET_8,
 	       &msg->u.head[0]);
 
 	list_add_tail(&ireq->queue, &dev->open_queue);
@@ -1048,7 +973,6 @@ static int i2o_block_probe(struct device *dev)
 	int rc;
 	u64 size;
 	u32 blocksize;
-	u16 power;
 	u32 flags, status;
 	int segments;
 
@@ -1058,8 +982,6 @@ static int i2o_block_probe(struct device *dev)
 		return -ENODEV;
 	}
 
-	osm_info("New device detected (TID: %03x)\n", i2o_dev->lct_data.tid);
-
 	if (i2o_device_claim(i2o_dev)) {
 		osm_warn("Unable to claim device. Installation aborted\n");
 		rc = -EFAULT;
@@ -1111,15 +1033,21 @@ static int i2o_block_probe(struct device *dev)
 	 *      Ask for the current media data. If that isn't supported
 	 *      then we ask for the device capacity data
 	 */
-	if (i2o_parm_field_get(i2o_dev, 0x0004, 1, &blocksize, 4) != 0
-	    || i2o_parm_field_get(i2o_dev, 0x0004, 0, &size, 8) != 0) {
-		i2o_parm_field_get(i2o_dev, 0x0000, 3, &blocksize, 4);
-		i2o_parm_field_get(i2o_dev, 0x0000, 4, &size, 8);
-	}
-	osm_debug("blocksize = %d\n", blocksize);
+	if (!i2o_parm_field_get(i2o_dev, 0x0004, 0, &size, 8))
+		if (!i2o_parm_field_get(i2o_dev, 0x0000, 4, &size, 8)) {
+			osm_warn("could not get size of %s\n", gd->disk_name);
+			size = 0;
+		}
 
-	if (i2o_parm_field_get(i2o_dev, 0x0000, 2, &power, 2))
-		power = 0;
+	if (!i2o_parm_field_get(i2o_dev, 0x0004, 1, &blocksize, 4))
+		if (!i2o_parm_field_get(i2o_dev, 0x0000, 3, &blocksize, 4)) {
+			osm_warn("unable to get blocksize of %s\n",
+				 gd->disk_name);
+			blocksize = 0;
+		}
+
+	if (!i2o_parm_field_get(i2o_dev, 0x0000, 2, &i2o_blk_dev->power, 2))
+		i2o_blk_dev->power = 0;
 	i2o_parm_field_get(i2o_dev, 0x0000, 5, &flags, 4);
 	i2o_parm_field_get(i2o_dev, 0x0000, 6, &status, 4);
 
@@ -1131,6 +1059,9 @@ static int i2o_block_probe(struct device *dev)
 
 	unit++;
 
+	osm_info("device added (TID: %03x): %s\n", i2o_dev->lct_data.tid,
+		 i2o_blk_dev->gd->disk_name);
+
 	return 0;
 
       claim_release:
diff --git a/drivers/message/i2o/i2o_block.h b/drivers/message/i2o/i2o_block.h
index ddd9a15679c0..712111ffa638 100644
--- a/drivers/message/i2o/i2o_block.h
+++ b/drivers/message/i2o/i2o_block.h
@@ -74,7 +74,7 @@ struct i2o_block_device {
 	int rcache;			/* read cache flags */
 	int wcache;			/* write cache flags */
 	int flags;
-	int power;			/* power state */
+	u16 power;			/* power state */
 	int media_change_flag;		/* media changed flag */
 };
 
diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c
index 46d373287a30..383e89a5c9f0 100644
--- a/drivers/message/i2o/i2o_config.c
+++ b/drivers/message/i2o/i2o_config.c
@@ -80,13 +80,123 @@ struct i2o_cfg_info {
 static struct i2o_cfg_info *open_files = NULL;
 static ulong i2o_cfg_info_id = 0;
 
-/*
- *	Each of these describes an i2o message handler. They are
- *	multiplexed by the i2o_core code
+/**
+ *	i2o_config_read_hrt - Returns the HRT of the controller
+ *	@kobj: kernel object handle
+ *	@buf: buffer into which the HRT should be copied
+ *	@offset: file offset
+ *	@count: number of bytes to read
+ *
+ *	Put @count bytes starting at @offset into @buf from the HRT of the I2O
+ *	controller corresponding to @kobj.
+ *
+ *	Returns number of bytes copied into buffer.
+ */
+static ssize_t i2o_config_read_hrt(struct kobject *kobj, char *buf,
+				   loff_t offset, size_t count)
+{
+	struct i2o_controller *c = to_i2o_controller(container_of(kobj,
+								  struct device,
+								  kobj));
+	i2o_hrt *hrt = c->hrt.virt;
+
+	u32 size = (hrt->num_entries * hrt->entry_len + 2) * 4;
+
+	if(offset > size)
+		return 0;
+
+	if(offset + count > size)
+		count = size - offset;
+
+	memcpy(buf, (u8 *) hrt + offset, count);
+
+	return count;
+};
+
+/**
+ *	i2o_config_read_lct - Returns the LCT of the controller
+ *	@kobj: kernel object handle
+ *	@buf: buffer into which the LCT should be copied
+ *	@offset: file offset
+ *	@count: number of bytes to read
+ *
+ *	Put @count bytes starting at @offset into @buf from the LCT of the I2O
+ *	controller corresponding to @kobj.
+ *
+ *	Returns number of bytes copied into buffer.
+ */
+static ssize_t i2o_config_read_lct(struct kobject *kobj, char *buf,
+				   loff_t offset, size_t count)
+{
+	struct i2o_controller *c = to_i2o_controller(container_of(kobj,
+								  struct device,
+								  kobj));
+	u32 size = c->lct->table_size * 4;
+
+	if(offset > size)
+		return 0;
+
+	if(offset + count > size)
+		count = size - offset;
+
+	memcpy(buf, (u8 *) c->lct + offset, count);
+
+	return count;
+};
+
+/* attribute for HRT in sysfs */
+static struct bin_attribute i2o_config_hrt_attr = {
+	.attr = {
+		.name = "hrt",
+		.mode = S_IRUGO,
+		.owner = THIS_MODULE
+	},
+	.size = 0,
+	.read = i2o_config_read_hrt
+};
+
+/* attribute for LCT in sysfs */
+static struct bin_attribute i2o_config_lct_attr = {
+	.attr = {
+		.name = "lct",
+		.mode = S_IRUGO,
+		.owner = THIS_MODULE
+	},
+	.size = 0,
+	.read = i2o_config_read_lct
+};
+
+/**
+ *	i2o_config_notify_controller_add - Notify of added controller
+ *	@c: the controller which was added
+ *
+ *	If a I2O controller is added, we catch the notification to add sysfs
+ *	entries.
+ */
+static void i2o_config_notify_controller_add(struct i2o_controller *c)
+{
+	sysfs_create_bin_file(&(c->device.kobj), &i2o_config_hrt_attr);
+	sysfs_create_bin_file(&(c->device.kobj), &i2o_config_lct_attr);
+};
+
+/**
+ *	i2o_config_notify_controller_remove - Notify of removed controller
+ *	@c: the controller which was removed
+ *
+ *	If a I2O controller is removed, we catch the notification to remove the
+ *	sysfs entries.
  */
+static void i2o_config_notify_controller_remove(struct i2o_controller *c)
+{
+	sysfs_remove_bin_file(&c->device.kobj, &i2o_config_lct_attr);
+	sysfs_remove_bin_file(&c->device.kobj, &i2o_config_hrt_attr);
+};
 
+/* Config OSM driver struct */
 static struct i2o_driver i2o_config_driver = {
-	.name = OSM_NAME
+	.name = OSM_NAME,
+	.notify_controller_add = i2o_config_notify_controller_add,
+	.notify_controller_remove = i2o_config_notify_controller_remove
 };
 
 static int i2o_cfg_getiops(unsigned long arg)
diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c
index af40f1c1ec77..812c29ec86d3 100644
--- a/drivers/message/i2o/i2o_scsi.c
+++ b/drivers/message/i2o/i2o_scsi.c
@@ -40,6 +40,7 @@
  *	Fix the resource management problems.
  */
 
+#define DEBUG 1
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
@@ -179,6 +180,8 @@ static int i2o_scsi_remove(struct device *dev)
 	struct i2o_scsi_host *i2o_shost;
 	struct scsi_device *scsi_dev;
 
+	osm_info("device removed (TID: %03x)\n", i2o_dev->lct_data.tid);
+
 	i2o_shost = i2o_scsi_get_host(c);
 
 	shost_for_each_device(scsi_dev, i2o_shost->scsi_host)
@@ -262,8 +265,8 @@ static int i2o_scsi_probe(struct device *dev)
 		return -EFAULT;
 	}
 
-	osm_debug("added new SCSI device %03x (cannel: %d, id: %d, lun: %d)\n",
-		  i2o_dev->lct_data.tid, channel, id, (unsigned int)lun);
+	osm_info("device added (TID: %03x) channel: %d, id: %d, lun: %d\n",
+		 i2o_dev->lct_data.tid, channel, id, (unsigned int)lun);
 
 	return 0;
 };
@@ -439,8 +442,6 @@ static int i2o_scsi_reply(struct i2o_controller *c, u32 m,
 
 	cmd->result = DID_OK << 16 | ds;
 
-	cmd->scsi_done(cmd);
-
 	dev = &c->pdev->dev;
 	if (cmd->use_sg)
 		dma_unmap_sg(dev, (struct scatterlist *)cmd->buffer,
@@ -449,6 +450,8 @@ static int i2o_scsi_reply(struct i2o_controller *c, u32 m,
 		dma_unmap_single(dev, (dma_addr_t) ((long)cmd->SCp.ptr),
 				 cmd->request_bufflen, cmd->sc_data_direction);
 
+	cmd->scsi_done(cmd);
+
 	return 1;
 };
 
@@ -502,7 +505,7 @@ static void i2o_scsi_notify_controller_remove(struct i2o_controller *c)
 
 	scsi_remove_host(i2o_shost->scsi_host);
 	scsi_host_put(i2o_shost->scsi_host);
-	pr_info("I2O SCSI host removed\n");
+	osm_debug("I2O SCSI host removed\n");
 };
 
 /* SCSI OSM driver struct */
@@ -545,7 +548,7 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	u32 scsi_flags, sg_flags;
 	u32 __iomem *mptr;
 	u32 __iomem *lenptr;
-	u32 len, reqlen;
+	u32 len;
 	int i;
 
 	/*
@@ -580,12 +583,12 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	if (m == I2O_QUEUE_EMPTY)
 		return SCSI_MLQUEUE_HOST_BUSY;
 
+	mptr = &msg->body[0];
+
 	/*
 	 *      Put together a scsi execscb message
 	 */
 
-	len = SCpnt->request_bufflen;
-
 	switch (SCpnt->sc_data_direction) {
 	case PCI_DMA_NONE:
 		scsi_flags = 0x00000000;	// DATA NO XFER
@@ -637,17 +640,13 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	 */
 
 	/* Direction, disconnect ok, tag, CDBLen */
-	writel(scsi_flags | 0x20200000 | SCpnt->cmd_len, &msg->body[0]);
-
-	mptr = &msg->body[1];
+	writel(scsi_flags | 0x20200000 | SCpnt->cmd_len, mptr ++);
 
 	/* Write SCSI command into the message - always 16 byte block */
 	memcpy_toio(mptr, SCpnt->cmnd, 16);
 	mptr += 4;
 	lenptr = mptr++;	/* Remember me - fill in when we know */
 
-	reqlen = 12;		// SINGLE SGE
-
 	/* Now fill in the SGList and command */
 	if (SCpnt->use_sg) {
 		struct scatterlist *sg;
@@ -671,7 +670,6 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 			sg++;
 		}
 
-		reqlen = mptr - &msg->u.head[0];
 		writel(len, lenptr);
 	} else {
 		len = SCpnt->request_bufflen;
@@ -691,12 +689,11 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 			sg_flags |= 0xC0000000;
 			writel(sg_flags | SCpnt->request_bufflen, mptr++);
 			writel(dma_addr, mptr++);
-		} else
-			reqlen = 9;
+		}
 	}
 
 	/* Stick the headers on */
-	writel(reqlen << 16 | SGL_OFFSET_10, &msg->u.head[0]);
+	writel((mptr - &msg->u.head[0]) << 16 | SGL_OFFSET_10, &msg->u.head[0]);
 
 	/* Queue the message */
 	i2o_msg_post(c, m);
diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c
index 50c8cedf7a2d..62b0d8bed186 100644
--- a/drivers/message/i2o/iop.c
+++ b/drivers/message/i2o/iop.c
@@ -68,7 +68,7 @@ extern void i2o_device_exit(void);
  */
 void i2o_msg_nop(struct i2o_controller *c, u32 m)
 {
-	struct i2o_message __iomem *msg = c->in_queue.virt + m;
+	struct i2o_message __iomem *msg = i2o_msg_in_to_virt(c, m);
 
 	writel(THREE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
 	writel(I2O_CMD_UTIL_NOP << 24 | HOST_TID << 12 | ADAPTER_TID,
@@ -452,8 +452,6 @@ static int i2o_iop_clear(struct i2o_controller *c)
 	/* Enable all IOPs */
 	i2o_iop_enable_all();
 
-	i2o_status_get(c);
-
 	return rc;
 }
 
@@ -591,12 +589,11 @@ static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
 	if (m == I2O_QUEUE_EMPTY)
 		return -ETIMEDOUT;
 
-	writel(EIGHT_WORD_MSG_SIZE | TRL_OFFSET_6, &msg->u.head[0]);
+	writel(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6, &msg->u.head[0]);
 	writel(I2O_CMD_OUTBOUND_INIT << 24 | HOST_TID << 12 | ADAPTER_TID,
 	       &msg->u.head[1]);
 	writel(i2o_exec_driver.context, &msg->u.s.icntxt);
-	writel(0x0106, &msg->u.s.tcntxt);	/* FIXME: why 0x0106, maybe in
-						   Spec? */
+	writel(0x00000000, &msg->u.s.tcntxt);
 	writel(PAGE_SIZE, &msg->body[0]);
 	writel(MSG_FRAME_SIZE << 16 | 0x80, &msg->body[1]);	/* Outbound msg frame
 								   size in words and Initcode */
@@ -891,8 +888,12 @@ void i2o_iop_remove(struct i2o_controller *c)
 	list_for_each_entry_safe(dev, tmp, &c->devices, list)
 	    i2o_device_remove(dev);
 
+	device_del(&c->device);
+
 	/* Ask the IOP to switch to RESET state */
 	i2o_iop_reset(c);
+
+	put_device(&c->device);
 }
 
 /**
@@ -971,8 +972,10 @@ static int i2o_systab_build(void)
 		systab->iops[count].frame_size = sb->inbound_frame_size;
 		systab->iops[count].last_changed = change_ind;
 		systab->iops[count].iop_capabilities = sb->iop_capabilities;
-		systab->iops[count].inbound_low = i2o_ptr_low(c->post_port);
-		systab->iops[count].inbound_high = i2o_ptr_high(c->post_port);
+		systab->iops[count].inbound_low =
+		    i2o_dma_low(c->base.phys + I2O_IN_PORT);
+		systab->iops[count].inbound_high =
+		    i2o_dma_high(c->base.phys + I2O_IN_PORT);
 
 		count++;
 	}
@@ -1109,6 +1112,30 @@ static int i2o_hrt_get(struct i2o_controller *c)
 	return -EBUSY;
 }
 
+/**
+ *	i2o_iop_free - Free the i2o_controller struct
+ *	@c: I2O controller to free (only the structure itself is kfree()d here)
+ */
+void i2o_iop_free(struct i2o_controller *c)
+{
+	kfree(c);
+};
+
+
+/**
+ *	i2o_iop_release - release the memory for an I2O controller
+ *	@dev: struct device embedded in the I2O controller to be released
+ *
+ *	Release the allocated memory. This function is called automatically
+ *	when the refcount of the device reaches 0.
+ */
+static void i2o_iop_release(struct device *dev)
+{
+	struct i2o_controller *c = to_i2o_controller(dev);
+
+	i2o_iop_free(c);
+};
+
 /**
  *	i2o_iop_alloc - Allocate and initialize a i2o_controller struct
  *
@@ -1137,6 +1164,10 @@ struct i2o_controller *i2o_iop_alloc(void)
 	c->unit = unit++;
 	sprintf(c->name, "iop%d", c->unit);
 
+	device_initialize(&c->device);
+	c->device.release = &i2o_iop_release;
+	snprintf(c->device.bus_id, BUS_ID_SIZE, "iop%d", c->unit);
+
 #if BITS_PER_LONG == 64
 	spin_lock_init(&c->context_list_lock);
 	atomic_set(&c->context_list_counter, 0);
@@ -1146,15 +1177,6 @@ struct i2o_controller *i2o_iop_alloc(void)
 	return c;
 };
 
-/**
- *	i2o_iop_free - Free the i2o_controller struct
- *	@c: I2O controller to free
- */
-void i2o_iop_free(struct i2o_controller *c)
-{
-	kfree(c);
-};
-
 /**
  *	i2o_iop_add - Initialize the I2O controller and add him to the I2O core
  *	@c: controller
@@ -1168,6 +1190,11 @@ int i2o_iop_add(struct i2o_controller *c)
 {
 	int rc;
 
+	if((rc = device_add(&c->device))) {
+		printk(KERN_ERR "%s: could not register controller\n", c->name);
+		goto iop_reset;
+	}
+
 	printk(KERN_INFO "%s: Activating I2O controller...\n", c->name);
 	printk(KERN_INFO "%s: This may take a few minutes if there are many "
 	       "devices\n", c->name);
@@ -1175,30 +1202,23 @@ int i2o_iop_add(struct i2o_controller *c)
 	if ((rc = i2o_iop_activate(c))) {
 		printk(KERN_ERR "%s: could not activate controller\n",
 		       c->name);
-		i2o_iop_reset(c);
-		return rc;
+		goto iop_reset;
 	}
 
 	pr_debug("%s: building sys table...\n", c->name);
 
-	if ((rc = i2o_systab_build())) {
-		i2o_iop_reset(c);
-		return rc;
-	}
+	if ((rc = i2o_systab_build()))
+		goto iop_reset;
 
 	pr_debug("%s: online controller...\n", c->name);
 
-	if ((rc = i2o_iop_online(c))) {
-		i2o_iop_reset(c);
-		return rc;
-	}
+	if ((rc = i2o_iop_online(c)))
+		goto iop_reset;
 
 	pr_debug("%s: getting LCT...\n", c->name);
 
-	if ((rc = i2o_exec_lct_get(c))) {
-		i2o_iop_reset(c);
-		return rc;
-	}
+	if ((rc = i2o_exec_lct_get(c)))
+		goto iop_reset;
 
 	list_add(&c->list, &i2o_controllers);
 
@@ -1207,6 +1227,11 @@ int i2o_iop_add(struct i2o_controller *c)
 	printk(KERN_INFO "%s: Controller added\n", c->name);
 
 	return 0;
+
+iop_reset:
+	i2o_iop_reset(c);
+
+	return rc;
 };
 
 /**
diff --git a/drivers/message/i2o/pci.c b/drivers/message/i2o/pci.c
index 579a8b7a2120..f33fd81f77a4 100644
--- a/drivers/message/i2o/pci.c
+++ b/drivers/message/i2o/pci.c
@@ -38,8 +38,7 @@ extern void i2o_iop_free(struct i2o_controller *);
 extern int i2o_iop_add(struct i2o_controller *);
 extern void i2o_iop_remove(struct i2o_controller *);
 
-extern int i2o_driver_dispatch(struct i2o_controller *, u32,
-			       struct i2o_message *);
+extern int i2o_driver_dispatch(struct i2o_controller *, u32);
 
 /* PCI device id table for all I2O controllers */
 static struct pci_device_id __devinitdata i2o_pci_ids[] = {
@@ -89,8 +88,7 @@ static void i2o_pci_free(struct i2o_controller *c)
 
 	i2o_dma_free(dev, &c->out_queue);
 	i2o_dma_free(dev, &c->status_block);
-	if (c->lct)
-		kfree(c->lct);
+	kfree(c->lct);
 	i2o_dma_free(dev, &c->dlct);
 	i2o_dma_free(dev, &c->hrt);
 	i2o_dma_free(dev, &c->status);
@@ -187,9 +185,9 @@ static int __devinit i2o_pci_alloc(struct i2o_controller *c)
 	} else
 		c->in_queue = c->base;
 
-	c->irq_mask = c->base.virt + 0x34;
-	c->post_port = c->base.virt + 0x40;
-	c->reply_port = c->base.virt + 0x44;
+	c->irq_mask = c->base.virt + I2O_IRQ_MASK;
+	c->in_port = c->base.virt + I2O_IN_PORT;
+	c->out_port = c->base.virt + I2O_OUT_PORT;
 
 	if (i2o_dma_alloc(dev, &c->status, 8, GFP_KERNEL)) {
 		i2o_pci_free(c);
@@ -235,49 +233,34 @@ static irqreturn_t i2o_pci_interrupt(int irq, void *dev_id, struct pt_regs *r)
 {
 	struct i2o_controller *c = dev_id;
 	struct device *dev = &c->pdev->dev;
-	struct i2o_message *m;
-	u32 mv;
+	u32 mv = readl(c->out_port);
 
 	/*
 	 * Old 960 steppings had a bug in the I2O unit that caused
 	 * the queue to appear empty when it wasn't.
 	 */
-	mv = I2O_REPLY_READ32(c);
 	if (mv == I2O_QUEUE_EMPTY) {
-		mv = I2O_REPLY_READ32(c);
-		if (unlikely(mv == I2O_QUEUE_EMPTY)) {
+		mv = readl(c->out_port);
+		if (unlikely(mv == I2O_QUEUE_EMPTY))
 			return IRQ_NONE;
-		} else
+		else
 			pr_debug("%s: 960 bug detected\n", c->name);
 	}
 
 	while (mv != I2O_QUEUE_EMPTY) {
-		/*
-		 * Map the message from the page frame map to kernel virtual.
-		 * Because bus_to_virt is deprecated, we have calculate the
-		 * location by ourself!
-		 */
-		m = i2o_msg_out_to_virt(c, mv);
-
-		/*
-		 *      Ensure this message is seen coherently but cachably by
-		 *      the processor
-		 */
-		dma_sync_single_for_cpu(dev, mv, MSG_FRAME_SIZE * 4,
-					PCI_DMA_FROMDEVICE);
-
 		/* dispatch it */
-		if (i2o_driver_dispatch(c, mv, m))
+		if (i2o_driver_dispatch(c, mv))
 			/* flush it if result != 0 */
 			i2o_flush_reply(c, mv);
 
 		/*
 		 * That 960 bug again...
 		 */
-		mv = I2O_REPLY_READ32(c);
+		mv = readl(c->out_port);
 		if (mv == I2O_QUEUE_EMPTY)
-			mv = I2O_REPLY_READ32(c);
+			mv = readl(c->out_port);
 	}
+
 	return IRQ_HANDLED;
 }
 
@@ -294,7 +277,9 @@ static int i2o_pci_irq_enable(struct i2o_controller *c)
 	struct pci_dev *pdev = c->pdev;
 	int rc;
 
-	I2O_IRQ_WRITE32(c, 0xffffffff);
+	wmb();
+	writel(0xffffffff, c->irq_mask);
+	wmb();
 
 	if (pdev->irq) {
 		rc = request_irq(pdev->irq, i2o_pci_interrupt, SA_SHIRQ,
@@ -306,7 +291,8 @@ static int i2o_pci_irq_enable(struct i2o_controller *c)
 		}
 	}
 
-	I2O_IRQ_WRITE32(c, 0x00000000);
+	writel(0x00000000, c->irq_mask);
+	wmb();
 
 	printk(KERN_INFO "%s: Installed at IRQ %d\n", c->name, pdev->irq);
 
@@ -321,7 +307,9 @@ static int i2o_pci_irq_enable(struct i2o_controller *c)
  */
 static void i2o_pci_irq_disable(struct i2o_controller *c)
 {
-	I2O_IRQ_WRITE32(c, 0xffffffff);
+	wmb();
+	writel(0xffffffff, c->irq_mask);
+	wmb();
 
 	if (c->pdev->irq > 0)
 		free_irq(c->pdev->irq, c);
@@ -379,7 +367,7 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
 		       pci_name(pdev));
 
 	c->pdev = pdev;
-	c->device = pdev->dev;
+	c->device.parent = get_device(&pdev->dev);
 
 	/* Cards that fall apart if you hit them with large I/O loads... */
 	if (pdev->vendor == PCI_VENDOR_ID_NCR && pdev->device == 0x0630) {
@@ -428,6 +416,8 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
 	if (i960)
 		pci_write_config_word(i960, 0x42, 0x03ff);
 
+	get_device(&c->device);
+
 	return 0;
 
       uninstall:
@@ -438,6 +428,7 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
 
       free_controller:
 	i2o_iop_free(c);
+	put_device(c->device.parent);
 
       disable:
 	pci_disable_device(pdev);
@@ -461,15 +452,17 @@ static void __devexit i2o_pci_remove(struct pci_dev *pdev)
 	i2o_pci_irq_disable(c);
 	i2o_pci_free(c);
 
+	pci_disable_device(pdev);
+
 	printk(KERN_INFO "%s: Controller removed.\n", c->name);
 
-	i2o_iop_free(c);
-	pci_disable_device(pdev);
+	put_device(c->device.parent);
+	put_device(&c->device);
 };
 
 /* PCI driver for I2O controller */
 static struct pci_driver i2o_pci_driver = {
-	.name = "I2O controller",
+	.name = "PCI_I2O",
 	.id_table = i2o_pci_ids,
 	.probe = i2o_pci_probe,
 	.remove = __devexit_p(i2o_pci_remove),
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 40e45a83d3fb..e8cd11290010 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -153,12 +153,10 @@ struct i2o_controller {
 	unsigned int promise:1;		/* Promise controller */
 
 	struct list_head devices;	/* list of I2O devices */
-
-	struct notifier_block *event_notifer;	/* Events */
-	atomic_t users;
 	struct list_head list;	/* Controller list */
-	void __iomem *post_port;	/* Inbout port address */
-	void __iomem *reply_port;	/* Outbound port address */
+
+	void __iomem *in_port;	/* Inbout port address */
+	void __iomem *out_port;	/* Outbound port address */
 	void __iomem *irq_mask;		/* Interrupt register address */
 
 	/* Dynamic LCT related data */
@@ -182,9 +180,6 @@ struct i2o_controller {
 	struct resource io_resource;	/* I/O resource allocated to the IOP */
 	struct resource mem_resource;	/* Mem resource allocated to the IOP */
 
-	struct proc_dir_entry *proc_entry;	/* /proc dir */
-
-	struct list_head bus_list;	/* list of busses on IOP */
 	struct device device;
 	struct i2o_device *exec;	/* Executive */
 #if BITS_PER_LONG == 64
@@ -380,49 +375,10 @@ extern int i2o_device_claim_release(struct i2o_device *);
 /* Exec OSM functions */
 extern int i2o_exec_lct_get(struct i2o_controller *);
 
-/* device to i2o_device and driver to i2o_driver convertion functions */
+/* device / driver conversion functions */
 #define to_i2o_driver(drv) container_of(drv,struct i2o_driver, driver)
 #define to_i2o_device(dev) container_of(dev, struct i2o_device, device)
-
-/*
- *	Messenger inlines
- */
-static inline u32 I2O_POST_READ32(struct i2o_controller *c)
-{
-	rmb();
-	return readl(c->post_port);
-};
-
-static inline void I2O_POST_WRITE32(struct i2o_controller *c, u32 val)
-{
-	wmb();
-	writel(val, c->post_port);
-};
-
-static inline u32 I2O_REPLY_READ32(struct i2o_controller *c)
-{
-	rmb();
-	return readl(c->reply_port);
-};
-
-static inline void I2O_REPLY_WRITE32(struct i2o_controller *c, u32 val)
-{
-	wmb();
-	writel(val, c->reply_port);
-};
-
-static inline u32 I2O_IRQ_READ32(struct i2o_controller *c)
-{
-	rmb();
-	return readl(c->irq_mask);
-};
-
-static inline void I2O_IRQ_WRITE32(struct i2o_controller *c, u32 val)
-{
-	wmb();
-	writel(val, c->irq_mask);
-	wmb();
-};
+#define to_i2o_controller(dev) container_of(dev, struct i2o_controller, device)
 
 /**
  *	i2o_msg_get - obtain an I2O message from the IOP
@@ -440,10 +396,12 @@ static inline void I2O_IRQ_WRITE32(struct i2o_controller *c, u32 val)
 static inline u32 i2o_msg_get(struct i2o_controller *c,
 			      struct i2o_message __iomem **msg)
 {
-	u32 m;
+	u32 m = readl(c->in_port);
 
-	if ((m = I2O_POST_READ32(c)) != I2O_QUEUE_EMPTY)
+	if (m != I2O_QUEUE_EMPTY) {
 		*msg = c->in_queue.virt + m;
+		rmb();
+	}
 
 	return m;
 };
@@ -457,7 +415,8 @@ static inline u32 i2o_msg_get(struct i2o_controller *c,
  */
 static inline void i2o_msg_post(struct i2o_controller *c, u32 m)
 {
-	I2O_POST_WRITE32(c, m);
+	wmb();
+	writel(m, c->in_port);
 };
 
 /**
@@ -486,12 +445,10 @@ static inline int i2o_msg_post_wait(struct i2o_controller *c, u32 m,
  *	The I2O controller must be informed that the reply message is not needed
  *	anymore. If you forget to flush the reply, the message frame can't be
  *	used by the controller anymore and is therefore lost.
- *
- *	FIXME: is there a timeout after which the controller reuse the message?
  */
 static inline void i2o_flush_reply(struct i2o_controller *c, u32 m)
 {
-	I2O_REPLY_WRITE32(c, m);
+	writel(m, c->out_port);
 };
 
 /**
@@ -505,8 +462,9 @@ static inline void i2o_flush_reply(struct i2o_controller *c, u32 m)
  *	work for sender side messages as they are ioremap objects
  *	provided by the I2O controller.
  */
-static inline struct i2o_message *i2o_msg_out_to_virt(struct i2o_controller *c,
-						      u32 m)
+static inline struct i2o_message __iomem *i2o_msg_out_to_virt(struct
+							      i2o_controller *c,
+							      u32 m)
 {
 	BUG_ON(m < c->out_queue.phys
 	       || m >= c->out_queue.phys + c->out_queue.len);
@@ -917,7 +875,7 @@ extern void i2o_debug_state(struct i2o_controller *c);
 #define I2OVER15	0x0001
 #define I2OVER20	0x0002
 
-/* Default is 1.5, FIXME: Need support for both 1.5 and 2.0 */
+/* Default is 1.5 */
 #define I2OVERSION	I2OVER15
 
 #define SGL_OFFSET_0    I2OVERSION
-- 
cgit v1.2.3-59-g8ed1b


From f10378fff658f61307496e0ae00095041725cf07 Mon Sep 17 00:00:00 2001
From: Markus Lidel <Markus.Lidel@shadowconnect.com>
Date: Thu, 23 Jun 2005 22:02:16 -0700
Subject: [PATCH] I2O: new sysfs attributes and Adaptec specific block device
 access and 64-bit DMA support

Changes:
 - Added Bus-OSM which could be used by user space programs to reset a
   channel on the controller
 - Make ioctls in Config-OSM obsolete in favor of sysfs attributes and
   move those to their own file
 - Added sysfs attribute for firmware read and write access for I2O
   controllers
 - Added special handling of firmware read and write access for Adaptec
   controllers
 - Added vendor id and product id as sysfs-attribute to Executive classes
 - Added automatic notification of LCT change handling to Exec-OSM
 - Added flushing function to Block-OSM for later barrier implementation
 - Use PRIVATE messages for Block access on Adaptec controllers, which are
   faster than BLOCK class access
 - Cleaned up support for Promise controller
 - New messages are now detected using the IRQ status register as
   suggested by the I2O spec
 - Added i2o_dma_high() and i2o_dma_low() functions
 - Added facility for SG tablesize calculation when using 32-bit and
   64-bit DMA addresses
 - Added i2o_dma_map_single() and i2o_dma_map_sg() which could build the
   SG list for 32-bit as well as 64-bit DMA addresses

Signed-off-by: Markus Lidel <Markus.Lidel@shadowconnect.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/message/i2o/Kconfig      |  18 ++
 drivers/message/i2o/Makefile     |   3 +
 drivers/message/i2o/bus-osm.c    | 164 +++++++++++
 drivers/message/i2o/config-osm.c | 579 +++++++++++++++++++++++++++++++++++++++
 drivers/message/i2o/driver.c     |  12 +-
 drivers/message/i2o/exec-osm.c   |  74 ++++-
 drivers/message/i2o/i2o_block.c  | 277 ++++++++++++-------
 drivers/message/i2o/i2o_block.h  |   4 +-
 drivers/message/i2o/i2o_config.c | 156 +----------
 drivers/message/i2o/i2o_proc.c   |   4 +-
 drivers/message/i2o/i2o_scsi.c   |  30 +-
 drivers/message/i2o/iop.c        | 263 ++++++++----------
 drivers/message/i2o/pci.c        |  67 ++---
 include/linux/i2o-dev.h          |   6 +-
 include/linux/i2o.h              | 321 ++++++++++++++++++----
 15 files changed, 1446 insertions(+), 532 deletions(-)
 create mode 100644 drivers/message/i2o/bus-osm.c
 create mode 100644 drivers/message/i2o/config-osm.c

(limited to 'include/linux')

diff --git a/drivers/message/i2o/Kconfig b/drivers/message/i2o/Kconfig
index 8d132b0d6b12..ce278e060aca 100644
--- a/drivers/message/i2o/Kconfig
+++ b/drivers/message/i2o/Kconfig
@@ -35,6 +35,24 @@ config I2O_CONFIG
 	  To compile this support as a module, choose M here: the
 	  module will be called i2o_config.
 
+config I2O_CONFIG_OLD_IOCTL
+	bool "Enable ioctls (OBSOLETE)"
+	depends on I2O_CONFIG
+	default y
+	---help---
+	  Enables old ioctls.
+
+config I2O_BUS
+	tristate "I2O Bus Adapter OSM"
+	depends on I2O
+	---help---
+	  Include support for the I2O Bus Adapter OSM. The Bus Adapter OSM
+	  provides access to the busses on the I2O controller. The main purpose
+	  is to rescan the bus to find new devices.
+
+	  To compile this support as a module, choose M here: the
+	  module will be called i2o_bus.
+
 config I2O_BLOCK
 	tristate "I2O Block OSM"
 	depends on I2O
diff --git a/drivers/message/i2o/Makefile b/drivers/message/i2o/Makefile
index aabc6cdc3fce..2c2e39aa1efa 100644
--- a/drivers/message/i2o/Makefile
+++ b/drivers/message/i2o/Makefile
@@ -6,8 +6,11 @@
 #
 
 i2o_core-y		+= iop.o driver.o device.o debug.o pci.o exec-osm.o
+i2o_bus-y		+= bus-osm.o
+i2o_config-y		+= config-osm.o
 obj-$(CONFIG_I2O)	+= i2o_core.o
 obj-$(CONFIG_I2O_CONFIG)+= i2o_config.o
+obj-$(CONFIG_I2O_BUS)	+= i2o_bus.o
 obj-$(CONFIG_I2O_BLOCK)	+= i2o_block.o
 obj-$(CONFIG_I2O_SCSI)	+= i2o_scsi.o
 obj-$(CONFIG_I2O_PROC)	+= i2o_proc.o
diff --git a/drivers/message/i2o/bus-osm.c b/drivers/message/i2o/bus-osm.c
new file mode 100644
index 000000000000..d43c35894ae9
--- /dev/null
+++ b/drivers/message/i2o/bus-osm.c
@@ -0,0 +1,164 @@
+/*
+ *	Bus Adapter OSM
+ *
+ *	Copyright (C) 2005	Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License as published by the
+ *	Free Software Foundation; either version 2 of the License, or (at your
+ *	option) any later version.
+ *
+ *	Fixes/additions:
+ *		Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *			initial version.
+ */
+
+#include <linux/module.h>
+#include <linux/i2o.h>
+
+#define OSM_NAME	"bus-osm"
+#define OSM_VERSION	"$Rev$"
+#define OSM_DESCRIPTION	"I2O Bus Adapter OSM"
+
+static struct i2o_driver i2o_bus_driver;
+
+/* I2O classes handled by the Bus OSM; the list ends with I2O_CLASS_END */
+static struct i2o_class_id i2o_bus_class_id[] = {
+	{I2O_CLASS_BUS_ADAPTER},
+	{I2O_CLASS_END}
+};
+
+/**
+ *	i2o_bus_scan - Scan the bus for new devices
+ *	@dev: I2O device of the bus, which should be scanned
+ *
+ *	Posts an I2O_CMD_BUS_SCAN message addressed to the TID of @dev and waits
+ *	for the reply. After the scan a new LCT will be fetched automatically.
+ *
+ *	Returns -ETIMEDOUT if no message frame is obtained, else the wait result.
+ */
+static int i2o_bus_scan(struct i2o_device *dev)
+{
+	struct i2o_message __iomem *msg;
+	u32 m;
+
+	m = i2o_msg_get_wait(dev->iop, &msg, I2O_TIMEOUT_MESSAGE_GET);
+	if (m == I2O_QUEUE_EMPTY)
+		return -ETIMEDOUT;	/* no message frame became available */
+
+	writel(FIVE_WORD_MSG_SIZE | SGL_OFFSET_0, &msg->u.head[0]);
+	writel(I2O_CMD_BUS_SCAN << 24 | HOST_TID << 12 | dev->lct_data.tid,
+	       &msg->u.head[1]);
+
+	return i2o_msg_post_wait(dev->iop, m, 60);	/* 60: timeout, presumably seconds - confirm */
+};
+
+/**
+ *	i2o_bus_store_scan - Scan the I2O Bus Adapter
+ *	@d: device which should be scanned
+ *	@buf: data written to the attribute (not inspected)
+ *	@count: number of bytes written; returned even if the scan fails
+ */
+static ssize_t i2o_bus_store_scan(struct device *d, const char *buf,
+				  size_t count)
+{
+	struct i2o_device *i2o_dev = to_i2o_device(d);
+	int rc;
+
+	if ((rc = i2o_bus_scan(i2o_dev)))
+		osm_warn("bus scan failed %d\n", rc);	/* failure is only logged */
+
+	return count;
+}
+
+/* Bus Adapter OSM device attributes */
+static DEVICE_ATTR(scan, S_IWUSR, NULL, i2o_bus_store_scan);
+
+/**
+ *	i2o_bus_probe - verify if dev is an I2O Bus Adapter device and install it
+ *	@dev: device to verify if it is an I2O Bus Adapter device
+ *
+ *	Takes a reference on @dev and creates its "scan" attribute file. Because
+ *	we want all Bus Adapters, nothing is checked here and 0 is always returned.
+ *	Returns 0.
+ */
+static int i2o_bus_probe(struct device *dev)
+{
+	struct i2o_device *i2o_dev = to_i2o_device(get_device(dev));
+
+	device_create_file(dev, &dev_attr_scan);	/* return value is ignored */
+
+	osm_info("device added (TID: %03x)\n", i2o_dev->lct_data.tid);
+
+	return 0;
+};
+
+/**
+ *	i2o_bus_remove - remove the I2O Bus Adapter device from the system again
+ *	@dev: I2O Bus Adapter device which should be removed
+ *	Removes the "scan" attribute file and drops one reference on @dev.
+ *	Always returns 0.
+ */
+static int i2o_bus_remove(struct device *dev)
+{
+	struct i2o_device *i2o_dev = to_i2o_device(dev);
+
+	device_remove_file(dev, &dev_attr_scan);
+
+	put_device(dev);	/* pairs with get_device() in i2o_bus_probe() */
+
+	/* NOTE(review): i2o_dev is read after put_device(); confirm the caller still holds a reference */
+	osm_info("device removed (TID: %03x)\n", i2o_dev->lct_data.tid);
+
+	return 0;
+};
+
+/* Bus Adapter OSM driver struct: probe/remove for the classes in i2o_bus_class_id */
+static struct i2o_driver i2o_bus_driver = {
+	.name = OSM_NAME,
+	.classes = i2o_bus_class_id,
+	.driver = {
+		   .probe = i2o_bus_probe,
+		   .remove = i2o_bus_remove,
+		   },
+};
+
+/**
+ *	i2o_bus_init - Bus Adapter OSM initialization function
+ *
+ *	Prints the version banner and registers the Bus Adapter OSM in the I2O core.
+ *
+ *	Returns 0 on success or the i2o_driver_register() error code on failure.
+ */
+static int __init i2o_bus_init(void)
+{
+	int rc;
+
+	printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
+
+	/* Register Bus Adapter OSM into I2O core */
+	rc = i2o_driver_register(&i2o_bus_driver);
+	if (rc) {
+		osm_err("Could not register Bus Adapter OSM\n");
+		return rc;
+	}
+
+	return 0;
+};
+
+/**
+ *	i2o_bus_exit - Bus Adapter OSM exit function
+ *
+ *	Unregisters Bus Adapter OSM from I2O core; registered via module_exit().
+ */
+static void __exit i2o_bus_exit(void)
+{
+	i2o_driver_unregister(&i2o_bus_driver);
+};
+
+MODULE_AUTHOR("Markus Lidel <Markus.Lidel@shadowconnect.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION(OSM_DESCRIPTION);
+MODULE_VERSION(OSM_VERSION);
+
+module_init(i2o_bus_init);
+module_exit(i2o_bus_exit);
diff --git a/drivers/message/i2o/config-osm.c b/drivers/message/i2o/config-osm.c
new file mode 100644
index 000000000000..d0267609a949
--- /dev/null
+++ b/drivers/message/i2o/config-osm.c
@@ -0,0 +1,579 @@
+/*
+ *	Configuration OSM
+ *
+ *	Copyright (C) 2005	Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License as published by the
+ *	Free Software Foundation; either version 2 of the License, or (at your
+ *	option) any later version.
+ *
+ *	Fixes/additions:
+ *		Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *			initial version.
+ */
+
+#include <linux/module.h>
+#include <linux/i2o.h>
+#include <linux/namei.h>
+
+#include <asm/uaccess.h>
+
+#define OSM_NAME	"config-osm"
+#define OSM_VERSION	"1.248"
+#define OSM_DESCRIPTION	"I2O Configuration OSM"
+
+/* access mode user rw */
+#define S_IWRSR (S_IRUSR | S_IWUSR)
+
+static struct i2o_driver i2o_config_driver;
+
+/* A sysfs binary attribute paired with the file operations to install on it */
+struct fops_attribute {
+	struct bin_attribute bin;	/* passed to sysfs_create_bin_file() */
+	struct file_operations fops;	/* set as the inode's i_fop after creation */
+};
+
+/**
+ *	sysfs_read_dummy - placeholder bin_attribute read handler, always returns 0
+ */
+static ssize_t sysfs_read_dummy(struct kobject *kobj, char *buf, loff_t offset,
+				size_t count)
+{
+	return 0;
+};
+
+/**
+ *	sysfs_write_dummy - placeholder bin_attribute write handler, always returns 0
+ */
+static ssize_t sysfs_write_dummy(struct kobject *kobj, char *buf, loff_t offset,
+				 size_t count)
+{
+	return 0;
+};
+
+/**
+ *	sysfs_create_fops_file - Creates attribute with special file operations
+ *	@kobj: kobject which should contain the attribute
+ *	@attr: attribute which should be used to create the file
+ *
+ *	First creates attribute @attr in kobject @kobj. If it is the first time
+ *	this function is called, merge old fops from sysfs with new one and
+ *	write it back. Afterwards the new fops will be set for the created
+ *	attribute.
+ *
+ *	Returns 0 on success, sysfs_create_bin_file()'s error or -ENOENT on failure.
+ */
+static int sysfs_create_fops_file(struct kobject *kobj,
+				  struct fops_attribute *attr)
+{
+	struct file_operations tmp, *fops;
+	struct dentry *d;
+	struct qstr qstr;
+	int rc;
+
+	fops = &attr->fops;
+
+	/* placeholders only; the real handlers are installed through i_fop below */
+	if (fops->read)
+		attr->bin.read = sysfs_read_dummy;
+	if (fops->write)
+		attr->bin.write = sysfs_write_dummy;
+
+	if ((rc = sysfs_create_bin_file(kobj, &attr->bin)))
+		return rc;
+
+	qstr.name = attr->bin.attr.name;
+	qstr.len = strlen(qstr.name);
+	qstr.hash = full_name_hash(qstr.name, qstr.len);
+
+	if ((d = lookup_hash(&qstr, kobj->dentry))) {
+		if (!fops->owner) {
+			memcpy(&tmp, d->d_inode->i_fop, sizeof(tmp));
+			if (fops->read)
+				tmp.read = fops->read;
+			if (fops->write)
+				tmp.write = fops->write;
+			memcpy(fops, &tmp, sizeof(tmp));
+		}
+		d->d_inode->i_fop = fops;
+		return 0;	/* fops installed: report success, not -ENOENT */
+	}
+	/* no dentry found for the new file: undo the creation and report it */
+	sysfs_remove_bin_file(kobj, &attr->bin);
+	return -ENOENT;
+};
+
+/**
+ *	sysfs_remove_fops_file - Remove attribute with special file operations
+ *	@kobj: kobject which contains the attribute
+ *	@attr: attribute which was used to create the file
+ *
+ *	Only a wrapper around sysfs_remove_bin_file() for @attr->bin.
+ *
+ *	Returns the result of sysfs_remove_bin_file().
+ */
+static inline int sysfs_remove_fops_file(struct kobject *kobj,
+					 struct fops_attribute *attr)
+{
+	return sysfs_remove_bin_file(kobj, &attr->bin);
+};
+
+/**
+ *	i2o_config_read_hrt - Returns the HRT of the controller
+ *	@kobj: kernel object handle
+ *	@buf: buffer into which the HRT should be copied
+ *	@offset: file offset
+ *	@count: number of bytes to read
+ *
+ *	Put up to @count bytes starting at @offset into @buf from the HRT of the
+ *	I2O controller corresponding to @kobj; the read is clipped to the HRT size.
+ *
+ *	Returns number of bytes copied into buffer (0 if @offset is past the end).
+ */
+static ssize_t i2o_config_read_hrt(struct kobject *kobj, char *buf,
+				   loff_t offset, size_t count)
+{
+	struct i2o_controller *c = kobj_to_i2o_device(kobj)->iop;
+	i2o_hrt *hrt = c->hrt.virt;
+
+	u32 size = (hrt->num_entries * hrt->entry_len + 2) * 4;	/* bytes; "+ 2" presumably the header words - confirm */
+
+	if (offset > size)
+		return 0;
+
+	if (offset + count > size)
+		count = size - offset;	/* clip the read at the end of the table */
+
+	memcpy(buf, (u8 *) hrt + offset, count);
+
+	return count;
+};
+
+/**
+ *	i2o_config_read_lct - Returns the LCT of the controller
+ *	@kobj: kernel object handle
+ *	@buf: buffer into which the LCT should be copied
+ *	@offset: file offset
+ *	@count: number of bytes to read
+ *
+ *	Put up to @count bytes starting at @offset into @buf from the LCT of the
+ *	I2O controller corresponding to @kobj; the read is clipped to the LCT size.
+ *
+ *	Returns number of bytes copied into buffer (0 if @offset is past the end).
+ */
+static ssize_t i2o_config_read_lct(struct kobject *kobj, char *buf,
+				   loff_t offset, size_t count)
+{
+	struct i2o_controller *c = kobj_to_i2o_device(kobj)->iop;
+	u32 size = c->lct->table_size * 4;	/* bytes; table_size presumably counts 32-bit words - confirm */
+
+	if (offset > size)
+		return 0;
+
+	if (offset + count > size)
+		count = size - offset;	/* clip the read at the end of the table */
+
+	memcpy(buf, (u8 *) c->lct + offset, count);
+
+	return count;
+};
+
+#define I2O_CONFIG_SW_ATTR(_name,_mode,_type,_swid) \
+static ssize_t i2o_config_##_name##_read(struct file *file, char __user *buf, size_t count, loff_t * offset) { \
+	return i2o_config_sw_read(file, buf, count, offset, _type, _swid); \
+};\
+\
+static ssize_t i2o_config_##_name##_write(struct file *file, const char __user *buf, size_t count, loff_t * offset) { \
+	return i2o_config_sw_write(file, buf, count, offset, _type, _swid); \
+}; \
+\
+static struct fops_attribute i2o_config_attr_##_name = { \
+	.bin = { .attr = { .name = __stringify(_name), .mode = _mode, \
+			   .owner = THIS_MODULE }, \
+		 .size = 0, }, \
+	.fops = { .write = i2o_config_##_name##_write, \
+		  .read = i2o_config_##_name##_read} \
+};
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+
+/**
+ *	i2o_config_dpt_region - Converts type and id to flash region
+ *	@swtype: type of software module reading
+ *	@swid: id of software which should be read
+ *
+ *	Converts type and id from I2O spec to the matching region for DPT /
+ *	Adaptec controllers.
+ *
+ *	Returns the region (0-3) matching type and id, or -1 (as u32) on error.
+ */
+static u32 i2o_config_dpt_region(u8 swtype, u8 swid)
+{
+	switch (swtype) {
+	case I2O_SOFTWARE_MODULE_IRTOS:
+		/*
+		 * content: operation firmware
+		 * region size:
+		 *      0xbc000 for 2554, 3754, 2564, 3757
+		 *      0x170000 for 2865
+		 *      0x17c000 for 3966
+		 */
+		if (!swid)
+			return 0;
+
+		break;
+
+	case I2O_SOFTWARE_MODULE_IOP_PRIVATE:
+		/*
+		 * content: BIOS and SMOR
+		 * BIOS size: first 0x8000 bytes
+		 * region size:
+		 *      0x40000 for 2554, 3754, 2564, 3757
+		 *      0x80000 for 2865, 3966
+		 */
+		if (!swid)
+			return 1;
+
+		break;
+
+	case I2O_SOFTWARE_MODULE_IOP_CONFIG:
+		switch (swid) {
+		case 0:
+			/*
+			 * content: NVRAM defaults
+			 * region size: 0x2000 bytes
+			 */
+			return 2;
+		case 1:
+			/*
+			 * content: serial number
+			 * region size: 0x2000 bytes
+			 */
+			return 3;
+		}
+		break;
+	}
+
+	return -1;	/* no matching region; converted to the u32 return type */
+};
+
+#endif
+
+/**
+ *	i2o_config_sw_read - Read a software module from controller
+ *	@file: file pointer
+ *	@buf: buffer into which the data should be copied
+ *	@count: number of bytes to read
+ *	@offset: file offset, advanced by @count after a successful read
+ *	@swtype: type of software module reading
+ *	@swid: id of software which should be read
+ *
+ *	Transfers @count bytes at offset @offset from IOP into buffer using
+ *	type @swtype and id @swid as described in I2O spec.
+ *
+ *	Returns number of bytes copied into buffer or negative errno on failure.
+ */
+static ssize_t i2o_config_sw_read(struct file *file, char __user * buf,
+				  size_t count, loff_t * offset, u8 swtype,
+				  u32 swid)
+{
+	struct sysfs_dirent *sd = file->f_dentry->d_parent->d_fsdata;
+	struct kobject *kobj = sd->s_element;
+	struct i2o_controller *c = kobj_to_i2o_device(kobj)->iop;
+	u32 m, function = I2O_CMD_SW_UPLOAD;
+	struct i2o_dma buffer;
+	struct i2o_message __iomem *msg;
+	u32 __iomem *mptr;
+	int rc, status;
+
+	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
+	if (m == I2O_QUEUE_EMPTY)
+		return -EBUSY;
+
+	mptr = &msg->body[3];
+
+	if ((rc = i2o_dma_alloc(&c->pdev->dev, &buffer, count, GFP_KERNEL))) {
+		i2o_msg_nop(c, m);
+		return rc;
+	}
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+	if (c->adaptec) {
+		mptr = &msg->body[4];
+		function = I2O_CMD_PRIVATE;
+
+		writel(TEN_WORD_MSG_SIZE | SGL_OFFSET_8, &msg->u.head[0]);
+
+		writel(I2O_VENDOR_DPT << 16 | I2O_DPT_FLASH_READ,
+		       &msg->body[0]);
+		writel(i2o_config_dpt_region(swtype, swid), &msg->body[1]);
+		writel(*offset, &msg->body[2]);
+		writel(count, &msg->body[3]);
+	} else
+#endif
+		writel(NINE_WORD_MSG_SIZE | SGL_OFFSET_7, &msg->u.head[0]);
+
+	writel(0xD0000000 | count, mptr++);
+	writel(buffer.phys, mptr);
+
+	writel(function << 24 | HOST_TID << 12 | ADAPTER_TID, &msg->u.head[1]);
+	writel(i2o_config_driver.context, &msg->u.head[2]);
+	writel(0, &msg->u.head[3]);
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+	if (!c->adaptec)
+#endif
+	{
+		writel((u32) swtype << 16 | (u32) 1 << 8, &msg->body[0]);
+		writel(0, &msg->body[1]);
+		writel(swid, &msg->body[2]);
+	}
+
+	status = i2o_msg_post_wait_mem(c, m, 60, &buffer);
+	if (status != I2O_POST_WAIT_OK)
+		rc = -EIO;
+	else if (copy_to_user(buf, buffer.virt, count))
+		rc = -EFAULT;	/* copy_to_user() returns bytes NOT copied */
+	else {
+		rc = count;
+		*offset += count;
+	}
+
+	if (status != -ETIMEDOUT)
+		i2o_dma_free(&c->pdev->dev, &buffer);
+
+	return rc;
+};
+
+/**
+ *	i2o_config_sw_write - Write a software module to controller
+ *	@file: file pointer
+ *	@buf: buffer from which the data should be copied
+ *	@count: number of bytes to write
+ *	@offset: file offset
+ *	@swtype: type of software module writing
+ *	@swid: id of software which should be written
+ *
+ *	Transfers @count bytes at offset @offset from buffer to IOP using
+ *	type @swtype and id @swid as described in I2O spec.
+ *
+ *	Returns number of bytes copied from buffer or negative errno on failure.
+ */
+static ssize_t i2o_config_sw_write(struct file *file, const char __user * buf,
+				   size_t count, loff_t * offset, u8 swtype,
+				   u32 swid)
+{
+	struct sysfs_dirent *sd = file->f_dentry->d_parent->d_fsdata;
+	struct kobject *kobj = sd->s_element;
+	struct i2o_controller *c = kobj_to_i2o_device(kobj)->iop;
+	u32 m, function = I2O_CMD_SW_DOWNLOAD;
+	struct i2o_dma buffer;
+	struct i2o_message __iomem *msg;
+	u32 __iomem *mptr;
+	int rc, status;
+
+	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
+	if (m == I2O_QUEUE_EMPTY)
+		return -EBUSY;
+
+	mptr = &msg->body[3];
+
+	if ((rc = i2o_dma_alloc(&c->pdev->dev, &buffer, count, GFP_KERNEL)))
+		goto nop_msg;
+
+	rc = copy_from_user(buffer.virt, buf, count) ? -EFAULT : 0;
+	if (rc)
+		goto free_buffer;
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+	if (c->adaptec) {
+		mptr = &msg->body[4];
+		function = I2O_CMD_PRIVATE;
+
+		writel(TEN_WORD_MSG_SIZE | SGL_OFFSET_8, &msg->u.head[0]);
+
+		writel(I2O_VENDOR_DPT << 16 | I2O_DPT_FLASH_WRITE,
+		       &msg->body[0]);
+		writel(i2o_config_dpt_region(swtype, swid), &msg->body[1]);
+		writel(*offset, &msg->body[2]);
+		writel(count, &msg->body[3]);
+	} else
+#endif
+		writel(NINE_WORD_MSG_SIZE | SGL_OFFSET_7, &msg->u.head[0]);
+
+	writel(0xD4000000 | count, mptr++);
+	writel(buffer.phys, mptr);
+
+	writel(function << 24 | HOST_TID << 12 | ADAPTER_TID, &msg->u.head[1]);
+	writel(i2o_config_driver.context, &msg->u.head[2]);
+	writel(0, &msg->u.head[3]);
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+	if (!c->adaptec)
+#endif
+	{
+		writel((u32) swtype << 16 | (u32) 1 << 8, &msg->body[0]);
+		writel(0, &msg->body[1]);
+		writel(swid, &msg->body[2]);
+	}
+
+	status = i2o_msg_post_wait_mem(c, m, 60, &buffer);
+
+	if (status != -ETIMEDOUT)
+		i2o_dma_free(&c->pdev->dev, &buffer);
+
+	if (status != I2O_POST_WAIT_OK)
+		return -EIO;
+
+	*offset += count;
+
+	return count;
+
+      free_buffer:
+	i2o_dma_free(&c->pdev->dev, &buffer);
+
+      nop_msg:
+	i2o_msg_nop(c, m);
+
+	return rc;
+};
+
+/* read-only (S_IRUGO) binary sysfs attribute "hrt" for the controller's HRT */
+static struct bin_attribute i2o_config_hrt_attr = {
+	.attr = {
+		 .name = "hrt",
+		 .mode = S_IRUGO,
+		 .owner = THIS_MODULE},
+	.size = 0,
+	.read = i2o_config_read_hrt
+};
+
+/* read-only (S_IRUGO) binary sysfs attribute "lct" for the controller's LCT */
+static struct bin_attribute i2o_config_lct_attr = {
+	.attr = {
+		 .name = "lct",
+		 .mode = S_IRUGO,
+		 .owner = THIS_MODULE},
+	.size = 0,
+	.read = i2o_config_read_lct
+};
+
+/* IRTOS firmware access */
+I2O_CONFIG_SW_ATTR(irtos, S_IWRSR, I2O_SOFTWARE_MODULE_IRTOS, 0);
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+
+/*
+ * attribute for BIOS / SMOR, nvram and serial number access on DPT / Adaptec
+ * controllers
+ */
+I2O_CONFIG_SW_ATTR(bios, S_IWRSR, I2O_SOFTWARE_MODULE_IOP_PRIVATE, 0);
+I2O_CONFIG_SW_ATTR(nvram, S_IWRSR, I2O_SOFTWARE_MODULE_IOP_CONFIG, 0);
+I2O_CONFIG_SW_ATTR(serial, S_IWRSR, I2O_SOFTWARE_MODULE_IOP_CONFIG, 1);
+
+#endif
+
+/**
+ *	i2o_config_notify_controller_add - Notify of added controller
+ *	@c: the controller which was added
+ *
+ *	Creates the hrt, lct and irtos sysfs entries on the controller's exec
+ *	device (plus bios, nvram and serial if c->adaptec); errors are ignored.
+ */
+static void i2o_config_notify_controller_add(struct i2o_controller *c)
+{
+	struct kobject *kobj = &c->exec->device.kobj;
+
+	sysfs_create_bin_file(kobj, &i2o_config_hrt_attr);
+	sysfs_create_bin_file(kobj, &i2o_config_lct_attr);
+
+	sysfs_create_fops_file(kobj, &i2o_config_attr_irtos);
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+	if (c->adaptec) {
+		sysfs_create_fops_file(kobj, &i2o_config_attr_bios);
+		sysfs_create_fops_file(kobj, &i2o_config_attr_nvram);
+		sysfs_create_fops_file(kobj, &i2o_config_attr_serial);
+	}
+#endif
+};
+
+/**
+ *	i2o_config_notify_controller_remove - Notify of removed controller
+ *	@c: the controller which was removed
+ *
+ *	Removes the sysfs entries from the controller's exec device again, in
+ *	the reverse order of i2o_config_notify_controller_add().
+ */
+static void i2o_config_notify_controller_remove(struct i2o_controller *c)
+{
+	struct kobject *kobj = &c->exec->device.kobj;
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+	if (c->adaptec) {
+		sysfs_remove_fops_file(kobj, &i2o_config_attr_serial);
+		sysfs_remove_fops_file(kobj, &i2o_config_attr_nvram);
+		sysfs_remove_fops_file(kobj, &i2o_config_attr_bios);
+	}
+#endif
+	sysfs_remove_fops_file(kobj, &i2o_config_attr_irtos);
+
+	sysfs_remove_bin_file(kobj, &i2o_config_lct_attr);
+	sysfs_remove_bin_file(kobj, &i2o_config_hrt_attr);
+};
+
+/* Config OSM driver struct: only the controller add/remove hooks are set */
+static struct i2o_driver i2o_config_driver = {
+	.name = OSM_NAME,
+	.notify_controller_add = i2o_config_notify_controller_add,
+	.notify_controller_remove = i2o_config_notify_controller_remove
+};
+
+#ifdef CONFIG_I2O_CONFIG_OLD_IOCTL
+#include "i2o_config.c"
+#endif
+
+/**
+ *	i2o_config_init - Configuration OSM initialization function
+ *
+ *	Registers the Configuration OSM in the I2O core and, if the old ioctl
+ *	interface is compiled in, initializes it too; if that fails the OSM is
+ *	unregistered again. Returns 0 on success or -EBUSY on failure.
+ */
+static int __init i2o_config_init(void)
+{
+	printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
+
+	if (i2o_driver_register(&i2o_config_driver)) {
+		osm_err("handler register failed.\n");
+		return -EBUSY;
+	}
+#ifdef CONFIG_I2O_CONFIG_OLD_IOCTL
+	if (i2o_config_old_init()) {
+		i2o_driver_unregister(&i2o_config_driver);
+		return -EBUSY;
+	}
+#endif
+	return 0;
+}
+
+/**
+ *	i2o_config_exit - Configuration OSM exit function
+ *
+ *	If the old ioctl's are compiled in, removes them first, then unregisters
+ *	the Configuration OSM from the I2O core.
+ */
+static void i2o_config_exit(void)
+{
+#ifdef CONFIG_I2O_CONFIG_OLD_IOCTL
+	i2o_config_old_exit();
+#endif
+
+	i2o_driver_unregister(&i2o_config_driver);
+}
+
+MODULE_AUTHOR("Markus Lidel <Markus.Lidel@shadowconnect.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION(OSM_DESCRIPTION);
+MODULE_VERSION(OSM_VERSION);
+
+module_init(i2o_config_init);
+module_exit(i2o_config_exit);
diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c
index bebdd509b5d8..393be8e2914c 100644
--- a/drivers/message/i2o/driver.c
+++ b/drivers/message/i2o/driver.c
@@ -180,7 +180,13 @@ int i2o_driver_dispatch(struct i2o_controller *c, u32 m)
 {
 	struct i2o_driver *drv;
 	struct i2o_message __iomem *msg = i2o_msg_out_to_virt(c, m);
-	u32 context = readl(&msg->u.s.icntxt);
+	u32 context;
+	unsigned long flags;
+
+	if(unlikely(!msg))
+		return -EIO;
+
+	context = readl(&msg->u.s.icntxt);
 
 	if (unlikely(context >= i2o_max_drivers)) {
 		osm_warn("%s: Spurious reply to unknown driver %d\n", c->name,
@@ -188,9 +194,9 @@ int i2o_driver_dispatch(struct i2o_controller *c, u32 m)
 		return -EIO;
 	}
 
-	spin_lock(&i2o_drivers_lock);
+	spin_lock_irqsave(&i2o_drivers_lock, flags);
 	drv = i2o_drivers[context];
-	spin_unlock(&i2o_drivers_lock);
+	spin_unlock_irqrestore(&i2o_drivers_lock, flags);
 
 	if (unlikely(!drv)) {
 		osm_warn("%s: Spurious reply to unknown driver %d\n", c->name,
diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c
index 5581344fbba6..0160221c802a 100644
--- a/drivers/message/i2o/exec-osm.c
+++ b/drivers/message/i2o/exec-osm.c
@@ -206,6 +206,7 @@ static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
 				      u32 context)
 {
 	struct i2o_exec_wait *wait, *tmp;
+	unsigned long flags;
 	static spinlock_t lock = SPIN_LOCK_UNLOCKED;
 	int rc = 1;
 
@@ -216,11 +217,13 @@ static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
 	 * already expired. Not much we can do about that except log it for
 	 * debug purposes, increase timeout, and recompile.
 	 */
-	spin_lock(&lock);
+	spin_lock_irqsave(&lock, flags);
 	list_for_each_entry_safe(wait, tmp, &i2o_exec_wait_list, list) {
 		if (wait->tcntxt == context) {
 			list_del(&wait->list);
 
+			spin_unlock_irqrestore(&lock, flags);
+
 			wait->m = m;
 			wait->msg = msg;
 			wait->complete = 1;
@@ -242,13 +245,11 @@ static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
 				rc = -1;
 			}
 
-			spin_unlock(&lock);
-
 			return rc;
 		}
 	}
 
-	spin_unlock(&lock);
+	spin_unlock_irqrestore(&lock, flags);
 
 	osm_warn("%s: Bogus reply in POST WAIT (tr-context: %08x)!\n", c->name,
 		 context);
@@ -256,6 +257,50 @@ static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
 	return -1;
 };
 
+/**
+ *	i2o_exec_show_vendor_id - sysfs show handler for the Vendor ID
+ *	@d: device whose Vendor ID is requested
+ *	@buf: output buffer receiving the ID formatted as "0x%04x"
+ *
+ *	Returns strlen + 1 if i2o_parm_field_get() is non-zero, 0 otherwise.
+ */
+static ssize_t i2o_exec_show_vendor_id(struct device *d, char *buf)
+{
+	u16 vendor;
+	struct i2o_device *i2o_dev = to_i2o_device(d);
+
+	if (!i2o_parm_field_get(i2o_dev, 0x0000, 0, &vendor, 2))
+		return 0;
+
+	sprintf(buf, "0x%04x", vendor);
+
+	return strlen(buf) + 1;
+};
+
+/**
+ *	i2o_exec_show_product_id - sysfs show handler for the Product ID
+ *	@d: device whose Product ID is requested
+ *	@buf: output buffer receiving the ID formatted as "0x%04x"
+ *
+ *	Returns strlen + 1 if i2o_parm_field_get() is non-zero, 0 otherwise.
+ */
+static ssize_t i2o_exec_show_product_id(struct device *d, char *buf)
+{
+	u16 product;
+	struct i2o_device *i2o_dev = to_i2o_device(d);
+
+	if (!i2o_parm_field_get(i2o_dev, 0x0000, 1, &product, 2))
+		return 0;
+
+	sprintf(buf, "0x%04x", product);
+
+	return strlen(buf) + 1;
+};
+
+/* Exec-OSM device attributes */
+static DEVICE_ATTR(vendor_id, S_IRUGO, i2o_exec_show_vendor_id, NULL);
+static DEVICE_ATTR(product_id, S_IRUGO, i2o_exec_show_product_id, NULL);
+
 /**
  *	i2o_exec_probe - Called if a new I2O device (executive class) appears
  *	@dev: I2O device which should be probed
@@ -268,10 +313,16 @@ static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
 static int i2o_exec_probe(struct device *dev)
 {
 	struct i2o_device *i2o_dev = to_i2o_device(dev);
+	struct i2o_controller *c = i2o_dev->iop;
 
 	i2o_event_register(i2o_dev, &i2o_exec_driver, 0, 0xffffffff);
 
-	i2o_dev->iop->exec = i2o_dev;
+	c->exec = i2o_dev;
+
+	i2o_exec_lct_notify(c, c->lct->change_ind + 1);
+
+	device_create_file(dev, &dev_attr_vendor_id);
+	device_create_file(dev, &dev_attr_product_id);
 
 	return 0;
 };
@@ -286,6 +337,9 @@ static int i2o_exec_probe(struct device *dev)
  */
 static int i2o_exec_remove(struct device *dev)
 {
+	device_remove_file(dev, &dev_attr_product_id);
+	device_remove_file(dev, &dev_attr_vendor_id);
+
 	i2o_event_register(to_i2o_device(dev), &i2o_exec_driver, 0, 0);
 
 	return 0;
@@ -297,12 +351,16 @@ static int i2o_exec_remove(struct device *dev)
  *
  *	This function handles asynchronus LCT NOTIFY replies. It parses the
  *	new LCT and if the buffer for the LCT was to small sends a LCT NOTIFY
- *	again.
+ *	again, otherwise send LCT NOTIFY to get informed on next LCT change.
  */
 static void i2o_exec_lct_modified(struct i2o_controller *c)
 {
-	if (i2o_device_parse_lct(c) == -EAGAIN)
-		i2o_exec_lct_notify(c, 0);
+	u32 change_ind = 0;
+
+	if (i2o_device_parse_lct(c) != -EAGAIN)
+		change_ind = c->lct->change_ind + 1;
+
+	i2o_exec_lct_notify(c, change_ind);
 };
 
 /**
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index e69421e36ac5..1dd2b9dad50e 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -146,6 +146,29 @@ static int i2o_block_device_flush(struct i2o_device *dev)
 	return i2o_msg_post_wait(dev->iop, m, 60);
 };
 
+/**
+ *	i2o_block_issue_flush - flush callback handed to the block layer
+ *	@queue: request queue whose queuedata holds the I2O block device
+ *	@disk: gendisk (not used here)
+ *	@error_sector: error offset (not used here)
+ *
+ *	Forwards the flush to i2o_block_device_flush() for the queue's device.
+ *
+ *	Returns that call's result, or -ENODEV if the queue has no device.
+ */
+
+static int i2o_block_issue_flush(request_queue_t * queue, struct gendisk *disk,
+				 sector_t * error_sector)
+{
+	struct i2o_block_device *blk_dev = queue->queuedata;
+
+	/* no I2O block device attached to this queue */
+	if (unlikely(!blk_dev))
+		return -ENODEV;
+
+	return i2o_block_device_flush(blk_dev->i2o_dev);
+}
+
 /**
  *	i2o_block_device_mount - Mount (load) the media of device dev
  *	@dev: I2O device which should receive the mount request
@@ -299,28 +322,31 @@ static inline void i2o_block_request_free(struct i2o_block_request *ireq)
 
 /**
  *	i2o_block_sglist_alloc - Allocate the SG list and map it
+ *	@c: I2O controller to which the request belongs
  *	@ireq: I2O block request
  *
- *	Builds the SG list and map it into to be accessable by the controller.
+ *	Builds the SG list and maps it to be accessible by the controller.
  *
- *	Returns the number of elements in the SG list or 0 on failure.
+ *	Returns 0 on failure or 1 on success.
  */
-static inline int i2o_block_sglist_alloc(struct i2o_block_request *ireq)
+static inline int i2o_block_sglist_alloc(struct i2o_controller *c,
+					 struct i2o_block_request *ireq,
+					 u32 __iomem ** mptr)
 {
-	struct device *dev = &ireq->i2o_blk_dev->i2o_dev->iop->pdev->dev;
 	int nents;
+	enum dma_data_direction direction;
 
+	ireq->dev = &c->pdev->dev;
 	nents = blk_rq_map_sg(ireq->req->q, ireq->req, ireq->sg_table);
 
 	if (rq_data_dir(ireq->req) == READ)
-		ireq->sg_dma_direction = PCI_DMA_FROMDEVICE;
+		direction = PCI_DMA_FROMDEVICE;
 	else
-		ireq->sg_dma_direction = PCI_DMA_TODEVICE;
+		direction = PCI_DMA_TODEVICE;
 
-	ireq->sg_nents = dma_map_sg(dev, ireq->sg_table, nents,
-				    ireq->sg_dma_direction);
+	ireq->sg_nents = nents;
 
-	return ireq->sg_nents;
+	return i2o_dma_map_sg(c, ireq->sg_table, nents, direction, mptr);
 };
 
 /**
@@ -331,10 +357,14 @@ static inline int i2o_block_sglist_alloc(struct i2o_block_request *ireq)
  */
 static inline void i2o_block_sglist_free(struct i2o_block_request *ireq)
 {
-	struct device *dev = &ireq->i2o_blk_dev->i2o_dev->iop->pdev->dev;
+	enum dma_data_direction direction;
 
-	dma_unmap_sg(dev, ireq->sg_table, ireq->sg_nents,
-		     ireq->sg_dma_direction);
+	if (rq_data_dir(ireq->req) == READ)
+		direction = PCI_DMA_FROMDEVICE;
+	else
+		direction = PCI_DMA_TODEVICE;
+
+	dma_unmap_sg(ireq->dev, ireq->sg_table, ireq->sg_nents, direction);
 };
 
 /**
@@ -352,6 +382,11 @@ static int i2o_block_prep_req_fn(struct request_queue *q, struct request *req)
 	struct i2o_block_device *i2o_blk_dev = q->queuedata;
 	struct i2o_block_request *ireq;
 
+	if (unlikely(!i2o_blk_dev)) {
+		osm_err("block device already removed\n");
+		return BLKPREP_KILL;
+	}
+
 	/* request is already processed by us, so return */
 	if (req->flags & REQ_SPECIAL) {
 		osm_debug("REQ_SPECIAL already set!\n");
@@ -414,11 +449,11 @@ static void i2o_block_end_request(struct request *req, int uptodate,
 {
 	struct i2o_block_request *ireq = req->special;
 	struct i2o_block_device *dev = ireq->i2o_blk_dev;
-	request_queue_t *q = dev->gd->queue;
+	request_queue_t *q = req->q;
 	unsigned long flags;
 
 	if (end_that_request_chunk(req, uptodate, nr_bytes)) {
-		int leftover = (req->hard_nr_sectors << 9);
+		int leftover = (req->hard_nr_sectors << KERNEL_SECTOR_SHIFT);
 
 		if (blk_pc_request(req))
 			leftover = req->data_len;
@@ -432,8 +467,11 @@ static void i2o_block_end_request(struct request *req, int uptodate,
 	spin_lock_irqsave(q->queue_lock, flags);
 
 	end_that_request_last(req);
-	dev->open_queue_depth--;
-	list_del(&ireq->queue);
+
+	if (likely(dev)) {
+		dev->open_queue_depth--;
+		list_del(&ireq->queue);
+	}
 
 	blk_start_queue(q);
 
@@ -483,8 +521,8 @@ static int i2o_block_reply(struct i2o_controller *c, u32 m,
 		 *      Don't stick a supertrak100 into cache aggressive modes
 		 */
 
-		osm_err("%03x error status: %02x, detailed status: %04x\n",
-			(le32_to_cpu(msg->u.head[1]) >> 12 & 0xfff),
+		osm_err("TID %03x error status: 0x%02x, detailed status: "
+			"0x%04x\n", (le32_to_cpu(msg->u.head[1]) >> 12 & 0xfff),
 			status >> 24, status & 0xffff);
 
 		req->errors++;
@@ -705,18 +743,25 @@ static int i2o_block_media_changed(struct gendisk *disk)
 static int i2o_block_transfer(struct request *req)
 {
 	struct i2o_block_device *dev = req->rq_disk->private_data;
-	struct i2o_controller *c = dev->i2o_dev->iop;
+	struct i2o_controller *c;
 	int tid = dev->i2o_dev->lct_data.tid;
 	struct i2o_message __iomem *msg;
-	void __iomem *mptr;
+	u32 __iomem *mptr;
 	struct i2o_block_request *ireq = req->special;
-	struct scatterlist *sg;
-	int sgnum;
-	int i;
 	u32 m;
 	u32 tcntxt;
-	u32 sg_flags;
+	u32 sgl_offset = SGL_OFFSET_8;
+	u32 ctl_flags = 0x00000000;
 	int rc;
+	u32 cmd;
+
+	if (unlikely(!dev->i2o_dev)) {
+		osm_err("transfer to removed drive\n");
+		rc = -ENODEV;
+		goto exit;
+	}
+
+	c = dev->i2o_dev->iop;
 
 	m = i2o_msg_get(c, &msg);
 	if (m == I2O_QUEUE_EMPTY) {
@@ -730,80 +775,109 @@ static int i2o_block_transfer(struct request *req)
 		goto nop_msg;
 	}
 
-	if ((sgnum = i2o_block_sglist_alloc(ireq)) <= 0) {
-		rc = -ENOMEM;
-		goto context_remove;
-	}
-
-	/* Build the message based on the request. */
 	writel(i2o_block_driver.context, &msg->u.s.icntxt);
 	writel(tcntxt, &msg->u.s.tcntxt);
-	writel(req->nr_sectors << 9, &msg->body[1]);
 
-	writel((((u64) req->sector) << 9) & 0xffffffff, &msg->body[2]);
-	writel(req->sector >> 23, &msg->body[3]);
-
-	mptr = &msg->body[4];
-
-	sg = ireq->sg_table;
+	mptr = &msg->body[0];
 
 	if (rq_data_dir(req) == READ) {
-		writel(I2O_CMD_BLOCK_READ << 24 | HOST_TID << 12 | tid,
-		       &msg->u.head[1]);
-		sg_flags = 0x10000000;
+		cmd = I2O_CMD_BLOCK_READ << 24;
+
 		switch (dev->rcache) {
-		case CACHE_NULL:
-			writel(0, &msg->body[0]);
-			break;
 		case CACHE_PREFETCH:
-			writel(0x201F0008, &msg->body[0]);
+			ctl_flags = 0x201F0008;
 			break;
+
 		case CACHE_SMARTFETCH:
 			if (req->nr_sectors > 16)
-				writel(0x201F0008, &msg->body[0]);
+				ctl_flags = 0x201F0008;
 			else
-				writel(0x001F0000, &msg->body[0]);
+				ctl_flags = 0x001F0000;
+			break;
+
+		default:
 			break;
 		}
 	} else {
-		writel(I2O_CMD_BLOCK_WRITE << 24 | HOST_TID << 12 | tid,
-		       &msg->u.head[1]);
-		sg_flags = 0x14000000;
+		cmd = I2O_CMD_BLOCK_WRITE << 24;
+
 		switch (dev->wcache) {
-		case CACHE_NULL:
-			writel(0, &msg->body[0]);
-			break;
 		case CACHE_WRITETHROUGH:
-			writel(0x001F0008, &msg->body[0]);
+			ctl_flags = 0x001F0008;
 			break;
 		case CACHE_WRITEBACK:
-			writel(0x001F0010, &msg->body[0]);
+			ctl_flags = 0x001F0010;
 			break;
 		case CACHE_SMARTBACK:
 			if (req->nr_sectors > 16)
-				writel(0x001F0004, &msg->body[0]);
+				ctl_flags = 0x001F0004;
 			else
-				writel(0x001F0010, &msg->body[0]);
+				ctl_flags = 0x001F0010;
 			break;
 		case CACHE_SMARTTHROUGH:
 			if (req->nr_sectors > 16)
-				writel(0x001F0004, &msg->body[0]);
+				ctl_flags = 0x001F0004;
 			else
-				writel(0x001F0010, &msg->body[0]);
+				ctl_flags = 0x001F0010;
+		default:
+			break;
+		}
+	}
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+	if (c->adaptec) {
+		u8 cmd[10];
+		u32 scsi_flags;
+		u16 hwsec = queue_hardsect_size(req->q) >> KERNEL_SECTOR_SHIFT;
+
+		memset(cmd, 0, 10);
+
+		sgl_offset = SGL_OFFSET_12;
+
+		writel(I2O_CMD_PRIVATE << 24 | HOST_TID << 12 | tid,
+		       &msg->u.head[1]);
+
+		writel(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC, mptr++);
+		writel(tid, mptr++);
+
+		/*
+		 * ENABLE_DISCONNECT
+		 * SIMPLE_TAG
+		 * RETURN_SENSE_DATA_IN_REPLY_MESSAGE_FRAME
+		 */
+		if (rq_data_dir(req) == READ) {
+			cmd[0] = 0x28;
+			scsi_flags = 0x60a0000a;
+		} else {
+			cmd[0] = 0x2A;
+			scsi_flags = 0xa0a0000a;
 		}
+
+		writel(scsi_flags, mptr++);
+
+		*((u32 *) & cmd[2]) = cpu_to_be32(req->sector * hwsec);
+		*((u16 *) & cmd[7]) = cpu_to_be16(req->nr_sectors * hwsec);
+
+		memcpy_toio(mptr, cmd, 10);
+		mptr += 4;
+		writel(req->nr_sectors << KERNEL_SECTOR_SHIFT, mptr++);
+	} else
+#endif
+	{
+		writel(cmd | HOST_TID << 12 | tid, &msg->u.head[1]);
+		writel(ctl_flags, mptr++);
+		writel(req->nr_sectors << KERNEL_SECTOR_SHIFT, mptr++);
+		writel((u32) (req->sector << KERNEL_SECTOR_SHIFT), mptr++);
+		writel(req->sector >> (32 - KERNEL_SECTOR_SHIFT), mptr++);
 	}
 
-	for (i = sgnum; i > 0; i--) {
-		if (i == 1)
-			sg_flags |= 0x80000000;
-		writel(sg_flags | sg_dma_len(sg), mptr);
-		writel(sg_dma_address(sg), mptr + 4);
-		mptr += 8;
-		sg++;
+	if (!i2o_block_sglist_alloc(c, ireq, &mptr)) {
+		rc = -ENOMEM;
+		goto context_remove;
 	}
 
-	writel(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | SGL_OFFSET_8,
-	       &msg->u.head[0]);
+	writel(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) |
+	       sgl_offset, &msg->u.head[0]);
 
 	list_add_tail(&ireq->queue, &dev->open_queue);
 	dev->open_queue_depth++;
@@ -846,11 +920,13 @@ static void i2o_block_request_fn(struct request_queue *q)
 
 			queue_depth = ireq->i2o_blk_dev->open_queue_depth;
 
-			if (queue_depth < I2O_BLOCK_MAX_OPEN_REQUESTS)
+			if (queue_depth < I2O_BLOCK_MAX_OPEN_REQUESTS) {
 				if (!i2o_block_transfer(req)) {
 					blkdev_dequeue_request(req);
 					continue;
-				}
+				} else
+					osm_info("transfer error\n");
+			}
 
 			if (queue_depth)
 				break;
@@ -933,6 +1009,7 @@ static struct i2o_block_device *i2o_block_device_alloc(void)
 	}
 
 	blk_queue_prep_rq(queue, i2o_block_prep_req_fn);
+	blk_queue_issue_flush_fn(queue, i2o_block_issue_flush);
 
 	gd->major = I2O_MAJOR;
 	gd->queue = queue;
@@ -974,7 +1051,18 @@ static int i2o_block_probe(struct device *dev)
 	u64 size;
 	u32 blocksize;
 	u32 flags, status;
-	int segments;
+	u16 body_size = 4;
+	unsigned short max_sectors;
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+	if (c->adaptec)
+		body_size = 8;
+#endif
+
+	if (c->limit_sectors)
+		max_sectors = I2O_MAX_SECTORS_LIMITED;
+	else
+		max_sectors = I2O_MAX_SECTORS;
 
 	/* skip devices which are used by IOP */
 	if (i2o_dev->lct_data.user_tid != 0xfff) {
@@ -1009,50 +1097,35 @@ static int i2o_block_probe(struct device *dev)
 	queue = gd->queue;
 	queue->queuedata = i2o_blk_dev;
 
-	blk_queue_max_phys_segments(queue, I2O_MAX_SEGMENTS);
-	blk_queue_max_sectors(queue, I2O_MAX_SECTORS);
-
-	if (c->short_req)
-		segments = 8;
-	else {
-		i2o_status_block *sb;
+	blk_queue_max_phys_segments(queue, I2O_MAX_PHYS_SEGMENTS);
+	blk_queue_max_sectors(queue, max_sectors);
+	blk_queue_max_hw_segments(queue, i2o_sg_tablesize(c, body_size));
 
-		sb = c->status_block.virt;
-
-		segments = (sb->inbound_frame_size -
-			    sizeof(struct i2o_message) / 4 - 4) / 2;
-	}
-
-	blk_queue_max_hw_segments(queue, segments);
-
-	osm_debug("max sectors = %d\n", I2O_MAX_SECTORS);
-	osm_debug("phys segments = %d\n", I2O_MAX_SEGMENTS);
-	osm_debug("hw segments = %d\n", segments);
+	osm_debug("max sectors = %d\n", queue->max_sectors);
+	osm_debug("phys segments = %d\n", queue->max_phys_segments);
+	osm_debug("max hw segments = %d\n", queue->max_hw_segments);
 
 	/*
 	 *      Ask for the current media data. If that isn't supported
 	 *      then we ask for the device capacity data
 	 */
-	if (!i2o_parm_field_get(i2o_dev, 0x0004, 0, &size, 8))
-		if (!i2o_parm_field_get(i2o_dev, 0x0000, 4, &size, 8)) {
-			osm_warn("could not get size of %s\n", gd->disk_name);
-			size = 0;
-		}
+	if (i2o_parm_field_get(i2o_dev, 0x0004, 1, &blocksize, 4) ||
+	    i2o_parm_field_get(i2o_dev, 0x0000, 3, &blocksize, 4)) {
+		blk_queue_hardsect_size(queue, blocksize);
+	} else
+		osm_warn("unable to get blocksize of %s\n", gd->disk_name);
 
-	if (!i2o_parm_field_get(i2o_dev, 0x0004, 1, &blocksize, 4))
-		if (!i2o_parm_field_get(i2o_dev, 0x0000, 3, &blocksize, 4)) {
-			osm_warn("unable to get blocksize of %s\n",
-				 gd->disk_name);
-			blocksize = 0;
-		}
+	if (i2o_parm_field_get(i2o_dev, 0x0004, 0, &size, 8) ||
+	    i2o_parm_field_get(i2o_dev, 0x0000, 4, &size, 8)) {
+		set_capacity(gd, size >> KERNEL_SECTOR_SHIFT);
+	} else
+		osm_warn("could not get size of %s\n", gd->disk_name);
 
 	if (!i2o_parm_field_get(i2o_dev, 0x0000, 2, &i2o_blk_dev->power, 2))
 		i2o_blk_dev->power = 0;
 	i2o_parm_field_get(i2o_dev, 0x0000, 5, &flags, 4);
 	i2o_parm_field_get(i2o_dev, 0x0000, 6, &status, 4);
 
-	set_capacity(gd, size >> 9);
-
 	i2o_event_register(i2o_dev, &i2o_block_driver, 0, 0xffffffff);
 
 	add_disk(gd);
@@ -1109,7 +1182,7 @@ static int __init i2o_block_init(void)
 		goto exit;
 	}
 
-	i2o_blk_req_pool.pool = mempool_create(I2O_REQ_MEMPOOL_SIZE,
+	i2o_blk_req_pool.pool = mempool_create(I2O_BLOCK_REQ_MEMPOOL_SIZE,
 					       mempool_alloc_slab,
 					       mempool_free_slab,
 					       i2o_blk_req_pool.slab);
diff --git a/drivers/message/i2o/i2o_block.h b/drivers/message/i2o/i2o_block.h
index 712111ffa638..9e1a95fb0833 100644
--- a/drivers/message/i2o/i2o_block.h
+++ b/drivers/message/i2o/i2o_block.h
@@ -84,9 +84,9 @@ struct i2o_block_request
 	struct list_head queue;
 	struct request *req;		/* corresponding request */
 	struct i2o_block_device *i2o_blk_dev;	/* I2O block device */
-	int sg_dma_direction;		/* direction of DMA buffer read/write */
+	struct device *dev;		/* device used for DMA */
 	int sg_nents;			/* number of SG elements */
-	struct scatterlist sg_table[I2O_MAX_SEGMENTS]; /* SG table */
+	struct scatterlist sg_table[I2O_MAX_PHYS_SEGMENTS]; /* SG table */
 };
 
 /* I2O Block device delayed request */
diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c
index 383e89a5c9f0..849d90aad779 100644
--- a/drivers/message/i2o/i2o_config.c
+++ b/drivers/message/i2o/i2o_config.c
@@ -30,27 +30,11 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/i2o.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/miscdevice.h>
-#include <linux/mm.h>
-#include <linux/spinlock.h>
 #include <linux/smp_lock.h>
-#include <linux/ioctl32.h>
 #include <linux/compat.h>
-#include <linux/syscalls.h>
 
 #include <asm/uaccess.h>
-#include <asm/io.h>
-
-#define OSM_NAME	"config-osm"
-#define OSM_VERSION	"$Rev$"
-#define OSM_DESCRIPTION	"I2O Configuration OSM"
 
 extern int i2o_parm_issue(struct i2o_device *, int, void *, int, void *, int);
 
@@ -80,125 +64,6 @@ struct i2o_cfg_info {
 static struct i2o_cfg_info *open_files = NULL;
 static ulong i2o_cfg_info_id = 0;
 
-/**
- *	i2o_config_read_hrt - Returns the HRT of the controller
- *	@kob: kernel object handle
- *	@buf: buffer into which the HRT should be copied
- *	@off: file offset
- *	@count: number of bytes to read
- *
- *	Put @count bytes starting at @off into @buf from the HRT of the I2O
- *	controller corresponding to @kobj.
- *
- *	Returns number of bytes copied into buffer.
- */
-static ssize_t i2o_config_read_hrt(struct kobject *kobj, char *buf,
-				   loff_t offset, size_t count)
-{
-	struct i2o_controller *c = to_i2o_controller(container_of(kobj,
-								  struct device,
-								  kobj));
-	i2o_hrt *hrt = c->hrt.virt;
-
-	u32 size = (hrt->num_entries * hrt->entry_len + 2) * 4;
-
-	if(offset > size)
-		return 0;
-
-	if(offset + count > size)
-		count = size - offset;
-
-	memcpy(buf, (u8 *) hrt + offset, count);
-
-	return count;
-};
-
-/**
- *	i2o_config_read_lct - Returns the LCT of the controller
- *	@kob: kernel object handle
- *	@buf: buffer into which the LCT should be copied
- *	@off: file offset
- *	@count: number of bytes to read
- *
- *	Put @count bytes starting at @off into @buf from the LCT of the I2O
- *	controller corresponding to @kobj.
- *
- *	Returns number of bytes copied into buffer.
- */
-static ssize_t i2o_config_read_lct(struct kobject *kobj, char *buf,
-				   loff_t offset, size_t count)
-{
-	struct i2o_controller *c = to_i2o_controller(container_of(kobj,
-								  struct device,
-								  kobj));
-	u32 size = c->lct->table_size * 4;
-
-	if(offset > size)
-		return 0;
-
-	if(offset + count > size)
-		count = size - offset;
-
-	memcpy(buf, (u8 *) c->lct + offset, count);
-
-	return count;
-};
-
-/* attribute for HRT in sysfs */
-static struct bin_attribute i2o_config_hrt_attr = {
-	.attr = {
-		.name = "hrt",
-		.mode = S_IRUGO,
-		.owner = THIS_MODULE
-	},
-	.size = 0,
-	.read = i2o_config_read_hrt
-};
-
-/* attribute for LCT in sysfs */
-static struct bin_attribute i2o_config_lct_attr = {
-	.attr = {
-		.name = "lct",
-		.mode = S_IRUGO,
-		.owner = THIS_MODULE
-	},
-	.size = 0,
-	.read = i2o_config_read_lct
-};
-
-/**
- *	i2o_config_notify_controller_add - Notify of added controller
- *	@c: the controller which was added
- *
- *	If a I2O controller is added, we catch the notification to add sysfs
- *	entries.
- */
-static void i2o_config_notify_controller_add(struct i2o_controller *c)
-{
-	sysfs_create_bin_file(&(c->device.kobj), &i2o_config_hrt_attr);
-	sysfs_create_bin_file(&(c->device.kobj), &i2o_config_lct_attr);
-};
-
-/**
- *	i2o_config_notify_controller_remove - Notify of removed controller
- *	@c: the controller which was removed
- *
- *	If a I2O controller is removed, we catch the notification to remove the
- *	sysfs entries.
- */
-static void i2o_config_notify_controller_remove(struct i2o_controller *c)
-{
-	sysfs_remove_bin_file(&c->device.kobj, &i2o_config_lct_attr);
-	sysfs_remove_bin_file(&c->device.kobj, &i2o_config_hrt_attr);
-};
-
-/* Config OSM driver struct */
-static struct i2o_driver i2o_config_driver = {
-	.name = OSM_NAME,
-	.notify_controller_add = i2o_config_notify_controller_add,
-	.notify_controller_remove = i2o_config_notify_controller_remove
-};
-
 static int i2o_cfg_getiops(unsigned long arg)
 {
 	struct i2o_controller *c;
@@ -1257,37 +1122,20 @@ static struct miscdevice i2o_miscdev = {
 	&config_fops
 };
 
-static int __init i2o_config_init(void)
+static int __init i2o_config_old_init(void)
 {
-	printk(KERN_INFO OSM_DESCRIPTION " v" OSM_VERSION "\n");
-
 	spin_lock_init(&i2o_config_lock);
 
 	if (misc_register(&i2o_miscdev) < 0) {
 		osm_err("can't register device.\n");
 		return -EBUSY;
 	}
-	/*
-	 *      Install our handler
-	 */
-	if (i2o_driver_register(&i2o_config_driver)) {
-		osm_err("handler register failed.\n");
-		misc_deregister(&i2o_miscdev);
-		return -EBUSY;
-	}
 	return 0;
 }
 
-static void i2o_config_exit(void)
+static void i2o_config_old_exit(void)
 {
 	misc_deregister(&i2o_miscdev);
-	i2o_driver_unregister(&i2o_config_driver);
 }
 
 MODULE_AUTHOR("Red Hat Software");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION(OSM_DESCRIPTION);
-MODULE_VERSION(OSM_VERSION);
-
-module_init(i2o_config_init);
-module_exit(i2o_config_exit);
diff --git a/drivers/message/i2o/i2o_proc.c b/drivers/message/i2o/i2o_proc.c
index b176d0eeff7f..e5b74452c495 100644
--- a/drivers/message/i2o/i2o_proc.c
+++ b/drivers/message/i2o/i2o_proc.c
@@ -228,7 +228,7 @@ static const char *i2o_get_class_name(int class)
 	case I2O_CLASS_FLOPPY_DEVICE:
 		idx = 12;
 		break;
-	case I2O_CLASS_BUS_ADAPTER_PORT:
+	case I2O_CLASS_BUS_ADAPTER:
 		idx = 13;
 		break;
 	case I2O_CLASS_PEER_TRANSPORT_AGENT:
@@ -490,7 +490,7 @@ static int i2o_seq_show_lct(struct seq_file *seq, void *v)
 				seq_printf(seq, ", Unknown Device Type");
 			break;
 
-		case I2O_CLASS_BUS_ADAPTER_PORT:
+		case I2O_CLASS_BUS_ADAPTER:
 			if (lct->lct_entry[i].sub_class < BUS_TABLE_SIZE)
 				seq_printf(seq, ", %s",
 					   bus_ports[lct->lct_entry[i].
diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c
index 812c29ec86d3..c3b0c29ac02d 100644
--- a/drivers/message/i2o/i2o_scsi.c
+++ b/drivers/message/i2o/i2o_scsi.c
@@ -103,7 +103,7 @@ static struct i2o_scsi_host *i2o_scsi_host_alloc(struct i2o_controller *c)
 	i2o_status_block *sb;
 
 	list_for_each_entry(i2o_dev, &c->devices, list)
-	    if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER_PORT) {
+	    if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) {
 		if (i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1)
 		   && (type == 0x01))	/* SCSI bus */
 			max_channel++;
@@ -139,7 +139,7 @@ static struct i2o_scsi_host *i2o_scsi_host_alloc(struct i2o_controller *c)
 
 	i = 0;
 	list_for_each_entry(i2o_dev, &c->devices, list)
-	    if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER_PORT) {
+	    if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) {
 		if (i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1) || (type == 1))	/* only SCSI bus */
 			i2o_shost->channel[i++] = i2o_dev;
 
@@ -186,6 +186,7 @@ static int i2o_scsi_remove(struct device *dev)
 
 	shost_for_each_device(scsi_dev, i2o_shost->scsi_host)
 	    if (scsi_dev->hostdata == i2o_dev) {
+		sysfs_remove_link(&i2o_dev->device.kobj, "scsi");
 		scsi_remove_device(scsi_dev);
 		scsi_device_put(scsi_dev);
 		break;
@@ -259,12 +260,14 @@ static int i2o_scsi_probe(struct device *dev)
 	scsi_dev =
 	    __scsi_add_device(i2o_shost->scsi_host, channel, id, lun, i2o_dev);
 
-	if (!scsi_dev) {
+	if (IS_ERR(scsi_dev)) {
 		osm_warn("can not add SCSI device %03x\n",
 			 i2o_dev->lct_data.tid);
-		return -EFAULT;
+		return PTR_ERR(scsi_dev);
 	}
 
+	sysfs_create_link(&i2o_dev->device.kobj, &scsi_dev->sdev_gendev.kobj, "scsi");
+
 	osm_info("device added (TID: %03x) channel: %d, id: %d, lun: %d\n",
 		 i2o_dev->lct_data.tid, channel, id, (unsigned int)lun);
 
@@ -545,7 +548,13 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	int tid;
 	struct i2o_message __iomem *msg;
 	u32 m;
-	u32 scsi_flags, sg_flags;
+	/*
+	 * ENABLE_DISCONNECT
+	 * SIMPLE_TAG
+	 * RETURN_SENSE_DATA_IN_REPLY_MESSAGE_FRAME
+	 */
+	u32 scsi_flags = 0x20a00000;
+	u32 sg_flags;
 	u32 __iomem *mptr;
 	u32 __iomem *lenptr;
 	u32 len;
@@ -591,17 +600,19 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 
 	switch (SCpnt->sc_data_direction) {
 	case PCI_DMA_NONE:
-		scsi_flags = 0x00000000;	// DATA NO XFER
+		/* DATA NO XFER */
 		sg_flags = 0x00000000;
 		break;
 
 	case PCI_DMA_TODEVICE:
-		scsi_flags = 0x80000000;	// DATA OUT (iop-->dev)
+		/* DATA OUT (iop-->dev) */
+		scsi_flags |= 0x80000000;
 		sg_flags = 0x14000000;
 		break;
 
 	case PCI_DMA_FROMDEVICE:
-		scsi_flags = 0x40000000;	// DATA IN  (iop<--dev)
+		/* DATA IN  (iop<--dev) */
+		scsi_flags |= 0x40000000;
 		sg_flags = 0x10000000;
 		break;
 
@@ -639,8 +650,7 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	   }
 	 */
 
-	/* Direction, disconnect ok, tag, CDBLen */
-	writel(scsi_flags | 0x20200000 | SCpnt->cmd_len, mptr ++);
+	writel(scsi_flags | SCpnt->cmd_len, mptr++);
 
 	/* Write SCSI command into the message - always 16 byte block */
 	memcpy_toio(mptr, SCpnt->cmnd, 16);
diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c
index 62b0d8bed186..40312053b38d 100644
--- a/drivers/message/i2o/iop.c
+++ b/drivers/message/i2o/iop.c
@@ -455,6 +455,70 @@ static int i2o_iop_clear(struct i2o_controller *c)
 	return rc;
 }
 
+/**
+ *	i2o_iop_init_outbound_queue - setup the outbound message queue
+ *	@c: I2O controller
+ *
+ *	Clear and (re)initialize IOP's outbound queue and post the message
+ *	frames to the IOP.
+ *
+ *	Returns 0 on success or a negative errno code on failure.
+ */
+static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
+{
+	u8 *status = c->status.virt;
+	u32 m;
+	struct i2o_message __iomem *msg;
+	ulong timeout;
+	int i;
+
+	osm_debug("%s: Initializing Outbound Queue...\n", c->name);
+
+	memset(status, 0, 4);
+
+	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
+	if (m == I2O_QUEUE_EMPTY)
+		return -ETIMEDOUT;
+
+	writel(EIGHT_WORD_MSG_SIZE | TRL_OFFSET_6, &msg->u.head[0]);
+	writel(I2O_CMD_OUTBOUND_INIT << 24 | HOST_TID << 12 | ADAPTER_TID,
+	       &msg->u.head[1]);
+	writel(i2o_exec_driver.context, &msg->u.s.icntxt);
+	writel(0x0106, &msg->u.s.tcntxt);	/* FIXME: why 0x0106, maybe in
+						   Spec? */
+	writel(PAGE_SIZE, &msg->body[0]);
+	/* Outbound msg frame size in words and Initcode */
+	writel(MSG_FRAME_SIZE << 16 | 0x80, &msg->body[1]);
+	writel(0xd0000004, &msg->body[2]);
+	writel(i2o_dma_low(c->status.phys), &msg->body[3]);
+	writel(i2o_dma_high(c->status.phys), &msg->body[4]);
+
+	i2o_msg_post(c, m);
+
+	timeout = jiffies + I2O_TIMEOUT_INIT_OUTBOUND_QUEUE * HZ;
+	while (*status <= I2O_CMD_IN_PROGRESS) {
+		if (time_after(jiffies, timeout)) {
+			osm_warn("%s: Timeout Initializing\n", c->name);
+			return -ETIMEDOUT;
+		}
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(1);
+
+		rmb();
+	}
+
+	m = c->out_queue.phys;
+
+	/* Post frames */
+	for (i = 0; i < NMBR_MSG_FRAMES; i++) {
+		i2o_flush_reply(c, m);
+		udelay(1);	/* Promise */
+		m += MSG_FRAME_SIZE * 4;
+	}
+
+	return 0;
+}
+
 /**
  *	i2o_iop_reset - reset an I2O controller
  *	@c: controller to reset
@@ -491,25 +555,16 @@ static int i2o_iop_reset(struct i2o_controller *c)
 	writel(0, &msg->u.s.tcntxt);	//FIXME: use reasonable transaction context
 	writel(0, &msg->body[0]);
 	writel(0, &msg->body[1]);
-	writel(i2o_ptr_low((void *)c->status.phys), &msg->body[2]);
-	writel(i2o_ptr_high((void *)c->status.phys), &msg->body[3]);
+	writel(i2o_dma_low(c->status.phys), &msg->body[2]);
+	writel(i2o_dma_high(c->status.phys), &msg->body[3]);
 
 	i2o_msg_post(c, m);
 
 	/* Wait for a reply */
 	timeout = jiffies + I2O_TIMEOUT_RESET * HZ;
 	while (!*status) {
-		if (time_after(jiffies, timeout)) {
-			printk(KERN_ERR "%s: IOP reset timeout.\n", c->name);
-			rc = -ETIMEDOUT;
-			goto exit;
-		}
-
-		/* Promise bug */
-		if (status[1] || status[4]) {
-			*status = 0;
+		if (time_after(jiffies, timeout))
 			break;
-		}
 
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule_timeout(1);
@@ -517,14 +572,20 @@ static int i2o_iop_reset(struct i2o_controller *c)
 		rmb();
 	}
 
-	if (*status == I2O_CMD_IN_PROGRESS) {
+	switch (*status) {
+	case I2O_CMD_REJECTED:
+		osm_warn("%s: IOP reset rejected\n", c->name);
+		rc = -EPERM;
+		break;
+
+	case I2O_CMD_IN_PROGRESS:
 		/*
 		 * Once the reset is sent, the IOP goes into the INIT state
-		 * which is indeterminate.  We need to wait until the IOP
-		 * has rebooted before we can let the system talk to
-		 * it. We read the inbound Free_List until a message is
-		 * available. If we can't read one in the given ammount of
-		 * time, we assume the IOP could not reboot properly.
+		 * which is indeterminate. We need to wait until the IOP has
+		 * rebooted before we can let the system talk to it. We read
+		 * the inbound Free_List until a message is available. If we
+		 * can't read one in the given amount of time, we assume the
+		 * IOP could not reboot properly.
 		 */
 		pr_debug("%s: Reset in progress, waiting for reboot...\n",
 			 c->name);
@@ -543,19 +604,26 @@ static int i2o_iop_reset(struct i2o_controller *c)
 			m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_RESET);
 		}
 		i2o_msg_nop(c, m);
-	}
 
-	/* from here all quiesce commands are safe */
-	c->no_quiesce = 0;
+		/* from here all quiesce commands are safe */
+		c->no_quiesce = 0;
 
-	/* If IopReset was rejected or didn't perform reset, try IopClear */
-	i2o_status_get(c);
-	if (*status == I2O_CMD_REJECTED || sb->iop_state != ADAPTER_STATE_RESET) {
-		printk(KERN_WARNING "%s: Reset rejected, trying to clear\n",
-		       c->name);
-		i2o_iop_clear(c);
-	} else
-		pr_debug("%s: Reset completed.\n", c->name);
+		/* verify if controller is in state RESET */
+		i2o_status_get(c);
+
+		if (!c->promise && (sb->iop_state != ADAPTER_STATE_RESET))
+			osm_warn("%s: reset completed, but adapter not in RESET"
+				 " state.\n", c->name);
+		else
+			osm_debug("%s: reset completed.\n", c->name);
+
+		break;
+
+	default:
+		osm_err("%s: IOP reset timeout.\n", c->name);
+		rc = -ETIMEDOUT;
+		break;
+	}
 
       exit:
 	/* Enable all IOPs */
@@ -564,87 +632,6 @@ static int i2o_iop_reset(struct i2o_controller *c)
 	return rc;
 };
 
-/**
- *	i2o_iop_init_outbound_queue - setup the outbound message queue
- *	@c: I2O controller
- *
- *	Clear and (re)initialize IOP's outbound queue and post the message
- *	frames to the IOP.
- *
- *	Returns 0 on success or a negative errno code on failure.
- */
-static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
-{
-	u8 *status = c->status.virt;
-	u32 m;
-	struct i2o_message __iomem *msg;
-	ulong timeout;
-	int i;
-
-	pr_debug("%s: Initializing Outbound Queue...\n", c->name);
-
-	memset(status, 0, 4);
-
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
-
-	writel(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6, &msg->u.head[0]);
-	writel(I2O_CMD_OUTBOUND_INIT << 24 | HOST_TID << 12 | ADAPTER_TID,
-	       &msg->u.head[1]);
-	writel(i2o_exec_driver.context, &msg->u.s.icntxt);
-	writel(0x00000000, &msg->u.s.tcntxt);
-	writel(PAGE_SIZE, &msg->body[0]);
-	writel(MSG_FRAME_SIZE << 16 | 0x80, &msg->body[1]);	/* Outbound msg frame
-								   size in words and Initcode */
-	writel(0xd0000004, &msg->body[2]);
-	writel(i2o_ptr_low((void *)c->status.phys), &msg->body[3]);
-	writel(i2o_ptr_high((void *)c->status.phys), &msg->body[4]);
-
-	i2o_msg_post(c, m);
-
-	timeout = jiffies + I2O_TIMEOUT_INIT_OUTBOUND_QUEUE * HZ;
-	while (*status <= I2O_CMD_IN_PROGRESS) {
-		if (time_after(jiffies, timeout)) {
-			printk(KERN_WARNING "%s: Timeout Initializing\n",
-			       c->name);
-			return -ETIMEDOUT;
-		}
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		schedule_timeout(1);
-
-		rmb();
-	}
-
-	m = c->out_queue.phys;
-
-	/* Post frames */
-	for (i = 0; i < NMBR_MSG_FRAMES; i++) {
-		i2o_flush_reply(c, m);
-		udelay(1);	/* Promise */
-		m += MSG_FRAME_SIZE * 4;
-	}
-
-	return 0;
-}
-
-/**
- *	i2o_iop_send_nop - send a core NOP message
- *	@c: controller
- *
- *	Send a no-operation message with a reply set to cause no
- *	action either. Needed for bringing up promise controllers.
- */
-static int i2o_iop_send_nop(struct i2o_controller *c)
-{
-	struct i2o_message __iomem *msg;
-	u32 m = i2o_msg_get_wait(c, &msg, HZ);
-	if (m == I2O_QUEUE_EMPTY)
-		return -ETIMEDOUT;
-	i2o_msg_nop(c, m);
-	return 0;
-}
-
 /**
  *	i2o_iop_activate - Bring controller up to HOLD
  *	@c: controller
@@ -656,26 +643,9 @@ static int i2o_iop_send_nop(struct i2o_controller *c)
  */
 static int i2o_iop_activate(struct i2o_controller *c)
 {
-	struct pci_dev *i960 = NULL;
 	i2o_status_block *sb = c->status_block.virt;
 	int rc;
-
-	if (c->promise) {
-		/* Beat up the hardware first of all */
-		i960 =
-		    pci_find_slot(c->pdev->bus->number,
-				  PCI_DEVFN(PCI_SLOT(c->pdev->devfn), 0));
-		if (i960)
-			pci_write_config_word(i960, 0x42, 0);
-
-		/* Follow this sequence precisely or the controller
-		   ceases to perform useful functions until reboot */
-		if ((rc = i2o_iop_send_nop(c)))
-			return rc;
-
-		if ((rc = i2o_iop_reset(c)))
-			return rc;
-	}
+	int state;
 
 	/* In INIT state, Wait Inbound Q to initialize (in i2o_status_get) */
 	/* In READY state, Get status */
@@ -684,7 +654,8 @@ static int i2o_iop_activate(struct i2o_controller *c)
 	if (rc) {
 		printk(KERN_INFO "%s: Unable to obtain status, "
 		       "attempting a reset.\n", c->name);
-		if (i2o_iop_reset(c))
+		rc = i2o_iop_reset(c);
+		if (rc)
 			return rc;
 	}
 
@@ -697,37 +668,37 @@ static int i2o_iop_activate(struct i2o_controller *c)
 	switch (sb->iop_state) {
 	case ADAPTER_STATE_FAULTED:
 		printk(KERN_CRIT "%s: hardware fault\n", c->name);
-		return -ENODEV;
+		return -EFAULT;
 
 	case ADAPTER_STATE_READY:
 	case ADAPTER_STATE_OPERATIONAL:
 	case ADAPTER_STATE_HOLD:
 	case ADAPTER_STATE_FAILED:
 		pr_debug("%s: already running, trying to reset...\n", c->name);
-		if (i2o_iop_reset(c))
-			return -ENODEV;
+		rc = i2o_iop_reset(c);
+		if (rc)
+			return rc;
 	}
 
+	/* preserve state */
+	state = sb->iop_state;
+
 	rc = i2o_iop_init_outbound_queue(c);
 	if (rc)
 		return rc;
 
-	if (c->promise) {
-		if ((rc = i2o_iop_send_nop(c)))
-			return rc;
+	/* if adapter was not in RESET state clear now */
+	if (state != ADAPTER_STATE_RESET)
+		i2o_iop_clear(c);
 
-		if ((rc = i2o_status_get(c)))
-			return rc;
+	i2o_status_get(c);
 
-		if (i960)
-			pci_write_config_word(i960, 0x42, 0x3FF);
+	if (sb->iop_state != ADAPTER_STATE_HOLD) {
+		osm_err("%s: failed to bring IOP into HOLD state\n", c->name);
+		return -EIO;
 	}
 
-	/* In HOLD state */
-
-	rc = i2o_hrt_get(c);
-
-	return rc;
+	return i2o_hrt_get(c);
 };
 
 /**
@@ -1030,8 +1001,8 @@ int i2o_status_get(struct i2o_controller *c)
 	writel(0, &msg->u.s.tcntxt);	// FIXME: use resonable transaction context
 	writel(0, &msg->body[0]);
 	writel(0, &msg->body[1]);
-	writel(i2o_ptr_low((void *)c->status_block.phys), &msg->body[2]);
-	writel(i2o_ptr_high((void *)c->status_block.phys), &msg->body[3]);
+	writel(i2o_dma_low(c->status_block.phys), &msg->body[2]);
+	writel(i2o_dma_high(c->status_block.phys), &msg->body[3]);
 	writel(sizeof(i2o_status_block), &msg->body[4]);	/* always 88 bytes */
 
 	i2o_msg_post(c, m);
diff --git a/drivers/message/i2o/pci.c b/drivers/message/i2o/pci.c
index f33fd81f77a4..a499af096a68 100644
--- a/drivers/message/i2o/pci.c
+++ b/drivers/message/i2o/pci.c
@@ -49,30 +49,6 @@ static struct pci_device_id __devinitdata i2o_pci_ids[] = {
 	{0}
 };
 
-/**
- *	i2o_dma_realloc - Realloc DMA memory
- *	@dev: struct device pointer to the PCI device of the I2O controller
- *	@addr: pointer to a i2o_dma struct DMA buffer
- *	@len: new length of memory
- *	@gfp_mask: GFP mask
- *
- *	If there was something allocated in the addr, free it first. If len > 0
- *	than try to allocate it and write the addresses back to the addr
- *	structure. If len == 0 set the virtual address to NULL.
- *
- *	Returns the 0 on success or negative error code on failure.
- */
-int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr, size_t len,
-		    unsigned int gfp_mask)
-{
-	i2o_dma_free(dev, addr);
-
-	if (len)
-		return i2o_dma_alloc(dev, addr, len, gfp_mask);
-
-	return 0;
-};
-
 /**
  *	i2o_pci_free - Frees the DMA memory for the I2O controller
  *	@c: I2O controller to free
@@ -185,6 +161,7 @@ static int __devinit i2o_pci_alloc(struct i2o_controller *c)
 	} else
 		c->in_queue = c->base;
 
+	c->irq_status = c->base.virt + I2O_IRQ_STATUS;
 	c->irq_mask = c->base.virt + I2O_IRQ_MASK;
 	c->in_port = c->base.virt + I2O_IN_PORT;
 	c->out_port = c->base.virt + I2O_OUT_PORT;
@@ -232,36 +209,30 @@ static int __devinit i2o_pci_alloc(struct i2o_controller *c)
 static irqreturn_t i2o_pci_interrupt(int irq, void *dev_id, struct pt_regs *r)
 {
 	struct i2o_controller *c = dev_id;
-	struct device *dev = &c->pdev->dev;
-	u32 mv = readl(c->out_port);
-
-	/*
-	 * Old 960 steppings had a bug in the I2O unit that caused
-	 * the queue to appear empty when it wasn't.
-	 */
-	if (mv == I2O_QUEUE_EMPTY) {
-		mv = readl(c->out_port);
-		if (unlikely(mv == I2O_QUEUE_EMPTY))
-			return IRQ_NONE;
-		else
-			pr_debug("%s: 960 bug detected\n", c->name);
-	}
+	u32 m;
+	irqreturn_t rc = IRQ_NONE;
+
+	while (readl(c->irq_status) & I2O_IRQ_OUTBOUND_POST) {
+		m = readl(c->out_port);
+		if (m == I2O_QUEUE_EMPTY) {
+			/*
+			 * Old 960 steppings had a bug in the I2O unit that
+			 * caused the queue to appear empty when it wasn't.
+			 */
+			m = readl(c->out_port);
+			if (unlikely(m == I2O_QUEUE_EMPTY))
+				break;
+		}
 
-	while (mv != I2O_QUEUE_EMPTY) {
 		/* dispatch it */
-		if (i2o_driver_dispatch(c, mv))
+		if (i2o_driver_dispatch(c, m))
 			/* flush it if result != 0 */
-			i2o_flush_reply(c, mv);
+			i2o_flush_reply(c, m);
 
-		/*
-		 * That 960 bug again...
-		 */
-		mv = readl(c->out_port);
-		if (mv == I2O_QUEUE_EMPTY)
-			mv = readl(c->out_port);
+		rc = IRQ_HANDLED;
 	}
 
-	return IRQ_HANDLED;
+	return rc;
 }
 
 /**
diff --git a/include/linux/i2o-dev.h b/include/linux/i2o-dev.h
index 3414325bdcfd..90c984ecd521 100644
--- a/include/linux/i2o-dev.h
+++ b/include/linux/i2o-dev.h
@@ -32,6 +32,10 @@ typedef unsigned int u32;
 
 #endif				/* __KERNEL__ */
 
+/*
+ *	Vendors
+ */
+#define I2O_VENDOR_DPT				0x001b
 
 /*
  * I2O Control IOCTLs and structures
@@ -333,7 +337,7 @@ typedef struct _i2o_status_block {
 #define I2O_CLASS_ATE_PERIPHERAL		0x061
 #define I2O_CLASS_FLOPPY_CONTROLLER		0x070
 #define I2O_CLASS_FLOPPY_DEVICE 		0x071
-#define I2O_CLASS_BUS_ADAPTER_PORT		0x080
+#define I2O_CLASS_BUS_ADAPTER			0x080
 #define I2O_CLASS_PEER_TRANSPORT_AGENT		0x090
 #define I2O_CLASS_PEER_TRANSPORT		0x091
 #define	I2O_CLASS_END				0xfff
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index e8cd11290010..497ea574f96b 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -157,7 +157,8 @@ struct i2o_controller {
 
 	void __iomem *in_port;	/* Inbout port address */
 	void __iomem *out_port;	/* Outbound port address */
-	void __iomem *irq_mask;		/* Interrupt register address */
+	void __iomem *irq_status;	/* Interrupt status register address */
+	void __iomem *irq_mask;	/* Interrupt mask register address */
 
 	/* Dynamic LCT related data */
 
@@ -242,15 +243,6 @@ extern int i2o_msg_post_wait_mem(struct i2o_controller *, u32, unsigned long,
 extern void i2o_msg_nop(struct i2o_controller *, u32);
 static inline void i2o_flush_reply(struct i2o_controller *, u32);
 
-/* DMA handling functions */
-static inline int i2o_dma_alloc(struct device *, struct i2o_dma *, size_t,
-				unsigned int);
-static inline void i2o_dma_free(struct device *, struct i2o_dma *);
-int i2o_dma_realloc(struct device *, struct i2o_dma *, size_t, unsigned int);
-
-static inline int i2o_dma_map(struct device *, struct i2o_dma *);
-static inline void i2o_dma_unmap(struct device *, struct i2o_dma *);
-
 /* IOP functions */
 extern int i2o_status_get(struct i2o_controller *);
 
@@ -275,6 +267,16 @@ static inline u32 i2o_ptr_high(void *ptr)
 {
 	return (u32) ((u64) ptr >> 32);
 };
+
+static inline u32 i2o_dma_low(dma_addr_t dma_addr)
+{
+	return (u32) (u64) dma_addr;
+};
+
+static inline u32 i2o_dma_high(dma_addr_t dma_addr)
+{
+	return (u32) ((u64) dma_addr >> 32);
+};
 #else
 static inline u32 i2o_cntxt_list_add(struct i2o_controller *c, void *ptr)
 {
@@ -305,8 +307,246 @@ static inline u32 i2o_ptr_high(void *ptr)
 {
 	return 0;
 };
+
+static inline u32 i2o_dma_low(dma_addr_t dma_addr)
+{
+	return (u32) dma_addr;
+};
+
+static inline u32 i2o_dma_high(dma_addr_t dma_addr)
+{
+	return 0;
+};
+#endif
+
+/**
+ *	i2o_sg_tablesize - Calculate the maximum number of elements in a SGL
+ *	@c: I2O controller for which the calculation should be done
+ *	@body_size: maximum body size used for message in 32-bit words.
+ *
+ *	Return the maximum number of SG elements in a SG list.
+ */
+static inline u16 i2o_sg_tablesize(struct i2o_controller *c, u16 body_size)
+{
+	i2o_status_block *sb = c->status_block.virt;
+	u16 sg_count =
+	    (sb->inbound_frame_size - sizeof(struct i2o_message) / 4) -
+	    body_size;
+
+	if (c->pae_support) {
+		/*
+		 * for 64-bit a SG attribute element must be added and each
+		 * SG element needs 12 bytes instead of 8.
+		 */
+		sg_count -= 2;
+		sg_count /= 3;
+	} else
+		sg_count /= 2;
+
+	if (c->short_req && (sg_count > 8))
+		sg_count = 8;
+
+	return sg_count;
+};
+
+/**
+ *	i2o_dma_map_single - Map pointer to controller and fill in I2O message.
+ *	@c: I2O controller
+ *	@ptr: pointer to the data which should be mapped
+ *	@size: size of data in bytes
+ *	@direction: DMA_TO_DEVICE / DMA_FROM_DEVICE
+ *	@sg_ptr: pointer to the SG list inside the I2O message
+ *
+ *	This function does all necessary DMA handling and also writes the I2O
+ *	SGL elements into the I2O message. For details on DMA handling see also
+ *	dma_map_single(). The pointer sg_ptr will only be set to the end of the
+ *	SG list if the allocation was successful.
+ *
+ *	Returns DMA address which must be checked for failures using
+ *	dma_mapping_error().
+ */
+static inline dma_addr_t i2o_dma_map_single(struct i2o_controller *c, void *ptr,
+					    size_t size,
+					    enum dma_data_direction direction,
+					    u32 __iomem ** sg_ptr)
+{
+	u32 sg_flags;
+	u32 __iomem *mptr = *sg_ptr;
+	dma_addr_t dma_addr;
+
+	switch (direction) {
+	case DMA_TO_DEVICE:
+		sg_flags = 0xd4000000;
+		break;
+	case DMA_FROM_DEVICE:
+		sg_flags = 0xd0000000;
+		break;
+	default:
+		return 0;
+	}
+
+	dma_addr = dma_map_single(&c->pdev->dev, ptr, size, direction);
+	if (!dma_mapping_error(dma_addr)) {
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+		if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
+			writel(0x7C020002, mptr++);
+			writel(PAGE_SIZE, mptr++);
+		}
+#endif
+
+		writel(sg_flags | size, mptr++);
+		writel(i2o_dma_low(dma_addr), mptr++);
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+		if ((sizeof(dma_addr_t) > 4) && c->pae_support)
+			writel(i2o_dma_high(dma_addr), mptr++);
+#endif
+		*sg_ptr = mptr;
+	}
+	return dma_addr;
+};
+
+/**
+ *	i2o_dma_map_sg - Map a SG List to controller and fill in I2O message.
+ *	@c: I2O controller
+ *	@sg: SG list to be mapped
+ *	@sg_count: number of elements in the SG list
+ *	@direction: DMA_TO_DEVICE / DMA_FROM_DEVICE
+ *	@sg_ptr: pointer to the SG list inside the I2O message
+ *
+ *	This function does all necessary DMA handling and also writes the I2O
+ *	SGL elements into the I2O message. For details on DMA handling see also
+ *	dma_map_sg(). The pointer sg_ptr will only be set to the end of the SG
+ *	list if the allocation was successful.
+ *
+ *	Returns 0 on failure or 1 on success.
+ */
+static inline int i2o_dma_map_sg(struct i2o_controller *c,
+				 struct scatterlist *sg, int sg_count,
+				 enum dma_data_direction direction,
+				 u32 __iomem ** sg_ptr)
+{
+	u32 sg_flags;
+	u32 __iomem *mptr = *sg_ptr;
+
+	switch (direction) {
+	case DMA_TO_DEVICE:
+		sg_flags = 0x14000000;
+		break;
+	case DMA_FROM_DEVICE:
+		sg_flags = 0x10000000;
+		break;
+	default:
+		return 0;
+	}
+
+	sg_count = dma_map_sg(&c->pdev->dev, sg, sg_count, direction);
+	if (!sg_count)
+		return 0;
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+	if ((sizeof(dma_addr_t) > 4) && c->pae_support) {
+		writel(0x7C020002, mptr++);
+		writel(PAGE_SIZE, mptr++);
+	}
 #endif
 
+	while (sg_count-- > 0) {
+		if (!sg_count)
+			sg_flags |= 0xC0000000;
+		writel(sg_flags | sg_dma_len(sg), mptr++);
+		writel(i2o_dma_low(sg_dma_address(sg)), mptr++);
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+		if ((sizeof(dma_addr_t) > 4) && c->pae_support)
+			writel(i2o_dma_high(sg_dma_address(sg)), mptr++);
+#endif
+		sg++;
+	}
+	*sg_ptr = mptr;
+
+	return 1;
+};
+
+/**
+ *	i2o_dma_alloc - Allocate DMA memory
+ *	@dev: struct device pointer to the PCI device of the I2O controller
+ *	@addr: i2o_dma struct which should get the DMA buffer
+ *	@len: length of the new DMA memory
+ *	@gfp_mask: GFP mask
+ *
+ *	Allocate a coherent DMA memory and write the pointers into addr.
+ *
+ *	Returns 0 on success or -ENOMEM on failure.
+ */
+static inline int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr,
+				size_t len, unsigned int gfp_mask)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	int dma_64 = 0;
+
+	if ((sizeof(dma_addr_t) > 4) && (pdev->dma_mask == DMA_64BIT_MASK)) {
+		dma_64 = 1;
+		if (pci_set_dma_mask(pdev, DMA_32BIT_MASK))
+			return -ENOMEM;
+	}
+
+	addr->virt = dma_alloc_coherent(dev, len, &addr->phys, gfp_mask);
+
+	if ((sizeof(dma_addr_t) > 4) && dma_64)
+		if (pci_set_dma_mask(pdev, DMA_64BIT_MASK))
+			printk(KERN_WARNING "i2o: unable to set 64-bit DMA");
+
+	if (!addr->virt)
+		return -ENOMEM;
+
+	memset(addr->virt, 0, len);
+	addr->len = len;
+
+	return 0;
+};
+
+/**
+ *	i2o_dma_free - Free DMA memory
+ *	@dev: struct device pointer to the PCI device of the I2O controller
+ *	@addr: i2o_dma struct which contains the DMA buffer
+ *
+ *	Free a coherent DMA memory and set virtual address of addr to NULL.
+ */
+static inline void i2o_dma_free(struct device *dev, struct i2o_dma *addr)
+{
+	if (addr->virt) {
+		if (addr->phys)
+			dma_free_coherent(dev, addr->len, addr->virt,
+					  addr->phys);
+		else
+			kfree(addr->virt);
+		addr->virt = NULL;
+	}
+};
+
+/**
+ *	i2o_dma_realloc - Realloc DMA memory
+ *	@dev: struct device pointer to the PCI device of the I2O controller
+ *	@addr: pointer to a i2o_dma struct DMA buffer
+ *	@len: new length of memory
+ *	@gfp_mask: GFP mask
+ *
+ *	If there was something allocated in the addr, free it first. If len > 0
+ *	then try to allocate it and write the addresses back to the addr
+ *	structure. If len == 0 set the virtual address to NULL.
+ *
+ *	Returns 0 on success or a negative error code on failure.
+ */
+static inline int i2o_dma_realloc(struct device *dev, struct i2o_dma *addr,
+				  size_t len, unsigned int gfp_mask)
+{
+	i2o_dma_free(dev, addr);
+
+	if (len)
+		return i2o_dma_alloc(dev, addr, len, gfp_mask);
+
+	return 0;
+};
+
 /* I2O driver (OSM) functions */
 extern int i2o_driver_register(struct i2o_driver *);
 extern void i2o_driver_unregister(struct i2o_driver *);
@@ -375,10 +615,11 @@ extern int i2o_device_claim_release(struct i2o_device *);
 /* Exec OSM functions */
 extern int i2o_exec_lct_get(struct i2o_controller *);
 
-/* device / driver conversion functions */
+/* device / driver / kobject conversion functions */
 #define to_i2o_driver(drv) container_of(drv,struct i2o_driver, driver)
 #define to_i2o_device(dev) container_of(dev, struct i2o_device, device)
 #define to_i2o_controller(dev) container_of(dev, struct i2o_controller, device)
+#define kobj_to_i2o_device(kobj) to_i2o_device(container_of(kobj, struct device, kobj))
 
 /**
  *	i2o_msg_get - obtain an I2O message from the IOP
@@ -466,8 +707,10 @@ static inline struct i2o_message __iomem *i2o_msg_out_to_virt(struct
 							      i2o_controller *c,
 							      u32 m)
 {
-	BUG_ON(m < c->out_queue.phys
-	       || m >= c->out_queue.phys + c->out_queue.len);
+	if (unlikely
+	    (m < c->out_queue.phys
+	     || m >= c->out_queue.phys + c->out_queue.len))
+		return NULL;
 
 	return c->out_queue.virt + (m - c->out_queue.phys);
 };
@@ -532,48 +775,6 @@ static inline void i2o_dma_free(struct device *dev, struct i2o_dma *addr)
 	}
 };
 
-/**
- *	i2o_dma_map - Map the memory to DMA
- *	@dev: struct device pointer to the PCI device of the I2O controller
- *	@addr: i2o_dma struct which should be mapped
- *
- *	Map the memory in addr->virt to coherent DMA memory and write the
- *	physical address into addr->phys.
- *
- *	Returns 0 on success or -ENOMEM on failure.
- */
-static inline int i2o_dma_map(struct device *dev, struct i2o_dma *addr)
-{
-	if (!addr->virt)
-		return -EFAULT;
-
-	if (!addr->phys)
-		addr->phys = dma_map_single(dev, addr->virt, addr->len,
-					    DMA_BIDIRECTIONAL);
-	if (!addr->phys)
-		return -ENOMEM;
-
-	return 0;
-};
-
-/**
- *	i2o_dma_unmap - Unmap the DMA memory
- *	@dev: struct device pointer to the PCI device of the I2O controller
- *	@addr: i2o_dma struct which should be unmapped
- *
- *	Unmap the memory in addr->virt from DMA memory.
- */
-static inline void i2o_dma_unmap(struct device *dev, struct i2o_dma *addr)
-{
-	if (!addr->virt)
-		return;
-
-	if (addr->phys) {
-		dma_unmap_single(dev, addr->phys, addr->len, DMA_BIDIRECTIONAL);
-		addr->phys = 0;
-	}
-};
-
 /*
  *	Endian handling wrapped into the macro - keeps the core code
  *	cleaner.
@@ -725,6 +926,14 @@ extern void i2o_debug_state(struct i2o_controller *c);
 #define I2O_CMD_SCSI_ABORT		0x83
 #define I2O_CMD_SCSI_BUSRESET		0x27
 
+/*
+ * Bus Adapter Class
+ */
+#define I2O_CMD_BUS_ADAPTER_RESET	0x85
+#define I2O_CMD_BUS_RESET		0x87
+#define I2O_CMD_BUS_SCAN		0x89
+#define I2O_CMD_BUS_QUIESCE		0x8b
+
 /*
  * Random Block Storage Class
  */
@@ -948,7 +1157,7 @@ extern void i2o_debug_state(struct i2o_controller *c);
 
 /* request queue sizes */
 #define I2O_MAX_SECTORS			1024
-#define I2O_MAX_SEGMENTS		128
+#define I2O_MAX_PHYS_SEGMENTS		MAX_PHYS_SEGMENTS
 
 #define I2O_REQ_MEMPOOL_SIZE		32
 
-- 
cgit v1.2.3-59-g8ed1b


From b2aaee33fbb354a2f08121aa1c1be55841102761 Mon Sep 17 00:00:00 2001
From: Markus Lidel <Markus.Lidel@shadowconnect.com>
Date: Thu, 23 Jun 2005 22:02:19 -0700
Subject: [PATCH] I2O: Adaptec specific SG_IO access, firmware access through
 sysfs and 2400A workaround

Changes:
 - Provide SG_IO access to BLOCK and EXECUTIVE class on Adaptec
   controllers
 - Use PRIVATE messages in SCSI-OSM because on some controllers normal
   SCSI class commands like READ or READ CAPACITY cause errors
 - Use new DMA and SG list creation function
 - Added workaround to limit sectors per request for Adaptec 2400A
   controllers

Signed-off-by: Markus Lidel <Markus.Lidel@shadowconnect.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/message/i2o/Kconfig      |  18 +++
 drivers/message/i2o/i2o_block.h  |   6 +
 drivers/message/i2o/i2o_config.c |   4 +
 drivers/message/i2o/i2o_scsi.c   | 263 +++++++++++++++++++++++----------------
 drivers/message/i2o/pci.c        |  22 ++++
 include/linux/i2o-dev.h          |  22 ++++
 include/linux/i2o.h              |  37 ++++--
 include/scsi/sg_request.h        |  26 ++++
 8 files changed, 282 insertions(+), 116 deletions(-)
 create mode 100644 include/scsi/sg_request.h

(limited to 'include/linux')

diff --git a/drivers/message/i2o/Kconfig b/drivers/message/i2o/Kconfig
index ce278e060aca..94b6d676c5cb 100644
--- a/drivers/message/i2o/Kconfig
+++ b/drivers/message/i2o/Kconfig
@@ -24,6 +24,24 @@ config I2O
 
 	  If unsure, say N.
 
+config I2O_EXT_ADAPTEC
+	bool "Enable Adaptec extensions"
+	depends on I2O
+	default y
+	---help---
+	  Say Y for support of raidutils for Adaptec I2O controllers. You also
+	  have to say Y to "I2O Configuration support", "I2O SCSI OSM" below
+	  and to "SCSI generic support" under "SCSI device configuration".
+
+config I2O_EXT_ADAPTEC_DMA64
+	bool "Enable 64-bit DMA"
+	depends on I2O_EXT_ADAPTEC && ( 64BIT || HIGHMEM64G )
+	default y
+	---help---
+	  Say Y for support of 64-bit DMA transfer mode on Adaptec I2O
+	  controllers.
+	  Note: You need at least firmware version 3709.
+
 config I2O_CONFIG
 	tristate "I2O Configuration support"
 	depends on PCI && I2O
diff --git a/drivers/message/i2o/i2o_block.h b/drivers/message/i2o/i2o_block.h
index 9e1a95fb0833..e45cc40ce384 100644
--- a/drivers/message/i2o/i2o_block.h
+++ b/drivers/message/i2o/i2o_block.h
@@ -56,6 +56,12 @@
 #define I2O_BLOCK_RETRY_TIME HZ/4
 #define I2O_BLOCK_MAX_OPEN_REQUESTS 50
 
+/* request queue sizes */
+#define I2O_BLOCK_REQ_MEMPOOL_SIZE		32
+
+#define KERNEL_SECTOR_SHIFT 9
+#define KERNEL_SECTOR_SIZE (1 << KERNEL_SECTOR_SHIFT)
+
 /* I2O Block OSM mempool struct */
 struct i2o_block_mempool {
 	kmem_cache_t	*slab;
diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c
index 849d90aad779..7636833b4623 100644
--- a/drivers/message/i2o/i2o_config.c
+++ b/drivers/message/i2o/i2o_config.c
@@ -515,6 +515,7 @@ static int i2o_cfg_evt_get(unsigned long arg, struct file *fp)
 	return 0;
 }
 
+#ifdef CONFIG_I2O_EXT_ADAPTEC
 #ifdef CONFIG_COMPAT
 static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, unsigned long arg)
 {
@@ -964,6 +965,7 @@ static int i2o_cfg_passthru(unsigned long arg)
 	kfree(reply);
 	return rcode;
 }
+#endif
 
 /*
  * IOCTL Handler
@@ -1018,9 +1020,11 @@ static int i2o_cfg_ioctl(struct inode *inode, struct file *fp, unsigned int cmd,
 		ret = i2o_cfg_evt_get(arg, fp);
 		break;
 
+#ifdef CONFIG_I2O_EXT_ADAPTEC
 	case I2OPASSTHRU:
 		ret = i2o_cfg_passthru(arg);
 		break;
+#endif
 
 	default:
 		osm_debug("unknown ioctl called!\n");
diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c
index c3b0c29ac02d..fef53b509a61 100644
--- a/drivers/message/i2o/i2o_scsi.c
+++ b/drivers/message/i2o/i2o_scsi.c
@@ -55,6 +55,7 @@
 #include <linux/pci.h>
 #include <linux/blkdev.h>
 #include <linux/i2o.h>
+#include <linux/scatterlist.h>
 
 #include <asm/dma.h>
 #include <asm/system.h>
@@ -65,19 +66,23 @@
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_request.h>
+#include <scsi/sg.h>
+#include <scsi/sg_request.h>
 
 #define OSM_NAME	"scsi-osm"
-#define OSM_VERSION	"$Rev$"
+#define OSM_VERSION	"1.282"
 #define OSM_DESCRIPTION	"I2O SCSI Peripheral OSM"
 
 static struct i2o_driver i2o_scsi_driver;
 
-static int i2o_scsi_max_id = 16;
-static int i2o_scsi_max_lun = 8;
+static unsigned int i2o_scsi_max_id = 16;
+static unsigned int i2o_scsi_max_lun = 255;
 
 struct i2o_scsi_host {
 	struct Scsi_Host *scsi_host;	/* pointer to the SCSI host */
 	struct i2o_controller *iop;	/* pointer to the I2O controller */
+	unsigned int lun;	/* lun's used for block devices */
 	struct i2o_device *channel[0];	/* channel->i2o_dev mapping table */
 };
 
@@ -100,12 +105,17 @@ static struct i2o_scsi_host *i2o_scsi_host_alloc(struct i2o_controller *c)
 	u8 type;
 	int i;
 	size_t size;
-	i2o_status_block *sb;
+	u16 body_size = 6;
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+	if (c->adaptec)
+		body_size = 8;
+#endif
 
 	list_for_each_entry(i2o_dev, &c->devices, list)
 	    if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) {
 		if (i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1)
-		   && (type == 0x01))	/* SCSI bus */
+		    && (type == 0x01))	/* SCSI bus */
 			max_channel++;
 	}
 
@@ -127,20 +137,18 @@ static struct i2o_scsi_host *i2o_scsi_host_alloc(struct i2o_controller *c)
 	scsi_host->max_id = i2o_scsi_max_id;
 	scsi_host->max_lun = i2o_scsi_max_lun;
 	scsi_host->this_id = c->unit;
-
-	sb = c->status_block.virt;
-
-	scsi_host->sg_tablesize = (sb->inbound_frame_size -
-				   sizeof(struct i2o_message) / 4 - 6) / 2;
+	scsi_host->sg_tablesize = i2o_sg_tablesize(c, body_size);
 
 	i2o_shost = (struct i2o_scsi_host *)scsi_host->hostdata;
 	i2o_shost->scsi_host = scsi_host;
 	i2o_shost->iop = c;
+	i2o_shost->lun = 1;
 
 	i = 0;
 	list_for_each_entry(i2o_dev, &c->devices, list)
 	    if (i2o_dev->lct_data.class_id == I2O_CLASS_BUS_ADAPTER) {
-		if (i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1) || (type == 1))	/* only SCSI bus */
+		if (i2o_parm_field_get(i2o_dev, 0x0000, 0, &type, 1)
+		    && (type == 0x01))	/* only SCSI bus */
 			i2o_shost->channel[i++] = i2o_dev;
 
 		if (i >= max_channel)
@@ -212,8 +220,8 @@ static int i2o_scsi_probe(struct device *dev)
 	struct Scsi_Host *scsi_host;
 	struct i2o_device *parent;
 	struct scsi_device *scsi_dev;
-	u32 id;
-	u64 lun;
+	u32 id = -1;
+	u64 lun = -1;
 	int channel = -1;
 	int i;
 
@@ -223,8 +231,56 @@ static int i2o_scsi_probe(struct device *dev)
 
 	scsi_host = i2o_shost->scsi_host;
 
-	if (i2o_parm_field_get(i2o_dev, 0, 3, &id, 4) < 0)
+	switch (i2o_dev->lct_data.class_id) {
+	case I2O_CLASS_RANDOM_BLOCK_STORAGE:
+	case I2O_CLASS_EXECUTIVE:
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+		if (c->adaptec) {
+			u8 type;
+			struct i2o_device *d = i2o_shost->channel[0];
+
+			if (i2o_parm_field_get(d, 0x0000, 0, &type, 1)
+			    && (type == 0x01))	/* SCSI bus */
+				if (i2o_parm_field_get(d, 0x0200, 4, &id, 4)) {
+					channel = 0;
+					if (i2o_dev->lct_data.class_id ==
+					    I2O_CLASS_RANDOM_BLOCK_STORAGE)
+						lun = i2o_shost->lun++;
+					else
+						lun = 0;
+				}
+		}
+#endif
+		break;
+
+	case I2O_CLASS_SCSI_PERIPHERAL:
+		if (i2o_parm_field_get(i2o_dev, 0x0000, 3, &id, 4) < 0)
+			return -EFAULT;
+
+		if (i2o_parm_field_get(i2o_dev, 0x0000, 4, &lun, 8) < 0)
+			return -EFAULT;
+
+		parent = i2o_iop_find_device(c, i2o_dev->lct_data.parent_tid);
+		if (!parent) {
+			osm_warn("can not find parent of device %03x\n",
+				 i2o_dev->lct_data.tid);
+			return -EFAULT;
+		}
+
+		for (i = 0; i <= i2o_shost->scsi_host->max_channel; i++)
+			if (i2o_shost->channel[i] == parent)
+				channel = i;
+		break;
+
+	default:
+		return -EFAULT;
+	}
+
+	if (channel == -1) {
+		osm_warn("can not find channel of device %03x\n",
+			 i2o_dev->lct_data.tid);
 		return -EFAULT;
+	}
 
 	if (id >= scsi_host->max_id) {
 		osm_warn("SCSI device id (%d) >= max_id of I2O host (%d)", id,
@@ -232,31 +288,12 @@ static int i2o_scsi_probe(struct device *dev)
 		return -EFAULT;
 	}
 
-	if (i2o_parm_field_get(i2o_dev, 0, 4, &lun, 8) < 0)
-		return -EFAULT;
 	if (lun >= scsi_host->max_lun) {
 		osm_warn("SCSI device id (%d) >= max_lun of I2O host (%d)",
 			 (unsigned int)lun, scsi_host->max_lun);
 		return -EFAULT;
 	}
 
-	parent = i2o_iop_find_device(c, i2o_dev->lct_data.parent_tid);
-	if (!parent) {
-		osm_warn("can not find parent of device %03x\n",
-			 i2o_dev->lct_data.tid);
-		return -EFAULT;
-	}
-
-	for (i = 0; i <= i2o_shost->scsi_host->max_channel; i++)
-		if (i2o_shost->channel[i] == parent)
-			channel = i;
-
-	if (channel == -1) {
-		osm_warn("can not find channel of device %03x\n",
-			 i2o_dev->lct_data.tid);
-		return -EFAULT;
-	}
-
 	scsi_dev =
 	    __scsi_add_device(i2o_shost->scsi_host, channel, id, lun, i2o_dev);
 
@@ -266,7 +303,8 @@ static int i2o_scsi_probe(struct device *dev)
 		return PTR_ERR(scsi_dev);
 	}
 
-	sysfs_create_link(&i2o_dev->device.kobj, &scsi_dev->sdev_gendev.kobj, "scsi");
+	sysfs_create_link(&i2o_dev->device.kobj, &scsi_dev->sdev_gendev.kobj,
+			  "scsi");
 
 	osm_info("device added (TID: %03x) channel: %d, id: %d, lun: %d\n",
 		 i2o_dev->lct_data.tid, channel, id, (unsigned int)lun);
@@ -542,9 +580,7 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 				 void (*done) (struct scsi_cmnd *))
 {
 	struct i2o_controller *c;
-	struct Scsi_Host *host;
 	struct i2o_device *i2o_dev;
-	struct device *dev;
 	int tid;
 	struct i2o_message __iomem *msg;
 	u32 m;
@@ -554,20 +590,16 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	 * RETURN_SENSE_DATA_IN_REPLY_MESSAGE_FRAME
 	 */
 	u32 scsi_flags = 0x20a00000;
-	u32 sg_flags;
+	u32 sgl_offset;
 	u32 __iomem *mptr;
-	u32 __iomem *lenptr;
-	u32 len;
-	int i;
+	u32 cmd = I2O_CMD_SCSI_EXEC << 24;
+	int rc = 0;
 
 	/*
 	 *      Do the incoming paperwork
 	 */
-
 	i2o_dev = SCpnt->device->hostdata;
-	host = SCpnt->device->host;
 	c = i2o_dev->iop;
-	dev = &c->pdev->dev;
 
 	SCpnt->scsi_done = done;
 
@@ -575,7 +607,7 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 		osm_warn("no I2O device in request\n");
 		SCpnt->result = DID_NO_CONNECT << 16;
 		done(SCpnt);
-		return 0;
+		goto exit;
 	}
 
 	tid = i2o_dev->lct_data.tid;
@@ -583,47 +615,86 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	osm_debug("qcmd: Tid = %03x\n", tid);
 	osm_debug("Real scsi messages.\n");
 
-	/*
-	 *      Obtain an I2O message. If there are none free then
-	 *      throw it back to the scsi layer
-	 */
-
-	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
-	if (m == I2O_QUEUE_EMPTY)
-		return SCSI_MLQUEUE_HOST_BUSY;
-
-	mptr = &msg->body[0];
-
 	/*
 	 *      Put together a scsi execscb message
 	 */
-
 	switch (SCpnt->sc_data_direction) {
 	case PCI_DMA_NONE:
 		/* DATA NO XFER */
-		sg_flags = 0x00000000;
+		sgl_offset = SGL_OFFSET_0;
 		break;
 
 	case PCI_DMA_TODEVICE:
 		/* DATA OUT (iop-->dev) */
 		scsi_flags |= 0x80000000;
-		sg_flags = 0x14000000;
+		sgl_offset = SGL_OFFSET_10;
 		break;
 
 	case PCI_DMA_FROMDEVICE:
 		/* DATA IN  (iop<--dev) */
 		scsi_flags |= 0x40000000;
-		sg_flags = 0x10000000;
+		sgl_offset = SGL_OFFSET_10;
 		break;
 
 	default:
 		/* Unknown - kill the command */
 		SCpnt->result = DID_NO_CONNECT << 16;
 		done(SCpnt);
-		return 0;
+		goto exit;
 	}
 
-	writel(I2O_CMD_SCSI_EXEC << 24 | HOST_TID << 12 | tid, &msg->u.head[1]);
+	/*
+	 *      Obtain an I2O message. If there are none free then
+	 *      throw it back to the scsi layer
+	 */
+
+	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
+	if (m == I2O_QUEUE_EMPTY) {
+		rc = SCSI_MLQUEUE_HOST_BUSY;
+		goto exit;
+	}
+
+	mptr = &msg->body[0];
+
+#ifdef CONFIG_I2O_EXT_ADAPTEC
+	if (c->adaptec) {
+		u32 adpt_flags = 0;
+
+		if (SCpnt->sc_request && SCpnt->sc_request->upper_private_data) {
+			i2o_sg_io_hdr_t __user *usr_ptr =
+			    ((Sg_request *) (SCpnt->sc_request->
+					     upper_private_data))->header.
+			    usr_ptr;
+
+			if (usr_ptr)
+				get_user(adpt_flags, &usr_ptr->flags);
+		}
+
+		switch (i2o_dev->lct_data.class_id) {
+		case I2O_CLASS_EXECUTIVE:
+		case I2O_CLASS_RANDOM_BLOCK_STORAGE:
+			/* interpret flag has to be set for executive */
+			adpt_flags ^= I2O_DPT_SG_FLAG_INTERPRET;
+			break;
+
+		default:
+			break;
+		}
+
+		/*
+		 * for Adaptec controllers we use the PRIVATE command, because
+		 * the normal SCSI EXEC doesn't support all SCSI commands on
+		 * all controllers (for example READ CAPACITY).
+		 */
+		if (sgl_offset == SGL_OFFSET_10)
+			sgl_offset = SGL_OFFSET_12;
+		cmd = I2O_CMD_PRIVATE << 24;
+		writel(I2O_VENDOR_DPT << 16 | I2O_CMD_SCSI_EXEC, mptr++);
+		writel(adpt_flags | tid, mptr++);
+	}
+#endif
+
+	writel(cmd | HOST_TID << 12 | tid, &msg->u.head[1]);
 	writel(i2o_scsi_driver.context, &msg->u.s.icntxt);
 
 	/* We want the SCSI control block back */
@@ -655,55 +726,30 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	/* Write SCSI command into the message - always 16 byte block */
 	memcpy_toio(mptr, SCpnt->cmnd, 16);
 	mptr += 4;
-	lenptr = mptr++;	/* Remember me - fill in when we know */
-
-	/* Now fill in the SGList and command */
-	if (SCpnt->use_sg) {
-		struct scatterlist *sg;
-		int sg_count;
-
-		sg = SCpnt->request_buffer;
-		len = 0;
 
-		sg_count = dma_map_sg(dev, sg, SCpnt->use_sg,
-				      SCpnt->sc_data_direction);
-
-		if (unlikely(sg_count <= 0))
-			return -ENOMEM;
-
-		for (i = SCpnt->use_sg; i > 0; i--) {
-			if (i == 1)
-				sg_flags |= 0xC0000000;
-			writel(sg_flags | sg_dma_len(sg), mptr++);
-			writel(sg_dma_address(sg), mptr++);
-			len += sg_dma_len(sg);
-			sg++;
-		}
-
-		writel(len, lenptr);
-	} else {
-		len = SCpnt->request_bufflen;
-
-		writel(len, lenptr);
-
-		if (len > 0) {
-			dma_addr_t dma_addr;
-
-			dma_addr = dma_map_single(dev, SCpnt->request_buffer,
-						  SCpnt->request_bufflen,
-						  SCpnt->sc_data_direction);
-			if (!dma_addr)
-				return -ENOMEM;
-
-			SCpnt->SCp.ptr = (void *)(unsigned long)dma_addr;
-			sg_flags |= 0xC0000000;
-			writel(sg_flags | SCpnt->request_bufflen, mptr++);
-			writel(dma_addr, mptr++);
+	if (sgl_offset != SGL_OFFSET_0) {
+		/* write size of data addressed by SGL */
+		writel(SCpnt->request_bufflen, mptr++);
+
+		/* Now fill in the SGList and command */
+		if (SCpnt->use_sg) {
+			if (!i2o_dma_map_sg(c, SCpnt->request_buffer,
+					    SCpnt->use_sg,
+					    SCpnt->sc_data_direction, &mptr))
+				goto nomem;
+		} else {
+			SCpnt->SCp.dma_handle =
+			    i2o_dma_map_single(c, SCpnt->request_buffer,
+					       SCpnt->request_bufflen,
+					       SCpnt->sc_data_direction, &mptr);
+			if (dma_mapping_error(SCpnt->SCp.dma_handle))
+				goto nomem;
 		}
 	}
 
 	/* Stick the headers on */
-	writel((mptr - &msg->u.head[0]) << 16 | SGL_OFFSET_10, &msg->u.head[0]);
+	writel(I2O_MESSAGE_SIZE(mptr - &msg->u.head[0]) | sgl_offset,
+	       &msg->u.head[0]);
 
 	/* Queue the message */
 	i2o_msg_post(c, m);
@@ -711,6 +757,13 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	osm_debug("Issued %ld\n", SCpnt->serial_number);
 
 	return 0;
+
+      nomem:
+	rc = -ENOMEM;
+	i2o_msg_nop(c, m);
+
+      exit:
+	return rc;
 };
 
 /**
diff --git a/drivers/message/i2o/pci.c b/drivers/message/i2o/pci.c
index a499af096a68..964fe481849e 100644
--- a/drivers/message/i2o/pci.c
+++ b/drivers/message/i2o/pci.c
@@ -362,11 +362,33 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
 		c->promise = 1;
 	}
 
+	if (pdev->subsystem_vendor == PCI_VENDOR_ID_DPT)
+		c->adaptec = 1;
+
 	/* Cards that go bananas if you quiesce them before you reset them. */
 	if (pdev->vendor == PCI_VENDOR_ID_DPT) {
 		c->no_quiesce = 1;
 		if (pdev->device == 0xa511)
 			c->raptor = 1;
+
+		if (pdev->subsystem_device == 0xc05a) {
+			c->limit_sectors = 1;
+			printk(KERN_INFO
+			       "%s: limit sectors per request to %d\n", c->name,
+			       I2O_MAX_SECTORS_LIMITED);
+		}
+#ifdef CONFIG_I2O_EXT_ADAPTEC_DMA64
+		if (sizeof(dma_addr_t) > 4) {
+			if (pci_set_dma_mask(pdev, DMA_64BIT_MASK))
+				printk(KERN_INFO "%s: 64-bit DMA unavailable\n",
+				       c->name);
+			else {
+				c->pae_support = 1;
+				printk(KERN_INFO "%s: using 64-bit DMA\n",
+				       c->name);
+			}
+		}
+#endif
 	}
 
 	if ((rc = i2o_pci_alloc(c))) {
diff --git a/include/linux/i2o-dev.h b/include/linux/i2o-dev.h
index 90c984ecd521..d4a08d29e36d 100644
--- a/include/linux/i2o-dev.h
+++ b/include/linux/i2o-dev.h
@@ -32,6 +32,13 @@ typedef unsigned int u32;
 
 #endif				/* __KERNEL__ */
 
+/*
+ *	Software module types
+ */
+#define I2O_SOFTWARE_MODULE_IRTOS		0x11
+#define I2O_SOFTWARE_MODULE_IOP_PRIVATE		0x22
+#define I2O_SOFTWARE_MODULE_IOP_CONFIG		0x23
+
 /*
  *	Vendors
  */
@@ -125,6 +132,10 @@ struct i2o_evt_get {
 	int lost;
 };
 
+typedef struct i2o_sg_io_hdr {
+	unsigned int flags;	/* see I2O_DPT_SG_IO_FLAGS */
+} i2o_sg_io_hdr_t;
+
 /**************************************************************************
  * HRT related constants and structures
  **************************************************************************/
@@ -403,4 +414,15 @@ typedef struct _i2o_status_block {
 #define ADAPTER_STATE_FAILED			0x10
 #define ADAPTER_STATE_FAULTED			0x11
 
+
+/*
+ * DPT / Adaptec specific values for i2o_sg_io_hdr flags.
+ */
+#define I2O_DPT_SG_FLAG_INTERPRET		0x00010000
+#define I2O_DPT_SG_FLAG_PHYSICAL		0x00020000
+
+#define I2O_DPT_FLASH_FRAG_SIZE			0x10000
+#define I2O_DPT_FLASH_READ			0x0101
+#define I2O_DPT_FLASH_WRITE			0x0102
+
 #endif				/* _I2O_DEV_H */
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 497ea574f96b..2039a87c2b91 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -147,10 +147,13 @@ struct i2o_controller {
 
 	struct pci_dev *pdev;	/* PCI device */
 
-	unsigned int short_req:1;	/* use small block sizes */
-	unsigned int no_quiesce:1;	/* dont quiesce before reset */
-	unsigned int raptor:1;		/* split bar */
 	unsigned int promise:1;		/* Promise controller */
+	unsigned int adaptec:1;		/* DPT / Adaptec controller */
+	unsigned int raptor:1;	/* split bar */
+	unsigned int no_quiesce:1;	/* dont quiesce before reset */
+	unsigned int short_req:1;	/* use small block sizes */
+	unsigned int limit_sectors:1;	/* limit number of sectors / request */
+	unsigned int pae_support:1;	/* controller has 64-bit SGL support */
 
 	struct list_head devices;	/* list of I2O devices */
 	struct list_head list;	/* Controller list */
@@ -746,7 +749,21 @@ static inline struct i2o_message __iomem *i2o_msg_in_to_virt(struct i2o_controll
 static inline int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr,
 				size_t len, unsigned int gfp_mask)
 {
+	struct pci_dev *pdev = to_pci_dev(dev);
+	int dma_64 = 0;
+
+	if ((sizeof(dma_addr_t) > 4) && (pdev->dma_mask == DMA_64BIT_MASK)) {
+			dma_64 = 1;
+			if(pci_set_dma_mask(pdev, DMA_32BIT_MASK))
+				return -ENOMEM;
+	}
+
 	addr->virt = dma_alloc_coherent(dev, len, &addr->phys, gfp_mask);
+
+	if ((sizeof(dma_addr_t) > 4) && dma_64)
+		if(pci_set_dma_mask(pdev, DMA_64BIT_MASK))
+			printk(KERN_WARNING "i2o: unable to set 64-bit DMA");
+
 	if (!addr->virt)
 		return -ENOMEM;
 
@@ -946,7 +963,7 @@ extern void i2o_debug_state(struct i2o_controller *c);
 #define I2O_CMD_BLOCK_MEJECT		0x43
 #define I2O_CMD_BLOCK_POWER		0x70
 
-#define I2O_PRIVATE_MSG			0xFF
+#define I2O_CMD_PRIVATE			0xFF
 
 /* Command status values  */
 
@@ -1095,9 +1112,9 @@ extern void i2o_debug_state(struct i2o_controller *c);
 #define SGL_OFFSET_8    (0x0080 | I2OVERSION)
 #define SGL_OFFSET_9    (0x0090 | I2OVERSION)
 #define SGL_OFFSET_10   (0x00A0 | I2OVERSION)
-
-#define TRL_OFFSET_5    (0x0050 | I2OVERSION)
-#define TRL_OFFSET_6    (0x0060 | I2OVERSION)
+#define SGL_OFFSET_11   (0x00B0 | I2OVERSION)
+#define SGL_OFFSET_12   (0x00C0 | I2OVERSION)
+#define SGL_OFFSET(x)   (((x)<<4) | I2OVERSION)
 
 /* Transaction Reply Lists (TRL) Control Word structure */
 #define TRL_SINGLE_FIXED_LENGTH		0x00
@@ -1130,7 +1147,6 @@ extern void i2o_debug_state(struct i2o_controller *c);
 #define HOST_TID		1
 
 #define MSG_FRAME_SIZE		128	/* i2o_scsi assumes >= 32 */
-#define REPLY_FRAME_SIZE	17
 #define SG_TABLESIZE		30
 #define NMBR_MSG_FRAMES		128
 
@@ -1155,11 +1171,10 @@ extern void i2o_debug_state(struct i2o_controller *c);
 #define I2O_HRT_GET_TRIES		3
 #define I2O_LCT_GET_TRIES		3
 
-/* request queue sizes */
+/* defines for max_sectors and max_phys_segments */
 #define I2O_MAX_SECTORS			1024
+#define I2O_MAX_SECTORS_LIMITED		256
 #define I2O_MAX_PHYS_SEGMENTS		MAX_PHYS_SEGMENTS
 
-#define I2O_REQ_MEMPOOL_SIZE		32
-
 #endif				/* __KERNEL__ */
 #endif				/* _I2O_H */
diff --git a/include/scsi/sg_request.h b/include/scsi/sg_request.h
new file mode 100644
index 000000000000..57ff525bdd3b
--- /dev/null
+++ b/include/scsi/sg_request.h
@@ -0,0 +1,26 @@
+typedef struct scsi_request Scsi_Request;
+
+static Scsi_Request *dummy_cmdp;	/* only used for sizeof */
+
+typedef struct sg_scatter_hold { /* holding area for scsi scatter gather info */
+	unsigned short k_use_sg; /* Count of kernel scatter-gather pieces */
+	unsigned short sglist_len; /* size of malloc'd scatter-gather list ++ */
+	unsigned bufflen;	/* Size of (aggregate) data buffer */
+	unsigned b_malloc_len;	/* actual len malloc'ed in buffer */
+	void *buffer;		/* Data buffer or scatter list (k_use_sg>0) */
+	char dio_in_use;	/* 0->indirect IO (or mmap), 1->dio */
+	unsigned char cmd_opcode; /* first byte of command */
+} Sg_scatter_hold;
+
+typedef struct sg_request {	/* SG_MAX_QUEUE requests outstanding per file */
+	Scsi_Request *my_cmdp;	/* != 0  when request with lower levels */
+	struct sg_request *nextrp;	/* NULL -> tail request (slist) */
+	struct sg_fd *parentfp;	/* NULL -> not in use */
+	Sg_scatter_hold data;	/* hold buffer, perhaps scatter list */
+	sg_io_hdr_t header;	/* scsi command+info, see <scsi/sg.h> */
+	unsigned char sense_b[sizeof (dummy_cmdp->sr_sense_buffer)];
+	char res_used;		/* 1 -> using reserve buffer, 0 -> not ... */
+	char orphan;		/* 1 -> drop on sight, 0 -> normal */
+	char sg_io_owned;	/* 1 -> packet belongs to SG_IO */
+	volatile char done;	/* 0->before bh, 1->before read, 2->read */
+} Sg_request;
-- 
cgit v1.2.3-59-g8ed1b


From 9e87545f06930c1d294423a8091d1077e7444a47 Mon Sep 17 00:00:00 2001
From: Markus Lidel <Markus.Lidel@shadowconnect.com>
Date: Thu, 23 Jun 2005 22:02:21 -0700
Subject: [PATCH] I2O: second code cleanup of sparse warnings and unneeded
 synchronization

Changes:
 - Added header "core.h" for i2o_core.ko internal definitions
 - More sparse fixes
 - Changed display of TID's in sysfs attributes from XXX to 0xXXX
 - Use the right functions for accessing I/O and normal memory
 - Removed error handling of SCSI device errors and let the SCSI layer
   take care of it
 - Added new device / removed device handling to SCSI-OSM
 - Make status access volatile
 - Cleaned up activation of I2O controller
 - Removed unnecessary wmb() and rmb() calls
 - Use own struct i2o_io for I/O memory instead of struct i2o_dma

Signed-off-by: Markus Lidel <Markus.Lidel@shadowconnect.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/message/i2o/core.h       |  55 +++++++++++
 drivers/message/i2o/debug.c      |   3 -
 drivers/message/i2o/device.c     |  22 +++--
 drivers/message/i2o/driver.c     |  24 ++---
 drivers/message/i2o/exec-osm.c   |  27 +++---
 drivers/message/i2o/i2o_block.c  |   4 +-
 drivers/message/i2o/i2o_config.c |   8 +-
 drivers/message/i2o/i2o_scsi.c   | 202 +++++++++++++--------------------------
 drivers/message/i2o/iop.c        | 128 +++++++++++++------------
 drivers/message/i2o/pci.c        |  21 +---
 include/linux/i2o-dev.h          |  23 +++--
 include/linux/i2o.h              | 116 ++++++----------------
 12 files changed, 275 insertions(+), 358 deletions(-)
 create mode 100644 drivers/message/i2o/core.h

(limited to 'include/linux')

diff --git a/drivers/message/i2o/core.h b/drivers/message/i2o/core.h
new file mode 100644
index 000000000000..49851cccc48d
--- /dev/null
+++ b/drivers/message/i2o/core.h
@@ -0,0 +1,55 @@
+/*
+ *	I2O core internal declarations
+ *
+ *	Copyright (C) 2005	Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *
+ *	This program is free software; you can redistribute it and/or modify it
+ *	under the terms of the GNU General Public License as published by the
+ *	Free Software Foundation; either version 2 of the License, or (at your
+ *	option) any later version.
+ *
+ *	Fixes/additions:
+ *		Markus Lidel <Markus.Lidel@shadowconnect.com>
+ *			initial version.
+ */
+
+/* Exec-OSM */
+extern struct bus_type i2o_bus_type;
+
+extern struct i2o_driver i2o_exec_driver;
+extern int i2o_exec_lct_get(struct i2o_controller *);
+
+extern int __init i2o_exec_init(void);
+extern void __exit i2o_exec_exit(void);
+
+/* driver */
+extern int i2o_driver_dispatch(struct i2o_controller *, u32);
+
+extern int __init i2o_driver_init(void);
+extern void __exit i2o_driver_exit(void);
+
+/* PCI */
+extern int __init i2o_pci_init(void);
+extern void __exit i2o_pci_exit(void);
+
+/* device */
+extern void i2o_device_remove(struct i2o_device *);
+extern int i2o_device_parse_lct(struct i2o_controller *);
+
+extern int i2o_device_init(void);
+extern void i2o_device_exit(void);
+
+/* IOP */
+extern struct i2o_controller *i2o_iop_alloc(void);
+extern void i2o_iop_free(struct i2o_controller *);
+
+extern int i2o_iop_add(struct i2o_controller *);
+extern void i2o_iop_remove(struct i2o_controller *);
+
+/* control registers relative to c->base */
+#define I2O_IRQ_STATUS	0x30
+#define I2O_IRQ_MASK	0x34
+#define I2O_IN_PORT	0x40
+#define I2O_OUT_PORT	0x44
+
+#define I2O_IRQ_OUTBOUND_POST	0x00000008
diff --git a/drivers/message/i2o/debug.c b/drivers/message/i2o/debug.c
index 2a5d478fc60e..018ca887ca85 100644
--- a/drivers/message/i2o/debug.c
+++ b/drivers/message/i2o/debug.c
@@ -4,8 +4,6 @@
 #include <linux/pci.h>
 #include <linux/i2o.h>
 
-extern struct i2o_driver **i2o_drivers;
-extern unsigned int i2o_max_drivers;
 static void i2o_report_util_cmd(u8 cmd);
 static void i2o_report_exec_cmd(u8 cmd);
 static void i2o_report_fail_status(u8 req_status, u32 * msg);
@@ -23,7 +21,6 @@ void i2o_report_status(const char *severity, const char *str,
 	u8 cmd = (msg[1] >> 24) & 0xFF;
 	u8 req_status = (msg[4] >> 24) & 0xFF;
 	u16 detailed_status = msg[4] & 0xFFFF;
-	//struct i2o_driver *h = i2o_drivers[msg[2] & (i2o_max_drivers-1)];
 
 	if (cmd == I2O_CMD_UTIL_EVT_REGISTER)
 		return;		// No status in this reply
diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
index f1b7eb63d54b..0ee342ea29bc 100644
--- a/drivers/message/i2o/device.c
+++ b/drivers/message/i2o/device.c
@@ -16,9 +16,7 @@
 #include <linux/module.h>
 #include <linux/i2o.h>
 #include <linux/delay.h>
-
-/* Exec OSM functions */
-extern struct bus_type i2o_bus_type;
+#include "core.h"
 
 /**
  *	i2o_device_issue_claim - claim or release a device
@@ -293,12 +291,12 @@ int i2o_device_parse_lct(struct i2o_controller *c)
 	}
 
 	if (lct->table_size * 4 > c->dlct.len) {
-		memcpy_fromio(c->lct, c->dlct.virt, c->dlct.len);
+		memcpy(c->lct, c->dlct.virt, c->dlct.len);
 		up(&c->lct_lock);
 		return -EAGAIN;
 	}
 
-	memcpy_fromio(c->lct, c->dlct.virt, lct->table_size * 4);
+	memcpy(c->lct, c->dlct.virt, lct->table_size * 4);
 
 	lct = c->lct;
 
@@ -353,7 +351,7 @@ static ssize_t i2o_device_class_show_class_id(struct class_device *cd,
 {
 	struct i2o_device *dev = to_i2o_device(cd->dev);
 
-	sprintf(buf, "%03x\n", dev->lct_data.class_id);
+	sprintf(buf, "0x%03x\n", dev->lct_data.class_id);
 	return strlen(buf) + 1;
 };
 
@@ -368,7 +366,7 @@ static ssize_t i2o_device_class_show_tid(struct class_device *cd, char *buf)
 {
 	struct i2o_device *dev = to_i2o_device(cd->dev);
 
-	sprintf(buf, "%03x\n", dev->lct_data.tid);
+	sprintf(buf, "0x%03x\n", dev->lct_data.tid);
 	return strlen(buf) + 1;
 };
 
@@ -490,7 +488,7 @@ static int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
 	if (rc == -ETIMEDOUT)
 		return rc;
 
-	memcpy_fromio(reslist, res.virt, res.len);
+	memcpy(reslist, res.virt, res.len);
 	i2o_dma_free(dev, &res);
 
 	/* Query failed */
@@ -532,17 +530,23 @@ int i2o_parm_field_get(struct i2o_device *i2o_dev, int group, int field,
 		       void *buf, int buflen)
 {
 	u16 opblk[] = { 1, 0, I2O_PARAMS_FIELD_GET, group, 1, field };
-	u8 resblk[8 + buflen];	/* 8 bytes for header */
+	u8 *resblk;		/* 8 bytes for header */
 	int size;
 
 	if (field == -1)	/* whole group */
 		opblk[4] = -1;
 
+	resblk = kmalloc(buflen + 8, GFP_KERNEL | GFP_ATOMIC);
+	if (!resblk)
+		return -ENOMEM;
+
 	size = i2o_parm_issue(i2o_dev, I2O_CMD_UTIL_PARAMS_GET, opblk,
 			      sizeof(opblk), resblk, buflen + 8);
 
 	memcpy(buf, resblk + 8, buflen);	/* cut off header */
 
+	kfree(resblk);
+
 	if (size > buflen)
 		return buflen;
 
diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c
index 393be8e2914c..c32f9dbc5744 100644
--- a/drivers/message/i2o/driver.c
+++ b/drivers/message/i2o/driver.c
@@ -17,11 +17,12 @@
 #include <linux/module.h>
 #include <linux/rwsem.h>
 #include <linux/i2o.h>
+#include "core.h"
 
 #define OSM_NAME	"i2o"
 
 /* max_drivers - Maximum I2O drivers (OSMs) which could be registered */
-unsigned int i2o_max_drivers = I2O_MAX_DRIVERS;
+static unsigned int i2o_max_drivers = I2O_MAX_DRIVERS;
 module_param_named(max_drivers, i2o_max_drivers, uint, 0);
 MODULE_PARM_DESC(max_drivers, "maximum number of OSM's to support");
 
@@ -179,15 +180,10 @@ void i2o_driver_unregister(struct i2o_driver *drv)
 int i2o_driver_dispatch(struct i2o_controller *c, u32 m)
 {
 	struct i2o_driver *drv;
-	struct i2o_message __iomem *msg = i2o_msg_out_to_virt(c, m);
-	u32 context;
+	struct i2o_message *msg = i2o_msg_out_to_virt(c, m);
+	u32 context = le32_to_cpu(msg->u.s.icntxt);
 	unsigned long flags;
 
-	if(unlikely(!msg))
-		return -EIO;
-
-	context = readl(&msg->u.s.icntxt);
-
 	if (unlikely(context >= i2o_max_drivers)) {
 		osm_warn("%s: Spurious reply to unknown driver %d\n", c->name,
 			 context);
@@ -204,11 +200,11 @@ int i2o_driver_dispatch(struct i2o_controller *c, u32 m)
 		return -EIO;
 	}
 
-	if ((readl(&msg->u.head[1]) >> 24) == I2O_CMD_UTIL_EVT_REGISTER) {
+	if ((le32_to_cpu(msg->u.head[1]) >> 24) == I2O_CMD_UTIL_EVT_REGISTER) {
 		struct i2o_device *dev, *tmp;
 		struct i2o_event *evt;
 		u16 size;
-		u16 tid = readl(&msg->u.head[1]) & 0xfff;
+		u16 tid = le32_to_cpu(msg->u.head[1]) & 0xfff;
 
 		osm_debug("event received from device %d\n", tid);
 
@@ -216,16 +212,16 @@ int i2o_driver_dispatch(struct i2o_controller *c, u32 m)
 			return -EIO;
 
 		/* cut of header from message size (in 32-bit words) */
-		size = (readl(&msg->u.head[0]) >> 16) - 5;
+		size = (le32_to_cpu(msg->u.head[0]) >> 16) - 5;
 
 		evt = kmalloc(size * 4 + sizeof(*evt), GFP_ATOMIC | __GFP_ZERO);
 		if (!evt)
 			return -ENOMEM;
 
 		evt->size = size;
-		evt->tcntxt = readl(&msg->u.s.tcntxt);
-		evt->event_indicator = readl(&msg->body[0]);
-		memcpy_fromio(&evt->tcntxt, &msg->u.s.tcntxt, size * 4);
+		evt->tcntxt = le32_to_cpu(msg->u.s.tcntxt);
+		evt->event_indicator = le32_to_cpu(msg->body[0]);
+		memcpy(&evt->tcntxt, &msg->u.s.tcntxt, size * 4);
 
 		list_for_each_entry_safe(dev, tmp, &c->devices, list)
 		    if (dev->lct_data.tid == tid) {
diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c
index 0160221c802a..ffe0cecfa060 100644
--- a/drivers/message/i2o/exec-osm.c
+++ b/drivers/message/i2o/exec-osm.c
@@ -30,6 +30,7 @@
 #include <linux/module.h>
 #include <linux/i2o.h>
 #include <linux/delay.h>
+#include "core.h"
 
 #define OSM_NAME "exec-osm"
 
@@ -37,9 +38,6 @@ struct i2o_driver i2o_exec_driver;
 
 static int i2o_exec_lct_notify(struct i2o_controller *c, u32 change_ind);
 
-/* Module internal functions from other sources */
-extern int i2o_device_parse_lct(struct i2o_controller *);
-
 /* global wait list for POST WAIT */
 static LIST_HEAD(i2o_exec_wait_list);
 
@@ -50,7 +48,7 @@ struct i2o_exec_wait {
 	u32 tcntxt;		/* transaction context from reply */
 	int complete;		/* 1 if reply received otherwise 0 */
 	u32 m;			/* message id */
-	struct i2o_message __iomem *msg;	/* pointer to the reply message */
+	struct i2o_message *msg;	/* pointer to the reply message */
 	struct list_head list;	/* node in global wait list */
 };
 
@@ -162,7 +160,7 @@ int i2o_msg_post_wait_mem(struct i2o_controller *c, u32 m, unsigned long
 	barrier();
 
 	if (wait->complete) {
-		rc = readl(&wait->msg->body[0]) >> 24;
+		rc = le32_to_cpu(wait->msg->body[0]) >> 24;
 		i2o_flush_reply(c, wait->m);
 		i2o_exec_wait_free(wait);
 	} else {
@@ -202,8 +200,7 @@ int i2o_msg_post_wait_mem(struct i2o_controller *c, u32 m, unsigned long
  *	message must also be given back to the controller.
  */
 static int i2o_msg_post_wait_complete(struct i2o_controller *c, u32 m,
-				      struct i2o_message __iomem *msg,
-				      u32 context)
+				      struct i2o_message *msg, u32 context)
 {
 	struct i2o_exec_wait *wait, *tmp;
 	unsigned long flags;
@@ -378,11 +375,11 @@ static void i2o_exec_lct_modified(struct i2o_controller *c)
  *	code on failure and if the reply should be flushed.
  */
 static int i2o_exec_reply(struct i2o_controller *c, u32 m,
-			  struct i2o_message __iomem *msg)
+			  struct i2o_message *msg)
 {
 	u32 context;
 
-	if (readl(&msg->u.head[0]) & MSG_FAIL) {
+	if (le32_to_cpu(msg->u.head[0]) & MSG_FAIL) {
 		/*
 		 * If Fail bit is set we must take the transaction context of
 		 * the preserved message to find the right request again.
@@ -390,7 +387,7 @@ static int i2o_exec_reply(struct i2o_controller *c, u32 m,
 		struct i2o_message __iomem *pmsg;
 		u32 pm;
 
-		pm = readl(&msg->body[3]);
+		pm = le32_to_cpu(msg->body[3]);
 
 		pmsg = i2o_msg_in_to_virt(c, pm);
 
@@ -401,12 +398,12 @@ static int i2o_exec_reply(struct i2o_controller *c, u32 m,
 		/* Release the preserved msg */
 		i2o_msg_nop(c, pm);
 	} else
-		context = readl(&msg->u.s.tcntxt);
+		context = le32_to_cpu(msg->u.s.tcntxt);
 
 	if (context & 0x80000000)
 		return i2o_msg_post_wait_complete(c, m, msg, context);
 
-	if ((readl(&msg->u.head[1]) >> 24) == I2O_CMD_LCT_NOTIFY) {
+	if ((le32_to_cpu(msg->u.head[1]) >> 24) == I2O_CMD_LCT_NOTIFY) {
 		struct work_struct *work;
 
 		pr_debug("%s: LCT notify received\n", c->name);
@@ -442,9 +439,9 @@ static int i2o_exec_reply(struct i2o_controller *c, u32 m,
  */
 static void i2o_exec_event(struct i2o_event *evt)
 {
-	if(likely(evt->i2o_dev))
-		osm_info("Event received from device: %d\n",
-			 evt->i2o_dev->lct_data.tid);
+	if (likely(evt->i2o_dev))
+		osm_debug("Event received from device: %d\n",
+			  evt->i2o_dev->lct_data.tid);
 	kfree(evt);
 };
 
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 1dd2b9dad50e..28b3918dbc16 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -62,7 +62,7 @@
 #include "i2o_block.h"
 
 #define OSM_NAME	"block-osm"
-#define OSM_VERSION	"$Rev$"
+#define OSM_VERSION	"1.287"
 #define OSM_DESCRIPTION	"I2O Block Device OSM"
 
 static struct i2o_driver i2o_block_driver;
@@ -537,7 +537,7 @@ static int i2o_block_reply(struct i2o_controller *c, u32 m,
 
 static void i2o_block_event(struct i2o_event *evt)
 {
-	osm_info("event received\n");
+	osm_debug("event received\n");
 	kfree(evt);
 };
 
diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c
index 7636833b4623..8160a1f6c73a 100644
--- a/drivers/message/i2o/i2o_config.c
+++ b/drivers/message/i2o/i2o_config.c
@@ -36,6 +36,8 @@
 
 #include <asm/uaccess.h>
 
+#define SG_TABLESIZE		30
+
 extern int i2o_parm_issue(struct i2o_device *, int, void *, int, void *, int);
 
 static int i2o_cfg_ioctl(struct inode *inode, struct file *fp, unsigned int cmd,
@@ -663,7 +665,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, unsigned long ar
 		goto sg_list_cleanup;
 
 	if (sg_offset) {
-		u32 msg[MSG_FRAME_SIZE];
+		u32 msg[I2O_OUTBOUND_MSG_FRAME_SIZE];
 		/* Copy back the Scatter Gather buffers back to user space */
 		u32 j;
 		// TODO 64bit fix
@@ -671,7 +673,7 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, unsigned long ar
 		int sg_size;
 
 		// re-acquire the original message to handle correctly the sg copy operation
-		memset(&msg, 0, MSG_FRAME_SIZE * 4);
+		memset(&msg, 0, I2O_OUTBOUND_MSG_FRAME_SIZE * 4);
 		// get user msg size in u32s
 		if (get_user(size, &user_msg[0])) {
 			rcode = -EFAULT;
@@ -902,7 +904,7 @@ static int i2o_cfg_passthru(unsigned long arg)
 		int sg_size;
 
 		// re-acquire the original message to handle correctly the sg copy operation
-		memset(&msg, 0, MSG_FRAME_SIZE * 4);
+		memset(&msg, 0, I2O_OUTBOUND_MSG_FRAME_SIZE * 4);
 		// get user msg size in u32s
 		if (get_user(size, &user_msg[0])) {
 			rcode = -EFAULT;
diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c
index fef53b509a61..9f1744c3933b 100644
--- a/drivers/message/i2o/i2o_scsi.c
+++ b/drivers/message/i2o/i2o_scsi.c
@@ -40,7 +40,6 @@
  *	Fix the resource management problems.
  */
 
-#define DEBUG 1
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
@@ -338,162 +337,89 @@ static int i2o_scsi_reply(struct i2o_controller *c, u32 m,
 			  struct i2o_message *msg)
 {
 	struct scsi_cmnd *cmd;
+	u32 error;
 	struct device *dev;
-	u8 as, ds, st;
 
 	cmd = i2o_cntxt_list_get(c, le32_to_cpu(msg->u.s.tcntxt));
-
-	if (msg->u.head[0] & (1 << 13)) {
-		struct i2o_message __iomem *pmsg;	/* preserved message */
-		u32 pm;
-		int err = DID_ERROR;
-
-		pm = le32_to_cpu(msg->body[3]);
-
-		pmsg = i2o_msg_in_to_virt(c, pm);
-
-		osm_err("IOP fail.\n");
-		osm_err("From %d To %d Cmd %d.\n",
-			(msg->u.head[1] >> 12) & 0xFFF,
-			msg->u.head[1] & 0xFFF, msg->u.head[1] >> 24);
-		osm_err("Failure Code %d.\n", msg->body[0] >> 24);
-		if (msg->body[0] & (1 << 16))
-			osm_err("Format error.\n");
-		if (msg->body[0] & (1 << 17))
-			osm_err("Path error.\n");
-		if (msg->body[0] & (1 << 18))
-			osm_err("Path State.\n");
-		if (msg->body[0] & (1 << 18))
-		{
-			osm_err("Congestion.\n");
-			err = DID_BUS_BUSY;
-		}
-
-		osm_debug("Failing message is %p.\n", pmsg);
-
-		cmd = i2o_cntxt_list_get(c, readl(&pmsg->u.s.tcntxt));
-		if (!cmd)
-			return 1;
-
-		cmd->result = err << 16;
-		cmd->scsi_done(cmd);
-
-		/* Now flush the message by making it a NOP */
-		i2o_msg_nop(c, pm);
-
-		return 1;
+	if (unlikely(!cmd)) {
+		osm_err("NULL reply received!\n");
+		return -1;
 	}
 
 	/*
 	 *      Low byte is device status, next is adapter status,
 	 *      (then one byte reserved), then request status.
 	 */
-	ds = (u8) le32_to_cpu(msg->body[0]);
-	as = (u8) (le32_to_cpu(msg->body[0]) >> 8);
-	st = (u8) (le32_to_cpu(msg->body[0]) >> 24);
+	error = le32_to_cpu(msg->body[0]);
 
+	osm_debug("Completed %ld\n", cmd->serial_number);
+
+	cmd->result = error & 0xff;
 	/*
-	 *      Is this a control request coming back - eg an abort ?
+	 * if DeviceStatus is not SCSI_SUCCESS copy over the sense data and let
+	 * the SCSI layer handle the error
 	 */
+	if (cmd->result)
+		memcpy(cmd->sense_buffer, &msg->body[3],
+		       min(sizeof(cmd->sense_buffer), (size_t) 40));
 
-	if (!cmd) {
-		if (st)
-			osm_warn("SCSI abort: %08X", le32_to_cpu(msg->body[0]));
-		osm_info("SCSI abort completed.\n");
-		return -EFAULT;
-	}
+	/* only output error code if AdapterStatus is not HBA_SUCCESS */
+	if ((error >> 8) & 0xff)
+		osm_err("SCSI error %08x\n", error);
 
-	osm_debug("Completed %ld\n", cmd->serial_number);
+	dev = &c->pdev->dev;
+	if (cmd->use_sg)
+		dma_unmap_sg(dev, cmd->request_buffer, cmd->use_sg,
+			     cmd->sc_data_direction);
+	else if (cmd->SCp.dma_handle)
+		dma_unmap_single(dev, cmd->SCp.dma_handle, cmd->request_bufflen,
+				 cmd->sc_data_direction);
 
-	if (st) {
-		u32 count, error;
-		/* An error has occurred */
-
-		switch (st) {
-		case 0x06:
-			count = le32_to_cpu(msg->body[1]);
-			if (count < cmd->underflow) {
-				int i;
-
-				osm_err("SCSI underflow 0x%08X 0x%08X\n", count,
-					cmd->underflow);
-				osm_debug("Cmd: ");
-				for (i = 0; i < 15; i++)
-					pr_debug("%02X ", cmd->cmnd[i]);
-				pr_debug(".\n");
-				cmd->result = (DID_ERROR << 16);
-			}
-			break;
+	cmd->scsi_done(cmd);
 
-		default:
-			error = le32_to_cpu(msg->body[0]);
-
-			osm_err("SCSI error %08x\n", error);
-
-			if ((error & 0xff) == 0x02 /*CHECK_CONDITION */ ) {
-				int i;
-				u32 len = sizeof(cmd->sense_buffer);
-				len = (len > 40) ? 40 : len;
-				// Copy over the sense data
-				memcpy(cmd->sense_buffer, (void *)&msg->body[3],
-				       len);
-				for (i = 0; i <= len; i++)
-					osm_info("%02x\n",
-						 cmd->sense_buffer[i]);
-				if (cmd->sense_buffer[0] == 0x70
-				    && cmd->sense_buffer[2] == DATA_PROTECT) {
-					/* This is to handle an array failed */
-					cmd->result = (DID_TIME_OUT << 16);
-					printk(KERN_WARNING "%s: SCSI Data "
-					       "Protect-Device (%d,%d,%d) "
-					       "hba_status=0x%x, dev_status="
-					       "0x%x, cmd=0x%x\n", c->name,
-					       (u32) cmd->device->channel,
-					       (u32) cmd->device->id,
-					       (u32) cmd->device->lun,
-					       (error >> 8) & 0xff,
-					       error & 0xff, cmd->cmnd[0]);
-				} else
-					cmd->result = (DID_ERROR << 16);
-
-				break;
-			}
-
-			switch (as) {
-			case 0x0E:
-				/* SCSI Reset */
-				cmd->result = DID_RESET << 16;
-				break;
-
-			case 0x0F:
-				cmd->result = DID_PARITY << 16;
-				break;
-
-			default:
-				cmd->result = DID_ERROR << 16;
-				break;
-			}
+	return 1;
+};
 
-			break;
-		}
+/**
+ *	i2o_scsi_notify_device_add - Retrieve notifications of added devices
+ *	@i2o_dev: the I2O device which was added
+ *
+ *	If an I2O device is added we catch the notification, because I2O classes
+ *	other than SCSI peripheral will not be received through
+ *	i2o_scsi_probe().
+ */
+static void i2o_scsi_notify_device_add(struct i2o_device *i2o_dev)
+{
+	switch (i2o_dev->lct_data.class_id) {
+	case I2O_CLASS_EXECUTIVE:
+	case I2O_CLASS_RANDOM_BLOCK_STORAGE:
+		i2o_scsi_probe(&i2o_dev->device);
+		break;
 
-		cmd->scsi_done(cmd);
-		return 1;
+	default:
+		break;
 	}
+};
 
-	cmd->result = DID_OK << 16 | ds;
-
-	dev = &c->pdev->dev;
-	if (cmd->use_sg)
-		dma_unmap_sg(dev, (struct scatterlist *)cmd->buffer,
-			     cmd->use_sg, cmd->sc_data_direction);
-	else if (cmd->request_bufflen)
-		dma_unmap_single(dev, (dma_addr_t) ((long)cmd->SCp.ptr),
-				 cmd->request_bufflen, cmd->sc_data_direction);
-
-	cmd->scsi_done(cmd);
+/**
+ *	i2o_scsi_notify_device_remove - Retrieve notifications of removed
+ *				        devices
+ *	@i2o_dev: the I2O device which was removed
+ *
+ *	If an I2O device is removed, we catch the notification to remove the
+ *	corresponding SCSI device.
+ */
+static void i2o_scsi_notify_device_remove(struct i2o_device *i2o_dev)
+{
+	switch (i2o_dev->lct_data.class_id) {
+	case I2O_CLASS_EXECUTIVE:
+	case I2O_CLASS_RANDOM_BLOCK_STORAGE:
+		i2o_scsi_remove(&i2o_dev->device);
+		break;
 
-	return 1;
+	default:
+		break;
+	}
 };
 
 /**
@@ -554,6 +480,8 @@ static struct i2o_driver i2o_scsi_driver = {
 	.name = OSM_NAME,
 	.reply = i2o_scsi_reply,
 	.classes = i2o_scsi_class_id,
+	.notify_device_add = i2o_scsi_notify_device_add,
+	.notify_device_remove = i2o_scsi_notify_device_remove,
 	.notify_controller_add = i2o_scsi_notify_controller_add,
 	.notify_controller_remove = i2o_scsi_notify_controller_remove,
 	.driver = {
@@ -712,7 +640,7 @@ static int i2o_scsi_queuecommand(struct scsi_cmnd *SCpnt,
 	 */
 
 	/* Attach tags to the devices */
-	/*
+	/* FIXME: implement
 	   if(SCpnt->device->tagged_supported) {
 	   if(SCpnt->tag == HEAD_OF_QUEUE_TAG)
 	   scsi_flags |= 0x01000000;
diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c
index 40312053b38d..c32022bc2a21 100644
--- a/drivers/message/i2o/iop.c
+++ b/drivers/message/i2o/iop.c
@@ -28,8 +28,10 @@
 #include <linux/module.h>
 #include <linux/i2o.h>
 #include <linux/delay.h>
+#include "core.h"
 
-#define OSM_VERSION	"$Rev$"
+#define OSM_NAME	"i2o"
+#define OSM_VERSION	"1.288"
 #define OSM_DESCRIPTION	"I2O subsystem"
 
 /* global I2O controller list */
@@ -43,20 +45,6 @@ static struct i2o_dma i2o_systab;
 
 static int i2o_hrt_get(struct i2o_controller *c);
 
-/* Module internal functions from other sources */
-extern struct i2o_driver i2o_exec_driver;
-extern int i2o_exec_lct_get(struct i2o_controller *);
-extern void i2o_device_remove(struct i2o_device *);
-
-extern int __init i2o_driver_init(void);
-extern void __exit i2o_driver_exit(void);
-extern int __init i2o_exec_init(void);
-extern void __exit i2o_exec_exit(void);
-extern int __init i2o_pci_init(void);
-extern void __exit i2o_pci_exit(void);
-extern int i2o_device_init(void);
-extern void i2o_device_exit(void);
-
 /**
  *	i2o_msg_nop - Returns a message which is not used
  *	@c: I2O controller from which the message was created
@@ -92,16 +80,16 @@ void i2o_msg_nop(struct i2o_controller *c, u32 m)
  *	address from the read port (see the i2o spec). If no message is
  *	available returns I2O_QUEUE_EMPTY and msg is leaved untouched.
  */
-u32 i2o_msg_get_wait(struct i2o_controller *c, struct i2o_message __iomem **msg,
-		     int wait)
+u32 i2o_msg_get_wait(struct i2o_controller *c,
+		     struct i2o_message __iomem ** msg, int wait)
 {
 	unsigned long timeout = jiffies + wait * HZ;
 	u32 m;
 
 	while ((m = i2o_msg_get(c, msg)) == I2O_QUEUE_EMPTY) {
 		if (time_after(jiffies, timeout)) {
-			pr_debug("%s: Timeout waiting for message frame.\n",
-				 c->name);
+			osm_debug("%s: Timeout waiting for message frame.\n",
+				  c->name);
 			return I2O_QUEUE_EMPTY;
 		}
 		set_current_state(TASK_UNINTERRUPTIBLE);
@@ -466,7 +454,7 @@ static int i2o_iop_clear(struct i2o_controller *c)
  */
 static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
 {
-	u8 *status = c->status.virt;
+	volatile u8 *status = c->status.virt;
 	u32 m;
 	struct i2o_message __iomem *msg;
 	ulong timeout;
@@ -474,21 +462,20 @@ static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
 
 	osm_debug("%s: Initializing Outbound Queue...\n", c->name);
 
-	memset(status, 0, 4);
+	memset(c->status.virt, 0, 4);
 
 	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
 	if (m == I2O_QUEUE_EMPTY)
 		return -ETIMEDOUT;
 
-	writel(EIGHT_WORD_MSG_SIZE | TRL_OFFSET_6, &msg->u.head[0]);
+	writel(EIGHT_WORD_MSG_SIZE | SGL_OFFSET_6, &msg->u.head[0]);
 	writel(I2O_CMD_OUTBOUND_INIT << 24 | HOST_TID << 12 | ADAPTER_TID,
 	       &msg->u.head[1]);
 	writel(i2o_exec_driver.context, &msg->u.s.icntxt);
-	writel(0x0106, &msg->u.s.tcntxt);	/* FIXME: why 0x0106, maybe in
-						   Spec? */
+	writel(0x00000000, &msg->u.s.tcntxt);
 	writel(PAGE_SIZE, &msg->body[0]);
 	/* Outbound msg frame size in words and Initcode */
-	writel(MSG_FRAME_SIZE << 16 | 0x80, &msg->body[1]);
+	writel(I2O_OUTBOUND_MSG_FRAME_SIZE << 16 | 0x80, &msg->body[1]);
 	writel(0xd0000004, &msg->body[2]);
 	writel(i2o_dma_low(c->status.phys), &msg->body[3]);
 	writel(i2o_dma_high(c->status.phys), &msg->body[4]);
@@ -503,17 +490,15 @@ static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
 		}
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule_timeout(1);
-
-		rmb();
 	}
 
 	m = c->out_queue.phys;
 
 	/* Post frames */
-	for (i = 0; i < NMBR_MSG_FRAMES; i++) {
+	for (i = 0; i < I2O_MAX_OUTBOUND_MSG_FRAMES; i++) {
 		i2o_flush_reply(c, m);
 		udelay(1);	/* Promise */
-		m += MSG_FRAME_SIZE * 4;
+		m += I2O_OUTBOUND_MSG_FRAME_SIZE * sizeof(u32);
 	}
 
 	return 0;
@@ -530,20 +515,20 @@ static int i2o_iop_init_outbound_queue(struct i2o_controller *c)
  */
 static int i2o_iop_reset(struct i2o_controller *c)
 {
-	u8 *status = c->status.virt;
+	volatile u8 *status = c->status.virt;
 	struct i2o_message __iomem *msg;
 	u32 m;
 	unsigned long timeout;
 	i2o_status_block *sb = c->status_block.virt;
 	int rc = 0;
 
-	pr_debug("%s: Resetting controller\n", c->name);
+	osm_debug("%s: Resetting controller\n", c->name);
 
 	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
 	if (m == I2O_QUEUE_EMPTY)
 		return -ETIMEDOUT;
 
-	memset(status, 0, 8);
+	memset(c->status_block.virt, 0, 8);
 
 	/* Quiesce all IOPs first */
 	i2o_iop_quiesce_all();
@@ -568,8 +553,6 @@ static int i2o_iop_reset(struct i2o_controller *c)
 
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule_timeout(1);
-
-		rmb();
 	}
 
 	switch (*status) {
@@ -984,11 +967,11 @@ int i2o_status_get(struct i2o_controller *c)
 {
 	struct i2o_message __iomem *msg;
 	u32 m;
-	u8 *status_block;
+	volatile u8 *status_block;
 	unsigned long timeout;
 
 	status_block = (u8 *) c->status_block.virt;
-	memset(status_block, 0, sizeof(i2o_status_block));
+	memset(c->status_block.virt, 0, sizeof(i2o_status_block));
 
 	m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_MESSAGE_GET);
 	if (m == I2O_QUEUE_EMPTY)
@@ -1017,8 +1000,6 @@ int i2o_status_get(struct i2o_controller *c)
 
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule_timeout(1);
-
-		rmb();
 	}
 
 #ifdef DEBUG
@@ -1107,6 +1088,11 @@ static void i2o_iop_release(struct device *dev)
 	i2o_iop_free(c);
 };
 
+/* I2O controller class */
+static struct class i2o_controller_class = {
+	.name = "i2o_controller",
+};
+
 /**
  *	i2o_iop_alloc - Allocate and initialize a i2o_controller struct
  *
@@ -1136,8 +1122,14 @@ struct i2o_controller *i2o_iop_alloc(void)
 	sprintf(c->name, "iop%d", c->unit);
 
 	device_initialize(&c->device);
+	class_device_initialize(&c->classdev);
+
 	c->device.release = &i2o_iop_release;
+	c->classdev.class = &i2o_controller_class;
+	c->classdev.dev = &c->device;
+
 	snprintf(c->device.bus_id, BUS_ID_SIZE, "iop%d", c->unit);
+	snprintf(c->classdev.class_id, BUS_ID_SIZE, "iop%d", c->unit);
 
 #if BITS_PER_LONG == 64
 	spin_lock_init(&c->context_list_lock);
@@ -1161,45 +1153,55 @@ int i2o_iop_add(struct i2o_controller *c)
 {
 	int rc;
 
-	if((rc = device_add(&c->device))) {
-		printk(KERN_ERR "%s: could not register controller\n", c->name);
+	if ((rc = device_add(&c->device))) {
+		osm_err("%s: could not add controller\n", c->name);
 		goto iop_reset;
 	}
 
-	printk(KERN_INFO "%s: Activating I2O controller...\n", c->name);
-	printk(KERN_INFO "%s: This may take a few minutes if there are many "
-	       "devices\n", c->name);
+	if ((rc = class_device_add(&c->classdev))) {
+		osm_err("%s: could not add controller class\n", c->name);
+		goto device_del;
+	}
+
+	osm_info("%s: Activating I2O controller...\n", c->name);
+	osm_info("%s: This may take a few minutes if there are many devices\n",
+		 c->name);
 
 	if ((rc = i2o_iop_activate(c))) {
-		printk(KERN_ERR "%s: could not activate controller\n",
-		       c->name);
-		goto iop_reset;
+		osm_err("%s: could not activate controller\n", c->name);
+		goto class_del;
 	}
 
-	pr_debug("%s: building sys table...\n", c->name);
+	osm_debug("%s: building sys table...\n", c->name);
 
 	if ((rc = i2o_systab_build()))
-		goto iop_reset;
+		goto class_del;
 
-	pr_debug("%s: online controller...\n", c->name);
+	osm_debug("%s: online controller...\n", c->name);
 
 	if ((rc = i2o_iop_online(c)))
-		goto iop_reset;
+		goto class_del;
 
-	pr_debug("%s: getting LCT...\n", c->name);
+	osm_debug("%s: getting LCT...\n", c->name);
 
 	if ((rc = i2o_exec_lct_get(c)))
-		goto iop_reset;
+		goto class_del;
 
 	list_add(&c->list, &i2o_controllers);
 
 	i2o_driver_notify_controller_add_all(c);
 
-	printk(KERN_INFO "%s: Controller added\n", c->name);
+	osm_info("%s: Controller added\n", c->name);
 
 	return 0;
 
-iop_reset:
+      class_del:
+	class_device_del(&c->classdev);
+
+      device_del:
+	device_del(&c->device);
+
+      iop_reset:
 	i2o_iop_reset(c);
 
 	return rc;
@@ -1260,16 +1262,18 @@ static int __init i2o_iop_init(void)
 	if (rc)
 		goto exit;
 
-	rc = i2o_driver_init();
-	if (rc)
+	if ((rc = class_register(&i2o_controller_class))) {
+		osm_err("can't register class i2o_controller\n");
 		goto device_exit;
+	}
 
-	rc = i2o_exec_init();
-	if (rc)
+	if ((rc = i2o_driver_init()))
+		goto class_exit;
+
+	if ((rc = i2o_exec_init()))
 		goto driver_exit;
 
-	rc = i2o_pci_init();
-	if (rc < 0)
+	if ((rc = i2o_pci_init()))
 		goto exec_exit;
 
 	return 0;
@@ -1280,6 +1284,9 @@ static int __init i2o_iop_init(void)
       driver_exit:
 	i2o_driver_exit();
 
+      class_exit:
+	class_unregister(&i2o_controller_class);
+
       device_exit:
 	i2o_device_exit();
 
@@ -1297,6 +1304,7 @@ static void __exit i2o_iop_exit(void)
 	i2o_pci_exit();
 	i2o_exec_exit();
 	i2o_driver_exit();
+	class_unregister(&i2o_controller_class);
 	i2o_device_exit();
 };
 
diff --git a/drivers/message/i2o/pci.c b/drivers/message/i2o/pci.c
index 964fe481849e..442e34506b90 100644
--- a/drivers/message/i2o/pci.c
+++ b/drivers/message/i2o/pci.c
@@ -30,15 +30,7 @@
 #include <linux/pci.h>
 #include <linux/interrupt.h>
 #include <linux/i2o.h>
-
-/* Module internal functions from other sources */
-extern struct i2o_controller *i2o_iop_alloc(void);
-extern void i2o_iop_free(struct i2o_controller *);
-
-extern int i2o_iop_add(struct i2o_controller *);
-extern void i2o_iop_remove(struct i2o_controller *);
-
-extern int i2o_driver_dispatch(struct i2o_controller *, u32);
+#include "core.h"
 
 /* PCI device id table for all I2O controllers */
 static struct pci_device_id __devinitdata i2o_pci_ids[] = {
@@ -248,9 +240,7 @@ static int i2o_pci_irq_enable(struct i2o_controller *c)
 	struct pci_dev *pdev = c->pdev;
 	int rc;
 
-	wmb();
 	writel(0xffffffff, c->irq_mask);
-	wmb();
 
 	if (pdev->irq) {
 		rc = request_irq(pdev->irq, i2o_pci_interrupt, SA_SHIRQ,
@@ -263,7 +253,6 @@ static int i2o_pci_irq_enable(struct i2o_controller *c)
 	}
 
 	writel(0x00000000, c->irq_mask);
-	wmb();
 
 	printk(KERN_INFO "%s: Installed at IRQ %d\n", c->name, pdev->irq);
 
@@ -278,9 +267,7 @@ static int i2o_pci_irq_enable(struct i2o_controller *c)
  */
 static void i2o_pci_irq_disable(struct i2o_controller *c)
 {
-	wmb();
 	writel(0xffffffff, c->irq_mask);
-	wmb();
 
 	if (c->pdev->irq > 0)
 		free_irq(c->pdev->irq, c);
@@ -406,11 +393,11 @@ static int __devinit i2o_pci_probe(struct pci_dev *pdev,
 	if ((rc = i2o_iop_add(c)))
 		goto uninstall;
 
+	get_device(&c->device);
+
 	if (i960)
 		pci_write_config_word(i960, 0x42, 0x03ff);
 
-	get_device(&c->device);
-
 	return 0;
 
       uninstall:
@@ -478,6 +465,4 @@ void __exit i2o_pci_exit(void)
 {
 	pci_unregister_driver(&i2o_pci_driver);
 };
-
-EXPORT_SYMBOL(i2o_dma_realloc);
 MODULE_DEVICE_TABLE(pci, i2o_pci_ids);
diff --git a/include/linux/i2o-dev.h b/include/linux/i2o-dev.h
index d4a08d29e36d..36fd18cdad28 100644
--- a/include/linux/i2o-dev.h
+++ b/include/linux/i2o-dev.h
@@ -32,18 +32,6 @@ typedef unsigned int u32;
 
 #endif				/* __KERNEL__ */
 
-/*
- *	Software module types
- */
-#define I2O_SOFTWARE_MODULE_IRTOS		0x11
-#define I2O_SOFTWARE_MODULE_IOP_PRIVATE		0x22
-#define I2O_SOFTWARE_MODULE_IOP_CONFIG		0x23
-
-/*
- *	Vendors
- */
-#define I2O_VENDOR_DPT				0x001b
-
 /*
  * I2O Control IOCTLs and structures
  */
@@ -414,6 +402,17 @@ typedef struct _i2o_status_block {
 #define ADAPTER_STATE_FAILED			0x10
 #define ADAPTER_STATE_FAULTED			0x11
 
+/*
+ *	Software module types
+ */
+#define I2O_SOFTWARE_MODULE_IRTOS		0x11
+#define I2O_SOFTWARE_MODULE_IOP_PRIVATE		0x22
+#define I2O_SOFTWARE_MODULE_IOP_CONFIG		0x23
+
+/*
+ *	Vendors
+ */
+#define I2O_VENDOR_DPT				0x001b
 
 /*
  * DPT / Adaptec specific values for i2o_sg_io_hdr flags.
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 2039a87c2b91..be937d0372a7 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -119,12 +119,21 @@ struct i2o_driver {
 };
 
 /*
- *	Contains all information which are necessary for DMA operations
+ *	Contains DMA mapped address information
  */
 struct i2o_dma {
 	void *virt;
 	dma_addr_t phys;
-	u32 len;
+	size_t len;
+};
+
+/*
+ *	Contains IO mapped address information
+ */
+struct i2o_io {
+	void __iomem *virt;
+	unsigned long phys;
+	unsigned long len;
 };
 
 /*
@@ -173,8 +182,8 @@ struct i2o_controller {
 	struct semaphore lct_lock;	/* Lock for LCT updates */
 	struct i2o_dma status_block;	/* IOP status block */
 
-	struct i2o_dma base;	/* controller messaging unit */
-	struct i2o_dma in_queue;	/* inbound message queue Host->IOP */
+	struct i2o_io base;	/* controller messaging unit */
+	struct i2o_io in_queue;	/* inbound message queue Host->IOP */
 	struct i2o_dma out_queue;	/* outbound message queue IOP->Host */
 
 	unsigned int battery:1;		/* Has a battery backup */
@@ -185,6 +194,7 @@ struct i2o_controller {
 	struct resource mem_resource;	/* Mem resource allocated to the IOP */
 
 	struct device device;
+	struct class_device classdev;	/* I2O controller class */
 	struct i2o_device *exec;	/* Executive */
 #if BITS_PER_LONG == 64
 	spinlock_t context_list_lock;	/* lock for context_list */
@@ -235,9 +245,10 @@ struct i2o_sys_tbl {
 extern struct list_head i2o_controllers;
 
 /* Message functions */
-static inline u32 i2o_msg_get(struct i2o_controller *, struct i2o_message __iomem **);
-extern u32 i2o_msg_get_wait(struct i2o_controller *, struct i2o_message __iomem **,
-			    int);
+static inline u32 i2o_msg_get(struct i2o_controller *,
+			      struct i2o_message __iomem **);
+extern u32 i2o_msg_get_wait(struct i2o_controller *,
+			    struct i2o_message __iomem **, int);
 static inline void i2o_msg_post(struct i2o_controller *, u32);
 static inline int i2o_msg_post_wait(struct i2o_controller *, u32,
 				    unsigned long);
@@ -638,14 +649,12 @@ extern int i2o_exec_lct_get(struct i2o_controller *);
  *	available returns I2O_QUEUE_EMPTY and msg is leaved untouched.
  */
 static inline u32 i2o_msg_get(struct i2o_controller *c,
-			      struct i2o_message __iomem **msg)
+			      struct i2o_message __iomem ** msg)
 {
 	u32 m = readl(c->in_port);
 
-	if (m != I2O_QUEUE_EMPTY) {
+	if (m != I2O_QUEUE_EMPTY)
 		*msg = c->in_queue.virt + m;
-		rmb();
-	}
 
 	return m;
 };
@@ -659,7 +668,6 @@ static inline u32 i2o_msg_get(struct i2o_controller *c,
  */
 static inline void i2o_msg_post(struct i2o_controller *c, u32 m)
 {
-	wmb();
 	writel(m, c->in_port);
 };
 
@@ -706,14 +714,11 @@ static inline void i2o_flush_reply(struct i2o_controller *c, u32 m)
  *	work for sender side messages as they are ioremap objects
  *	provided by the I2O controller.
  */
-static inline struct i2o_message __iomem *i2o_msg_out_to_virt(struct
-							      i2o_controller *c,
-							      u32 m)
+static inline struct i2o_message *i2o_msg_out_to_virt(struct i2o_controller *c,
+						      u32 m)
 {
-	if (unlikely
-	    (m < c->out_queue.phys
-	     || m >= c->out_queue.phys + c->out_queue.len))
-		return NULL;
+	BUG_ON(m < c->out_queue.phys
+	       || m >= c->out_queue.phys + c->out_queue.len);
 
 	return c->out_queue.virt + (m - c->out_queue.phys);
 };
@@ -729,69 +734,13 @@ static inline struct i2o_message __iomem *i2o_msg_out_to_virt(struct
  *	work for receive side messages as they are kmalloc objects
  *	in a different pool.
  */
-static inline struct i2o_message __iomem *i2o_msg_in_to_virt(struct i2o_controller *c,
-						     u32 m)
+static inline struct i2o_message __iomem *i2o_msg_in_to_virt(struct
+							     i2o_controller *c,
+							     u32 m)
 {
 	return c->in_queue.virt + m;
 };
 
-/**
- *	i2o_dma_alloc - Allocate DMA memory
- *	@dev: struct device pointer to the PCI device of the I2O controller
- *	@addr: i2o_dma struct which should get the DMA buffer
- *	@len: length of the new DMA memory
- *	@gfp_mask: GFP mask
- *
- *	Allocate a coherent DMA memory and write the pointers into addr.
- *
- *	Returns 0 on success or -ENOMEM on failure.
- */
-static inline int i2o_dma_alloc(struct device *dev, struct i2o_dma *addr,
-				size_t len, unsigned int gfp_mask)
-{
-	struct pci_dev *pdev = to_pci_dev(dev);
-	int dma_64 = 0;
-
-	if ((sizeof(dma_addr_t) > 4) && (pdev->dma_mask == DMA_64BIT_MASK)) {
-			dma_64 = 1;
-			if(pci_set_dma_mask(pdev, DMA_32BIT_MASK))
-				return -ENOMEM;
-	}
-
-	addr->virt = dma_alloc_coherent(dev, len, &addr->phys, gfp_mask);
-
-	if ((sizeof(dma_addr_t) > 4) && dma_64)
-		if(pci_set_dma_mask(pdev, DMA_64BIT_MASK))
-			printk(KERN_WARNING "i2o: unable to set 64-bit DMA");
-
-	if (!addr->virt)
-		return -ENOMEM;
-
-	memset(addr->virt, 0, len);
-	addr->len = len;
-
-	return 0;
-};
-
-/**
- *	i2o_dma_free - Free DMA memory
- *	@dev: struct device pointer to the PCI device of the I2O controller
- *	@addr: i2o_dma struct which contains the DMA buffer
- *
- *	Free a coherent DMA memory and set virtual address of addr to NULL.
- */
-static inline void i2o_dma_free(struct device *dev, struct i2o_dma *addr)
-{
-	if (addr->virt) {
-		if (addr->phys)
-			dma_free_coherent(dev, addr->len, addr->virt,
-					  addr->phys);
-		else
-			kfree(addr->virt);
-		addr->virt = NULL;
-	}
-};
-
 /*
  *	Endian handling wrapped into the macro - keeps the core code
  *	cleaner.
@@ -1141,16 +1090,13 @@ extern void i2o_debug_state(struct i2o_controller *c);
 #define ELEVEN_WORD_MSG_SIZE	0x000B0000
 #define I2O_MESSAGE_SIZE(x)	((x)<<16)
 
-/* Special TID Assignments */
-
+/* special TID assignments */
 #define ADAPTER_TID		0
 #define HOST_TID		1
 
-#define MSG_FRAME_SIZE		128	/* i2o_scsi assumes >= 32 */
-#define SG_TABLESIZE		30
-#define NMBR_MSG_FRAMES		128
-
-#define MSG_POOL_SIZE		(MSG_FRAME_SIZE*NMBR_MSG_FRAMES*sizeof(u32))
+/* outbound queue defines */
+#define I2O_MAX_OUTBOUND_MSG_FRAMES	128
+#define I2O_OUTBOUND_MSG_FRAME_SIZE	128	/* in 32-bit words */
 
 #define I2O_POST_WAIT_OK	0
 #define I2O_POST_WAIT_TIMEOUT	-ETIMEDOUT
-- 
cgit v1.2.3-59-g8ed1b


From f33213ecf49c98da4e85121b592c3bea8057c2e6 Mon Sep 17 00:00:00 2001
From: Markus Lidel <Markus.Lidel@shadowconnect.com>
Date: Thu, 23 Jun 2005 22:02:23 -0700
Subject: [PATCH] I2O: Lindent run and replacement of printk through osm
 printing functions

Ran Lindent and replaced printk() calls with the corresponding osm_*() functions.

Signed-off-by: Markus Lidel <Markus.Lidel@shadowconnect.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/message/i2o/Kconfig      |  10 +--
 drivers/message/i2o/device.c     |   1 -
 drivers/message/i2o/driver.c     |   3 +-
 drivers/message/i2o/exec-osm.c   |   2 +-
 drivers/message/i2o/i2o_block.c  |   3 +-
 drivers/message/i2o/i2o_block.h  |  28 ++++-----
 drivers/message/i2o/i2o_config.c |  20 +++---
 drivers/message/i2o/i2o_proc.c   |   2 +-
 drivers/message/i2o/iop.c        | 128 ++++++++++++++++++---------------------
 drivers/message/i2o/pci.c        |   5 +-
 include/linux/i2o.h              |   8 +--
 11 files changed, 100 insertions(+), 110 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/message/i2o/Kconfig b/drivers/message/i2o/Kconfig
index 94b6d676c5cb..06e8eb19a05c 100644
--- a/drivers/message/i2o/Kconfig
+++ b/drivers/message/i2o/Kconfig
@@ -44,8 +44,8 @@ config I2O_EXT_ADAPTEC_DMA64
 
 config I2O_CONFIG
 	tristate "I2O Configuration support"
-	depends on PCI && I2O
-	help
+	depends on I2O
+	---help---
 	  Say Y for support of the configuration interface for the I2O adapters.
 	  If you have a RAID controller from Adaptec and you want to use the
 	  raidutils to manage your RAID array, you have to say Y here.
@@ -74,7 +74,7 @@ config I2O_BUS
 config I2O_BLOCK
 	tristate "I2O Block OSM"
 	depends on I2O
-	help
+	---help---
 	  Include support for the I2O Block OSM. The Block OSM presents disk
 	  and other structured block devices to the operating system. If you
 	  are using an RAID controller, you could access the array only by
@@ -87,7 +87,7 @@ config I2O_BLOCK
 config I2O_SCSI
 	tristate "I2O SCSI OSM"
 	depends on I2O && SCSI
-	help
+	---help---
 	  Allows direct SCSI access to SCSI devices on a SCSI or FibreChannel
 	  I2O controller. You can use both the SCSI and Block OSM together if
 	  you wish. To access a RAID array, you must use the Block OSM driver.
@@ -99,7 +99,7 @@ config I2O_SCSI
 config I2O_PROC
 	tristate "I2O /proc support"
 	depends on I2O
-	help
+	---help---
 	  If you say Y here and to "/proc file system support", you will be
 	  able to read I2O related information from the virtual directory
 	  /proc/i2o.
diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
index 0ee342ea29bc..d8d6e89a91cc 100644
--- a/drivers/message/i2o/device.c
+++ b/drivers/message/i2o/device.c
@@ -443,7 +443,6 @@ static struct class_interface i2o_device_class_interface = {
  *	Note that the minimum sized reslist is 8 bytes and contains
  *	ResultCount, ErrorInfoSize, BlockStatus and BlockSize.
  */
-
 static int i2o_parm_issue(struct i2o_device *i2o_dev, int cmd, void *oplist,
 			  int oplen, void *reslist, int reslen)
 {
diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c
index c32f9dbc5744..739bfdef0c6d 100644
--- a/drivers/message/i2o/driver.c
+++ b/drivers/message/i2o/driver.c
@@ -117,10 +117,9 @@ int i2o_driver_register(struct i2o_driver *drv)
 
 		i2o_driver_notify_controller_add(drv, c);
 		list_for_each_entry(i2o_dev, &c->devices, list)
-			i2o_driver_notify_device_add(drv, i2o_dev);
+		    i2o_driver_notify_device_add(drv, i2o_dev);
 	}
 
-
 	rc = driver_register(&drv->driver);
 	if (rc)
 		destroy_workqueue(drv->event_queue);
diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c
index ffe0cecfa060..1b7389876e70 100644
--- a/drivers/message/i2o/exec-osm.c
+++ b/drivers/message/i2o/exec-osm.c
@@ -152,7 +152,7 @@ int i2o_msg_post_wait_mem(struct i2o_controller *c, u32 m, unsigned long
 		list_add(&wait->list, &i2o_exec_wait_list);
 
 		wait_event_interruptible_timeout(wq, wait->complete,
-			timeout * HZ);
+						 timeout * HZ);
 
 		wait->wq = NULL;
 	}
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 28b3918dbc16..f283b5bafdd3 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -940,7 +940,6 @@ static void i2o_block_request_fn(struct request_queue *q)
 			INIT_WORK(&dreq->work, i2o_block_delayed_request_fn,
 				  dreq);
 
-			osm_info("transfer error\n");
 			if (!queue_delayed_work(i2o_block_driver.event_queue,
 						&dreq->work,
 						I2O_BLOCK_RETRY_TIME))
@@ -1042,8 +1041,8 @@ static struct i2o_block_device *i2o_block_device_alloc(void)
 static int i2o_block_probe(struct device *dev)
 {
 	struct i2o_device *i2o_dev = to_i2o_device(dev);
-	struct i2o_block_device *i2o_blk_dev;
 	struct i2o_controller *c = i2o_dev->iop;
+	struct i2o_block_device *i2o_blk_dev;
 	struct gendisk *gd;
 	struct request_queue *queue;
 	static int unit = 0;
diff --git a/drivers/message/i2o/i2o_block.h b/drivers/message/i2o/i2o_block.h
index e45cc40ce384..4fdaa5bda412 100644
--- a/drivers/message/i2o/i2o_block.h
+++ b/drivers/message/i2o/i2o_block.h
@@ -64,40 +64,38 @@
 
 /* I2O Block OSM mempool struct */
 struct i2o_block_mempool {
-	kmem_cache_t	*slab;
-	mempool_t	*pool;
+	kmem_cache_t *slab;
+	mempool_t *pool;
 };
 
 /* I2O Block device descriptor */
 struct i2o_block_device {
 	struct i2o_device *i2o_dev;	/* pointer to I2O device */
 	struct gendisk *gd;
-	spinlock_t lock;		/* queue lock */
+	spinlock_t lock;	/* queue lock */
 	struct list_head open_queue;	/* list of transfered, but unfinished
 					   requests */
 	unsigned int open_queue_depth;	/* number of requests in the queue */
 
-	int rcache;			/* read cache flags */
-	int wcache;			/* write cache flags */
+	int rcache;		/* read cache flags */
+	int wcache;		/* write cache flags */
 	int flags;
-	u16 power;			/* power state */
-	int media_change_flag;		/* media changed flag */
+	u16 power;		/* power state */
+	int media_change_flag;	/* media changed flag */
 };
 
 /* I2O Block device request */
-struct i2o_block_request
-{
+struct i2o_block_request {
 	struct list_head queue;
-	struct request *req;		/* corresponding request */
+	struct request *req;	/* corresponding request */
 	struct i2o_block_device *i2o_blk_dev;	/* I2O block device */
-	struct device *dev;		/* device used for DMA */
-	int sg_nents;			/* number of SG elements */
-	struct scatterlist sg_table[I2O_MAX_PHYS_SEGMENTS]; /* SG table */
+	struct device *dev;	/* device used for DMA */
+	int sg_nents;		/* number of SG elements */
+	struct scatterlist sg_table[I2O_MAX_PHYS_SEGMENTS];	/* SG table */
 };
 
 /* I2O Block device delayed request */
-struct i2o_block_delayed_request
-{
+struct i2o_block_delayed_request {
 	struct work_struct work;
 	struct request_queue *queue;
 };
diff --git a/drivers/message/i2o/i2o_config.c b/drivers/message/i2o/i2o_config.c
index 8160a1f6c73a..8ebc86ff1002 100644
--- a/drivers/message/i2o/i2o_config.c
+++ b/drivers/message/i2o/i2o_config.c
@@ -368,9 +368,9 @@ static int i2o_cfg_swul(unsigned long arg)
 
 	i2o_dma_free(&c->pdev->dev, &buffer);
 
-return_ret:
+      return_ret:
 	return ret;
-return_fault:
+      return_fault:
 	ret = -EFAULT;
 	goto return_ret;
 };
@@ -519,7 +519,8 @@ static int i2o_cfg_evt_get(unsigned long arg, struct file *fp)
 
 #ifdef CONFIG_I2O_EXT_ADAPTEC
 #ifdef CONFIG_COMPAT
-static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, unsigned long arg)
+static int i2o_cfg_passthru32(struct file *file, unsigned cmnd,
+			      unsigned long arg)
 {
 	struct i2o_cmd_passthru32 __user *cmd;
 	struct i2o_controller *c;
@@ -646,8 +647,9 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, unsigned long ar
 			    flag_count & 0x04000000 /*I2O_SGL_FLAGS_DIR */ ) {
 				// TODO 64bit fix
 				if (copy_from_user
-				    (p->virt, (void __user *)(unsigned long)sg[i].addr_bus,
-				     sg_size)) {
+				    (p->virt,
+				     (void __user *)(unsigned long)sg[i].
+				     addr_bus, sg_size)) {
 					printk(KERN_DEBUG
 					       "%s: Could not copy SG buf %d FROM user\n",
 					       c->name, i);
@@ -738,11 +740,12 @@ static int i2o_cfg_passthru32(struct file *file, unsigned cmnd, unsigned long ar
 	return rcode;
 }
 
-static long i2o_cfg_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+static long i2o_cfg_compat_ioctl(struct file *file, unsigned cmd,
+				 unsigned long arg)
 {
 	int ret;
-	lock_kernel();		
-	switch (cmd) { 
+	lock_kernel();
+	switch (cmd) {
 	case I2OGETIOPS:
 		ret = i2o_cfg_ioctl(NULL, file, cmd, arg);
 		break;
@@ -1136,6 +1139,7 @@ static int __init i2o_config_old_init(void)
 		osm_err("can't register device.\n");
 		return -EBUSY;
 	}
+
 	return 0;
 }
 
diff --git a/drivers/message/i2o/i2o_proc.c b/drivers/message/i2o/i2o_proc.c
index e5b74452c495..d559a1758363 100644
--- a/drivers/message/i2o/i2o_proc.c
+++ b/drivers/message/i2o/i2o_proc.c
@@ -28,7 +28,7 @@
  */
 
 #define OSM_NAME	"proc-osm"
-#define OSM_VERSION	"$Rev$"
+#define OSM_VERSION	"1.145"
 #define OSM_DESCRIPTION	"I2O ProcFS OSM"
 
 #define I2O_MAX_MODULES 4
diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c
index c32022bc2a21..42f8b810d6e5 100644
--- a/drivers/message/i2o/iop.c
+++ b/drivers/message/i2o/iop.c
@@ -117,13 +117,13 @@ u32 i2o_cntxt_list_add(struct i2o_controller * c, void *ptr)
 	unsigned long flags;
 
 	if (!ptr)
-		printk(KERN_ERR "%s: couldn't add NULL pointer to context list!"
-		       "\n", c->name);
+		osm_err("%s: couldn't add NULL pointer to context list!\n",
+			c->name);
 
 	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
 	if (!entry) {
-		printk(KERN_ERR "%s: Could not allocate memory for context "
-		       "list element\n", c->name);
+		osm_err("%s: Could not allocate memory for context list element"
+			"\n", c->name);
 		return 0;
 	}
 
@@ -142,7 +142,7 @@ u32 i2o_cntxt_list_add(struct i2o_controller * c, void *ptr)
 
 	spin_unlock_irqrestore(&c->context_list_lock, flags);
 
-	pr_debug("%s: Add context to list %p -> %d\n", c->name, ptr, context);
+	osm_debug("%s: Add context to list %p -> %d\n", c->name, ptr, context);
 
 	return entry->context;
 };
@@ -174,11 +174,11 @@ u32 i2o_cntxt_list_remove(struct i2o_controller * c, void *ptr)
 	spin_unlock_irqrestore(&c->context_list_lock, flags);
 
 	if (!context)
-		printk(KERN_WARNING "%s: Could not remove nonexistent ptr "
-		       "%p\n", c->name, ptr);
+		osm_warn("%s: Could not remove nonexistent ptr %p\n", c->name,
+			 ptr);
 
-	pr_debug("%s: remove ptr from context list %d -> %p\n", c->name,
-		 context, ptr);
+	osm_debug("%s: remove ptr from context list %d -> %p\n", c->name,
+		  context, ptr);
 
 	return context;
 };
@@ -208,11 +208,10 @@ void *i2o_cntxt_list_get(struct i2o_controller *c, u32 context)
 	spin_unlock_irqrestore(&c->context_list_lock, flags);
 
 	if (!ptr)
-		printk(KERN_WARNING "%s: context id %d not found\n", c->name,
-		       context);
+		osm_warn("%s: context id %d not found\n", c->name, context);
 
-	pr_debug("%s: get ptr from context list %d -> %p\n", c->name, context,
-		 ptr);
+	osm_debug("%s: get ptr from context list %d -> %p\n", c->name, context,
+		  ptr);
 
 	return ptr;
 };
@@ -240,11 +239,11 @@ u32 i2o_cntxt_list_get_ptr(struct i2o_controller * c, void *ptr)
 	spin_unlock_irqrestore(&c->context_list_lock, flags);
 
 	if (!context)
-		printk(KERN_WARNING "%s: Could not find nonexistent ptr "
-		       "%p\n", c->name, ptr);
+		osm_warn("%s: Could not find nonexistent ptr %p\n", c->name,
+			 ptr);
 
-	pr_debug("%s: get context id from context list %p -> %d\n", c->name,
-		 ptr, context);
+	osm_debug("%s: get context id from context list %p -> %d\n", c->name,
+		  ptr, context);
 
 	return context;
 };
@@ -324,10 +323,9 @@ static int i2o_iop_quiesce(struct i2o_controller *c)
 
 	/* Long timeout needed for quiesce if lots of devices */
 	if ((rc = i2o_msg_post_wait(c, m, 240)))
-		printk(KERN_INFO "%s: Unable to quiesce (status=%#x).\n",
-		       c->name, -rc);
+		osm_info("%s: Unable to quiesce (status=%#x).\n", c->name, -rc);
 	else
-		pr_debug("%s: Quiesced.\n", c->name);
+		osm_debug("%s: Quiesced.\n", c->name);
 
 	i2o_status_get(c);	// Entered READY state
 
@@ -365,10 +363,9 @@ static int i2o_iop_enable(struct i2o_controller *c)
 
 	/* How long of a timeout do we need? */
 	if ((rc = i2o_msg_post_wait(c, m, 240)))
-		printk(KERN_ERR "%s: Could not enable (status=%#x).\n",
-		       c->name, -rc);
+		osm_err("%s: Could not enable (status=%#x).\n", c->name, -rc);
 	else
-		pr_debug("%s: Enabled.\n", c->name);
+		osm_debug("%s: Enabled.\n", c->name);
 
 	i2o_status_get(c);	// entered OPERATIONAL state
 
@@ -432,10 +429,9 @@ static int i2o_iop_clear(struct i2o_controller *c)
 	       &msg->u.head[1]);
 
 	if ((rc = i2o_msg_post_wait(c, m, 30)))
-		printk(KERN_INFO "%s: Unable to clear (status=%#x).\n",
-		       c->name, -rc);
+		osm_info("%s: Unable to clear (status=%#x).\n", c->name, -rc);
 	else
-		pr_debug("%s: Cleared.\n", c->name);
+		osm_debug("%s: Cleared.\n", c->name);
 
 	/* Enable all IOPs */
 	i2o_iop_enable_all();
@@ -570,14 +566,13 @@ static int i2o_iop_reset(struct i2o_controller *c)
 		 * can't read one in the given ammount of time, we assume the
 		 * IOP could not reboot properly.
 		 */
-		pr_debug("%s: Reset in progress, waiting for reboot...\n",
-			 c->name);
+		osm_debug("%s: Reset in progress, waiting for reboot...\n",
+			  c->name);
 
 		m = i2o_msg_get_wait(c, &msg, I2O_TIMEOUT_RESET);
 		while (m == I2O_QUEUE_EMPTY) {
 			if (time_after(jiffies, timeout)) {
-				printk(KERN_ERR "%s: IOP reset timeout.\n",
-				       c->name);
+				osm_err("%s: IOP reset timeout.\n", c->name);
 				rc = -ETIMEDOUT;
 				goto exit;
 			}
@@ -635,29 +630,29 @@ static int i2o_iop_activate(struct i2o_controller *c)
 
 	rc = i2o_status_get(c);
 	if (rc) {
-		printk(KERN_INFO "%s: Unable to obtain status, "
-		       "attempting a reset.\n", c->name);
+		osm_info("%s: Unable to obtain status, attempting a reset.\n",
+			 c->name);
 		rc = i2o_iop_reset(c);
 		if (rc)
 			return rc;
 	}
 
 	if (sb->i2o_version > I2OVER15) {
-		printk(KERN_ERR "%s: Not running version 1.5 of the I2O "
-		       "Specification.\n", c->name);
+		osm_err("%s: Not running version 1.5 of the I2O Specification."
+			"\n", c->name);
 		return -ENODEV;
 	}
 
 	switch (sb->iop_state) {
 	case ADAPTER_STATE_FAULTED:
-		printk(KERN_CRIT "%s: hardware fault\n", c->name);
+		osm_err("%s: hardware fault\n", c->name);
 		return -EFAULT;
 
 	case ADAPTER_STATE_READY:
 	case ADAPTER_STATE_OPERATIONAL:
 	case ADAPTER_STATE_HOLD:
 	case ADAPTER_STATE_FAILED:
-		pr_debug("%s: already running, trying to reset...\n", c->name);
+		osm_debug("%s: already running, trying to reset...\n", c->name);
 		rc = i2o_iop_reset(c);
 		if (rc)
 			return rc;
@@ -707,20 +702,18 @@ static int i2o_iop_systab_set(struct i2o_controller *c)
 		res->flags = IORESOURCE_MEM;
 		res->start = 0;
 		res->end = 0;
-		printk(KERN_INFO "%s: requires private memory resources.\n",
-		       c->name);
+		osm_info("%s: requires private memory resources.\n", c->name);
 		root = pci_find_parent_resource(c->pdev, res);
 		if (root == NULL)
-			printk(KERN_WARNING "%s: Can't find parent resource!\n",
-			       c->name);
+			osm_warn("%s: Can't find parent resource!\n", c->name);
 		if (root && allocate_resource(root, res, sb->desired_mem_size, sb->desired_mem_size, sb->desired_mem_size, 1 << 20,	/* Unspecified, so use 1Mb and play safe */
 					      NULL, NULL) >= 0) {
 			c->mem_alloc = 1;
 			sb->current_mem_size = 1 + res->end - res->start;
 			sb->current_mem_base = res->start;
-			printk(KERN_INFO "%s: allocated %ld bytes of PCI memory"
-			       " at 0x%08lX.\n", c->name,
-			       1 + res->end - res->start, res->start);
+			osm_info("%s: allocated %ld bytes of PCI memory at "
+				 "0x%08lX.\n", c->name,
+				 1 + res->end - res->start, res->start);
 		}
 	}
 
@@ -730,20 +723,18 @@ static int i2o_iop_systab_set(struct i2o_controller *c)
 		res->flags = IORESOURCE_IO;
 		res->start = 0;
 		res->end = 0;
-		printk(KERN_INFO "%s: requires private memory resources.\n",
-		       c->name);
+		osm_info("%s: requires private memory resources.\n", c->name);
 		root = pci_find_parent_resource(c->pdev, res);
 		if (root == NULL)
-			printk(KERN_WARNING "%s: Can't find parent resource!\n",
-			       c->name);
+			osm_warn("%s: Can't find parent resource!\n", c->name);
 		if (root && allocate_resource(root, res, sb->desired_io_size, sb->desired_io_size, sb->desired_io_size, 1 << 20,	/* Unspecified, so use 1Mb and play safe */
 					      NULL, NULL) >= 0) {
 			c->io_alloc = 1;
 			sb->current_io_size = 1 + res->end - res->start;
 			sb->current_mem_base = res->start;
-			printk(KERN_INFO "%s: allocated %ld bytes of PCI I/O at"
-			       " 0x%08lX.\n", c->name,
-			       1 + res->end - res->start, res->start);
+			osm_info("%s: allocated %ld bytes of PCI I/O at 0x%08lX"
+				 ".\n", c->name, 1 + res->end - res->start,
+				 res->start);
 		}
 	}
 
@@ -787,10 +778,10 @@ static int i2o_iop_systab_set(struct i2o_controller *c)
 			 PCI_DMA_TODEVICE);
 
 	if (rc < 0)
-		printk(KERN_ERR "%s: Unable to set SysTab (status=%#x).\n",
-		       c->name, -rc);
+		osm_err("%s: Unable to set SysTab (status=%#x).\n", c->name,
+			-rc);
 	else
-		pr_debug("%s: SysTab set.\n", c->name);
+		osm_debug("%s: SysTab set.\n", c->name);
 
 	i2o_status_get(c);	// Entered READY state
 
@@ -814,7 +805,7 @@ static int i2o_iop_online(struct i2o_controller *c)
 		return rc;
 
 	/* In READY state */
-	pr_debug("%s: Attempting to enable...\n", c->name);
+	osm_debug("%s: Attempting to enable...\n", c->name);
 	rc = i2o_iop_enable(c);
 	if (rc)
 		return rc;
@@ -833,7 +824,7 @@ void i2o_iop_remove(struct i2o_controller *c)
 {
 	struct i2o_device *dev, *tmp;
 
-	pr_debug("%s: deleting controller\n", c->name);
+	osm_debug("%s: deleting controller\n", c->name);
 
 	i2o_driver_notify_controller_remove_all(c);
 
@@ -882,8 +873,7 @@ static int i2o_systab_build(void)
 
 	systab = i2o_systab.virt = kmalloc(i2o_systab.len, GFP_KERNEL);
 	if (!systab) {
-		printk(KERN_ERR "i2o: unable to allocate memory for System "
-		       "Table\n");
+		osm_err("unable to allocate memory for System Table\n");
 		return -ENOMEM;
 	}
 	memset(systab, 0, i2o_systab.len);
@@ -895,8 +885,8 @@ static int i2o_systab_build(void)
 		i2o_status_block *sb;
 
 		if (count >= num_controllers) {
-			printk(KERN_ERR "i2o: controller added while building "
-			       "system table\n");
+			osm_err("controller added while building system table"
+				"\n");
 			break;
 		}
 
@@ -910,9 +900,8 @@ static int i2o_systab_build(void)
 		 * it is techninically not part of the I2O subsystem...
 		 */
 		if (unlikely(i2o_status_get(c))) {
-			printk(KERN_ERR "%s: Deleting b/c could not get status"
-			       " while attempting to build system table\n",
-			       c->name);
+			osm_err("%s: Deleting b/c could not get status while "
+				"attempting to build system table\n", c->name);
 			i2o_iop_remove(c);
 			continue;	// try the next one
 		}
@@ -994,7 +983,7 @@ int i2o_status_get(struct i2o_controller *c)
 	timeout = jiffies + I2O_TIMEOUT_STATUS_GET * HZ;
 	while (status_block[87] != 0xFF) {
 		if (time_after(jiffies, timeout)) {
-			printk(KERN_ERR "%s: Get status timeout.\n", c->name);
+			osm_err("%s: Get status timeout.\n", c->name);
 			return -ETIMEDOUT;
 		}
 
@@ -1043,8 +1032,8 @@ static int i2o_hrt_get(struct i2o_controller *c)
 		rc = i2o_msg_post_wait_mem(c, m, 20, &c->hrt);
 
 		if (rc < 0) {
-			printk(KERN_ERR "%s: Unable to get HRT (status=%#x)\n",
-			       c->name, -rc);
+			osm_err("%s: Unable to get HRT (status=%#x)\n", c->name,
+				-rc);
 			return rc;
 		}
 
@@ -1058,8 +1047,8 @@ static int i2o_hrt_get(struct i2o_controller *c)
 			return i2o_parse_hrt(c);
 	}
 
-	printk(KERN_ERR "%s: Unable to get HRT after %d tries, giving up\n",
-	       c->name, I2O_HRT_GET_TRIES);
+	osm_err("%s: Unable to get HRT after %d tries, giving up\n", c->name,
+		I2O_HRT_GET_TRIES);
 
 	return -EBUSY;
 }
@@ -1073,7 +1062,6 @@ void i2o_iop_free(struct i2o_controller *c)
 	kfree(c);
 };
 
-
 /**
  *	i2o_iop_release - release the memory for a I2O controller
  *	@dev: I2O controller which should be released
@@ -1109,8 +1097,8 @@ struct i2o_controller *i2o_iop_alloc(void)
 
 	c = kmalloc(sizeof(*c), GFP_KERNEL);
 	if (!c) {
-		printk(KERN_ERR "i2o: Insufficient memory to allocate a I2O "
-		       "controller.\n");
+		osm_err("i2o: Insufficient memory to allocate a I2O controller."
+			"\n");
 		return ERR_PTR(-ENOMEM);
 	}
 	memset(c, 0, sizeof(*c));
diff --git a/drivers/message/i2o/pci.c b/drivers/message/i2o/pci.c
index 442e34506b90..9971430e5184 100644
--- a/drivers/message/i2o/pci.c
+++ b/drivers/message/i2o/pci.c
@@ -179,7 +179,10 @@ static int __devinit i2o_pci_alloc(struct i2o_controller *c)
 		return -ENOMEM;
 	}
 
-	if (i2o_dma_alloc(dev, &c->out_queue, MSG_POOL_SIZE, GFP_KERNEL)) {
+	if (i2o_dma_alloc
+	    (dev, &c->out_queue,
+	     I2O_MAX_OUTBOUND_MSG_FRAMES * I2O_OUTBOUND_MSG_FRAME_SIZE *
+	     sizeof(u32), GFP_KERNEL)) {
 		i2o_pci_free(c);
 		return -ENOMEM;
 	}
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index be937d0372a7..bdc286ec947c 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -156,8 +156,8 @@ struct i2o_controller {
 
 	struct pci_dev *pdev;	/* PCI device */
 
-	unsigned int promise:1;		/* Promise controller */
-	unsigned int adaptec:1;		/* DPT / Adaptec controller */
+	unsigned int promise:1;	/* Promise controller */
+	unsigned int adaptec:1;	/* DPT / Adaptec controller */
 	unsigned int raptor:1;	/* split bar */
 	unsigned int no_quiesce:1;	/* dont quiesce before reset */
 	unsigned int short_req:1;	/* use small block sizes */
@@ -174,7 +174,7 @@ struct i2o_controller {
 
 	/* Dynamic LCT related data */
 
-	struct i2o_dma status;	/* status of IOP */
+	struct i2o_dma status;	/* IOP status block */
 
 	struct i2o_dma hrt;	/* HW Resource Table */
 	i2o_lct *lct;		/* Logical Config Table */
@@ -186,7 +186,7 @@ struct i2o_controller {
 	struct i2o_io in_queue;	/* inbound message queue Host->IOP */
 	struct i2o_dma out_queue;	/* outbound message queue IOP->Host */
 
-	unsigned int battery:1;		/* Has a battery backup */
+	unsigned int battery:1;	/* Has a battery backup */
 	unsigned int io_alloc:1;	/* An I/O resource was allocated */
 	unsigned int mem_alloc:1;	/* A memory resource was allocated */
 
-- 
cgit v1.2.3-59-g8ed1b


From 496400014f22c4dbdbc1e89249a2feba46939708 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 23 Jun 2005 22:02:58 -0700
Subject: [PATCH] nfsd4: fix fh_expire_type

We're returning NFS4_FH_NOEXPIRE_WITH_OPEN | NFS4_FH_VOL_RENAME for the
fh_expire_type attribute.  This is incorrect:
	1. The spec actually only allows NOEXPIRE_WITH_OPEN when
	   VOLATILE_ANY is also set.
	2. Filehandles for open files can expire, if the file is removed
	   and there is a reboot.
	3. Filehandles are only volatile on rename in the subtree check
	   case.

Unfortunately, there's no way to indicate that we only expire on remove.  So
our only choice is FH4_VOLATILE_ANY.  Although it's redundant, we also set
FH4_VOL_RENAME in the subtree check case, since subtreecheck does actually
cause problems in practice and it seems possibly useful to give clients some
way to distinguish that case.

Fix a misspelled #define while we're at it.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4xdr.c    | 5 ++++-
 include/linux/nfs4.h | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 36a058a112d5..0ae1467c3bc3 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1366,7 +1366,10 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 	if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) {
 		if ((buflen -= 4) < 0)
 			goto out_resource;
-		WRITE32( NFS4_FH_NOEXPIRE_WITH_OPEN | NFS4_FH_VOL_RENAME );
+		if (exp->ex_flags & NFSEXP_NOSUBTREECHECK)
+			WRITE32(NFS4_FH_VOLATILE_ANY);
+		else
+			WRITE32(NFS4_FH_VOLATILE_ANY|NFS4_FH_VOL_RENAME);
 	}
 	if (bmval0 & FATTR4_WORD0_CHANGE) {
 		/*
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index 5bb5b2fd7ba2..0c1c306cdaec 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -28,7 +28,7 @@
 #define NFS4_ACCESS_DELETE      0x0010
 #define NFS4_ACCESS_EXECUTE     0x0020
 
-#define NFS4_FH_PERISTENT		0x0000
+#define NFS4_FH_PERSISTENT		0x0000
 #define NFS4_FH_NOEXPIRE_WITH_OPEN	0x0001
 #define NFS4_FH_VOLATILE_ANY		0x0002
 #define NFS4_FH_VOL_MIGRATION		0x0004
-- 
cgit v1.2.3-59-g8ed1b


From 8beefa249371f55432394ac96864c83b0b309c28 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 23 Jun 2005 22:03:08 -0700
Subject: [PATCH] nfsd4: rename nfs4_file fields

Trivial renaming patch:

I can never remember, while looking at various lists relating the nfsd4 state
structures, which are the "heads" and which are items on other lists, or which
structures are actually on the various lists.  The following convention helps
me: given structures foo and bar, with foo containing the head of a list of
bars, use "bars" for the name of the head of the list contained in the struct
foo, and use "per_foo" for the entries in the struct bars.

Go ahead and do this for struct nfs4_file.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4state.c        | 22 +++++++++++-----------
 include/linux/nfsd/state.h |  4 ++--
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index f03a4180fa11..a84a80e8c0cf 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -153,7 +153,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 		        current_fh->fh_handle.fh_size);
 	dp->dl_time = 0;
 	atomic_set(&dp->dl_count, 1);
-	list_add(&dp->dl_del_perfile, &fp->fi_del_perfile);
+	list_add(&dp->dl_del_perfile, &fp->fi_delegations);
 	list_add(&dp->dl_del_perclnt, &clp->cl_del_perclnt);
 	return dp;
 }
@@ -954,8 +954,8 @@ alloc_init_file(struct inode *ino)
 	fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
 	if (fp) {
 		INIT_LIST_HEAD(&fp->fi_hash);
-		INIT_LIST_HEAD(&fp->fi_perfile);
-		INIT_LIST_HEAD(&fp->fi_del_perfile);
+		INIT_LIST_HEAD(&fp->fi_stateids);
+		INIT_LIST_HEAD(&fp->fi_delegations);
 		list_add(&fp->fi_hash, &file_hashtbl[hashval]);
 		fp->fi_inode = igrab(ino);
 		fp->fi_id = current_fileid++;
@@ -974,7 +974,7 @@ release_all_files(void)
 		while (!list_empty(&file_hashtbl[i])) {
 			fp = list_entry(file_hashtbl[i].next, struct nfs4_file, fi_hash);
 			/* this should never be more than once... */
-			if (!list_empty(&fp->fi_perfile) || !list_empty(&fp->fi_del_perfile)) {
+			if (!list_empty(&fp->fi_stateids) || !list_empty(&fp->fi_delegations)) {
 				printk("ERROR: release_all_files: file %p is open, creating dangling state !!!\n",fp);
 			}
 			release_file(fp);
@@ -1139,7 +1139,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
 	INIT_LIST_HEAD(&stp->st_perfile);
 	list_add(&stp->st_hash, &stateid_hashtbl[hashval]);
 	list_add(&stp->st_perfilestate, &sop->so_perfilestate);
-	list_add(&stp->st_perfile, &fp->fi_perfile);
+	list_add(&stp->st_perfile, &fp->fi_stateids);
 	stp->st_stateowner = sop;
 	stp->st_file = fp;
 	stp->st_stateid.si_boot = boot_time;
@@ -1204,7 +1204,7 @@ release_state_owner(struct nfs4_stateid *stp, int flag)
 	if (sop->so_confirmed && list_empty(&sop->so_perfilestate))
 		move_to_close_lru(sop);
 	/* unused nfs4_file's are releseed. XXX slab cache? */
-	if (list_empty(&fp->fi_perfile) && list_empty(&fp->fi_del_perfile)) {
+	if (list_empty(&fp->fi_stateids) && list_empty(&fp->fi_delegations)) {
 		release_file(fp);
 	}
 }
@@ -1294,7 +1294,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
 	fp = find_file(ino);
 	if (fp) {
 	/* Search for conflicting share reservations */
-		list_for_each_entry(stp, &fp->fi_perfile, st_perfile) {
+		list_for_each_entry(stp, &fp->fi_stateids, st_perfile) {
 			if (test_bit(deny_type, &stp->st_deny_bmap) ||
 			    test_bit(NFS4_SHARE_DENY_BOTH, &stp->st_deny_bmap))
 				return nfserr_share_denied;
@@ -1545,7 +1545,7 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
 {
 	struct nfs4_delegation *dp;
 
-	list_for_each_entry(dp, &fp->fi_del_perfile, dl_del_perfile) {
+	list_for_each_entry(dp, &fp->fi_delegations, dl_del_perfile) {
 		if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid)
 			return dp;
 	}
@@ -1583,7 +1583,7 @@ nfs4_check_open(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_state
 	int status = nfserr_share_denied;
 	struct nfs4_stateowner *sop = open->op_stateowner;
 
-	list_for_each_entry(local, &fp->fi_perfile, st_perfile) {
+	list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
 		/* ignore lock owners */
 		if (local->st_stateowner->so_is_open_owner == 0)
 			continue;
@@ -1830,7 +1830,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	            stp->st_stateid.si_fileid, stp->st_stateid.si_generation);
 out:
 	/* take the opportunity to clean up unused state */
-	if (fp && list_empty(&fp->fi_perfile) && list_empty(&fp->fi_del_perfile))
+	if (fp && list_empty(&fp->fi_stateids) && list_empty(&fp->fi_delegations))
 		release_file(fp);
 
 	/* CLAIM_PREVIOUS has different error returns */
@@ -2633,7 +2633,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
 	INIT_LIST_HEAD(&stp->st_perfilestate);
 	INIT_LIST_HEAD(&stp->st_perlockowner); /* not used */
 	list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]);
-	list_add(&stp->st_perfile, &fp->fi_perfile);
+	list_add(&stp->st_perfile, &fp->fi_stateids);
 	list_add(&stp->st_perfilestate, &sop->so_perfilestate);
 	stp->st_stateowner = sop;
 	stp->st_file = fp;
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index b6b2fe1e7c63..2c3b42674a4c 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -218,8 +218,8 @@ struct nfs4_stateowner {
 */
 struct nfs4_file {
 	struct list_head        fi_hash;    /* hash by "struct inode *" */
-	struct list_head        fi_perfile; /* list: nfs4_stateid */
-	struct list_head	fi_del_perfile; /* list: nfs4_delegation */
+	struct list_head        fi_stateids;
+	struct list_head	fi_delegations;
 	struct inode		*fi_inode;
 	u32                     fi_id;      /* used with stateowner->so_id 
 					     * for stateid_hashtbl hash */
-- 
cgit v1.2.3-59-g8ed1b


From 13cd21845d6a9729ca95e36ae6e8c669623fbfd4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 23 Jun 2005 22:03:10 -0700
Subject: [PATCH] nfsd4: reference count struct nfs4_file

Add a struct kref to each nfs4_file and take a reference to it from each
stateid and delegation that refers to it.  The atomicity guarantees are
overkill given that all this stuff is done under the single nfsd4 state lock,
but a) we'd like finer-grained locking some day, and b) this simplifies the
cleanup of the structures a bit, something that has previously been a bit
complicated and bug-prone.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4state.c        | 100 +++++++++++++++++++++++----------------------
 include/linux/nfsd/state.h |   1 +
 2 files changed, 52 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index a84a80e8c0cf..6ba428afa433 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -111,7 +111,6 @@ opaque_hashval(const void *ptr, int nbytes)
 /* forward declarations */
 static void release_stateowner(struct nfs4_stateowner *sop);
 static void release_stateid(struct nfs4_stateid *stp, int flags);
-static void release_file(struct nfs4_file *fp);
 
 /*
  * Delegation state
@@ -121,6 +120,27 @@ static void release_file(struct nfs4_file *fp);
 spinlock_t recall_lock;
 static struct list_head del_recall_lru;
 
+static void
+free_nfs4_file(struct kref *kref)
+{
+	struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref);
+	list_del(&fp->fi_hash);
+	iput(fp->fi_inode);
+	kmem_cache_free(file_slab, fp);
+}
+
+static inline void
+put_nfs4_file(struct nfs4_file *fi)
+{
+	kref_put(&fi->fi_ref, free_nfs4_file);
+}
+
+static inline void
+get_nfs4_file(struct nfs4_file *fi)
+{
+	kref_get(&fi->fi_ref);
+}
+
 static struct nfs4_delegation *
 alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type)
 {
@@ -136,6 +156,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	INIT_LIST_HEAD(&dp->dl_del_perclnt);
 	INIT_LIST_HEAD(&dp->dl_recall_lru);
 	dp->dl_client = clp;
+	get_nfs4_file(fp);
 	dp->dl_file = fp;
 	dp->dl_flock = NULL;
 	get_file(stp->st_vfs_file);
@@ -163,6 +184,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp)
 {
 	if (atomic_dec_and_test(&dp->dl_count)) {
 		dprintk("NFSD: freeing dp %p\n",dp);
+		put_nfs4_file(dp->dl_file);
 		kmem_cache_free(deleg_slab, dp);
 	}
 }
@@ -953,6 +975,7 @@ alloc_init_file(struct inode *ino)
 
 	fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
 	if (fp) {
+		kref_init(&fp->fi_ref);
 		INIT_LIST_HEAD(&fp->fi_hash);
 		INIT_LIST_HEAD(&fp->fi_stateids);
 		INIT_LIST_HEAD(&fp->fi_delegations);
@@ -964,24 +987,6 @@ alloc_init_file(struct inode *ino)
 	return NULL;
 }
 
-static void
-release_all_files(void)
-{
-	int i;
-	struct nfs4_file *fp;
-
-	for (i=0;i<FILE_HASH_SIZE;i++) {
-		while (!list_empty(&file_hashtbl[i])) {
-			fp = list_entry(file_hashtbl[i].next, struct nfs4_file, fi_hash);
-			/* this should never be more than once... */
-			if (!list_empty(&fp->fi_stateids) || !list_empty(&fp->fi_delegations)) {
-				printk("ERROR: release_all_files: file %p is open, creating dangling state !!!\n",fp);
-			}
-			release_file(fp);
-		}
-	}
-}
-
 static void
 nfsd4_free_slab(kmem_cache_t **slab)
 {
@@ -1141,6 +1146,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
 	list_add(&stp->st_perfilestate, &sop->so_perfilestate);
 	list_add(&stp->st_perfile, &fp->fi_stateids);
 	stp->st_stateowner = sop;
+	get_nfs4_file(fp);
 	stp->st_file = fp;
 	stp->st_stateid.si_boot = boot_time;
 	stp->st_stateid.si_stateownerid = sop->so_id;
@@ -1166,18 +1172,11 @@ release_stateid(struct nfs4_stateid *stp, int flags)
 		nfsd_close(filp);
 	} else if (flags & LOCK_STATE)
 		locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner);
+	put_nfs4_file(stp->st_file);
 	kmem_cache_free(stateid_slab, stp);
 	stp = NULL;
 }
 
-static void
-release_file(struct nfs4_file *fp)
-{
-	list_del(&fp->fi_hash);
-	iput(fp->fi_inode);
-	kmem_cache_free(file_slab, fp);
-}	
-
 void
 move_to_close_lru(struct nfs4_stateowner *sop)
 {
@@ -1192,7 +1191,6 @@ void
 release_state_owner(struct nfs4_stateid *stp, int flag)
 {
 	struct nfs4_stateowner *sop = stp->st_stateowner;
-	struct nfs4_file *fp = stp->st_file;
 
 	dprintk("NFSD: release_state_owner\n");
 	release_stateid(stp, flag);
@@ -1203,10 +1201,6 @@ release_state_owner(struct nfs4_stateid *stp, int flag)
 	 */
 	if (sop->so_confirmed && list_empty(&sop->so_perfilestate))
 		move_to_close_lru(sop);
-	/* unused nfs4_file's are releseed. XXX slab cache? */
-	if (list_empty(&fp->fi_stateids) && list_empty(&fp->fi_delegations)) {
-		release_file(fp);
-	}
 }
 
 static int
@@ -1236,8 +1230,10 @@ find_file(struct inode *ino)
 	struct nfs4_file *fp;
 
 	list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
-		if (fp->fi_inode == ino)
+		if (fp->fi_inode == ino) {
+			get_nfs4_file(fp);
 			return fp;
+		}
 	}
 	return NULL;
 }
@@ -1288,19 +1284,24 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
 	struct inode *ino = current_fh->fh_dentry->d_inode;
 	struct nfs4_file *fp;
 	struct nfs4_stateid *stp;
+	int ret;
 
 	dprintk("NFSD: nfs4_share_conflict\n");
 
 	fp = find_file(ino);
-	if (fp) {
+	if (!fp)
+		return nfs_ok;
+	ret = nfserr_share_denied;
 	/* Search for conflicting share reservations */
-		list_for_each_entry(stp, &fp->fi_stateids, st_perfile) {
-			if (test_bit(deny_type, &stp->st_deny_bmap) ||
-			    test_bit(NFS4_SHARE_DENY_BOTH, &stp->st_deny_bmap))
-				return nfserr_share_denied;
-		}
+	list_for_each_entry(stp, &fp->fi_stateids, st_perfile) {
+		if (test_bit(deny_type, &stp->st_deny_bmap) ||
+		    test_bit(NFS4_SHARE_DENY_BOTH, &stp->st_deny_bmap))
+			goto out;
 	}
-	return nfs_ok;
+	ret = nfs_ok;
+out:
+	put_nfs4_file(fp);
+	return ret;
 }
 
 static inline void
@@ -1829,10 +1830,8 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	            stp->st_stateid.si_boot, stp->st_stateid.si_stateownerid,
 	            stp->st_stateid.si_fileid, stp->st_stateid.si_generation);
 out:
-	/* take the opportunity to clean up unused state */
-	if (fp && list_empty(&fp->fi_stateids) && list_empty(&fp->fi_delegations))
-		release_file(fp);
-
+	if (fp)
+		put_nfs4_file(fp);
 	/* CLAIM_PREVIOUS has different error returns */
 	nfs4_set_claim_prev(open, &status);
 	/*
@@ -2480,16 +2479,19 @@ find_stateid(stateid_t *stid, int flags)
 static struct nfs4_delegation *
 find_delegation_stateid(struct inode *ino, stateid_t *stid)
 {
-	struct nfs4_file *fp = NULL;
+	struct nfs4_file *fp;
+	struct nfs4_delegation *dl;
 
 	dprintk("NFSD:find_delegation_stateid stateid=(%08x/%08x/%08x/%08x)\n",
                     stid->si_boot, stid->si_stateownerid,
                     stid->si_fileid, stid->si_generation);
 
 	fp = find_file(ino);
-	if (fp)
-		return find_delegation_file(fp, stid);
-	return NULL;
+	if (!fp)
+		return NULL;
+	dl = find_delegation_file(fp, stid);
+	put_nfs4_file(fp);
+	return dl;
 }
 
 /*
@@ -2636,6 +2638,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
 	list_add(&stp->st_perfile, &fp->fi_stateids);
 	list_add(&stp->st_perfilestate, &sop->so_perfilestate);
 	stp->st_stateowner = sop;
+	get_nfs4_file(fp);
 	stp->st_file = fp;
 	stp->st_stateid.si_boot = boot_time;
 	stp->st_stateid.si_stateownerid = sop->so_id;
@@ -3287,7 +3290,6 @@ __nfs4_state_shutdown(void)
 		unhash_delegation(dp);
 	}
 
-	release_all_files();
 	cancel_delayed_work(&laundromat_work);
 	flush_scheduled_work();
 	nfs4_init = 0;
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 2c3b42674a4c..296e6429fc3b 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -217,6 +217,7 @@ struct nfs4_stateowner {
 *      share_acces, share_deny on the file.
 */
 struct nfs4_file {
+	struct kref		fi_ref;
 	struct list_head        fi_hash;    /* hash by "struct inode *" */
 	struct list_head        fi_stateids;
 	struct list_head	fi_delegations;
-- 
cgit v1.2.3-59-g8ed1b


From 7b190fecfa33d72bcf74c9473134c2ad14ae9545 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 23 Jun 2005 22:03:23 -0700
Subject: [PATCH] knfsd: nfsd4: delegation recovery

Allow recovery of delegations after reboot.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4state.c       | 36 ++++++++++++++++++++++++++++--------
 fs/nfsd/nfs4xdr.c         |  2 +-
 include/linux/nfsd/xdr4.h |  1 +
 3 files changed, 30 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 16c9a43218c3..0f6119714c8c 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1709,14 +1709,30 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	int status, flag = 0;
 
 	flag = NFS4_OPEN_DELEGATE_NONE;
-	if (open->op_claim_type != NFS4_OPEN_CLAIM_NULL
-	     || !atomic_read(&cb->cb_set) || !sop->so_confirmed)
-		goto out;
-
-	if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
-		flag = NFS4_OPEN_DELEGATE_WRITE;
-	else
-		flag = NFS4_OPEN_DELEGATE_READ;
+	open->op_recall = 0;
+	switch (open->op_claim_type) {
+		case NFS4_OPEN_CLAIM_PREVIOUS:
+			if (!atomic_read(&cb->cb_set))
+				open->op_recall = 1;
+			flag = open->op_delegate_type;
+			if (flag == NFS4_OPEN_DELEGATE_NONE)
+				goto out;
+			break;
+		case NFS4_OPEN_CLAIM_NULL:
+			/* Let's not give out any delegations till everyone's
+			 * had the chance to reclaim theirs.... */
+			if (nfs4_in_grace())
+				goto out;
+			if (!atomic_read(&cb->cb_set) || !sop->so_confirmed)
+				goto out;
+			if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
+				flag = NFS4_OPEN_DELEGATE_WRITE;
+			else
+				flag = NFS4_OPEN_DELEGATE_READ;
+			break;
+		default:
+			goto out;
+	}
 
 	dp = alloc_init_deleg(sop->so_client, stp, fh, flag);
 	if (dp == NULL) {
@@ -1750,6 +1766,10 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta
 	             dp->dl_stateid.si_fileid,
 	             dp->dl_stateid.si_generation);
 out:
+	if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS
+			&& flag == NFS4_OPEN_DELEGATE_NONE
+			&& open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE)
+		printk("NFSD: WARNING: refusing delegation reclaim\n");
 	open->op_delegate_type = flag;
 }
 
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 0ae1467c3bc3..cfe978a72cea 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1972,7 +1972,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_open
 	case NFS4_OPEN_DELEGATE_READ:
 		RESERVE_SPACE(20 + sizeof(stateid_t));
 		WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t));
-		WRITE32(0);
+		WRITE32(open->op_recall);
 
 		/*
 		 * TODO: ACE's in delegations
diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h
index a1f5ad0be1bf..4d24d65c0e88 100644
--- a/include/linux/nfsd/xdr4.h
+++ b/include/linux/nfsd/xdr4.h
@@ -210,6 +210,7 @@ struct nfsd4_open {
 	u32		op_share_access;    /* request */
 	u32		op_share_deny;      /* request */
 	stateid_t	op_stateid;         /* response */
+	u32		op_recall;          /* recall */
 	struct nfsd4_change_info  op_cinfo; /* response */
 	u32		op_rflags;          /* response */
 	int		op_truncate;        /* used during processing */
-- 
cgit v1.2.3-59-g8ed1b


From 76a3550ec50ed86885a10a767ebaebb7c9104721 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 23 Jun 2005 22:03:26 -0700
Subject: [PATCH] knfsd: nfsd4: rename nfs4_state_init

Somewhat gratuitous rename to simplify the following patch.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4state.c       | 6 +++---
 fs/nfsd/nfssvc.c          | 2 +-
 include/linux/nfsd/nfsd.h | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0f6119714c8c..e00b3472851c 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3185,7 +3185,7 @@ nfs4_check_open_reclaim(clientid_t *clid)
  */
 
 static void
-__nfs4_state_init(void)
+__nfs4_state_start(void)
 {
 	int i;
 	time_t grace_time;
@@ -3235,7 +3235,7 @@ __nfs4_state_init(void)
 }
 
 int
-nfs4_state_init(void)
+nfs4_state_start(void)
 {
 	int status;
 
@@ -3244,7 +3244,7 @@ nfs4_state_init(void)
 	status = nfsd4_init_slabs();
 	if (status)
 		return status;
-	__nfs4_state_init();
+	__nfs4_state_start();
 	nfs4_init = 1;
 	return 0;
 }
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 904df604e86b..07b9a065e9da 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -95,7 +95,7 @@ nfsd_svc(unsigned short port, int nrservs)
 	error =	nfsd_racache_init(2*nrservs);
 	if (error<0)
 		goto out;
-	error = nfs4_state_init();
+	error = nfs4_state_start();
 	if (error<0)
 		goto out;
 	if (!nfsd_serv) {
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 4bf931d5ff56..3855fdc5af77 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -145,12 +145,12 @@ int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *);
  * NFSv4 State
  */
 #ifdef CONFIG_NFSD_V4
-int nfs4_state_init(void);
+int nfs4_state_start(void);
 void nfs4_state_shutdown(void);
 time_t nfs4_lease_time(void);
 void nfs4_reset_lease(time_t leasetime);
 #else
-static inline int nfs4_state_init(void){return 0;}
+static inline int nfs4_state_start(void){return 0;}
 static inline void nfs4_state_shutdown(void){}
 static inline time_t nfs4_lease_time(void){return 0;}
 static inline void nfs4_reset_lease(time_t leasetime){}
-- 
cgit v1.2.3-59-g8ed1b


From ac4d8ff2a57179de3ef7834c6ab3fac430b0a05d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 23 Jun 2005 22:03:30 -0700
Subject: [PATCH] knfsd: nfsd4: clean up state initialization

Separate out state that needs initialization each time the service is started
from state that only needs initialization once at module init, or that can be
initialized statically.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4state.c       | 35 +++++++++++++++++------------------
 fs/nfsd/nfsctl.c          |  1 +
 include/linux/nfsd/nfsd.h |  2 ++
 3 files changed, 20 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index e00b3472851c..1f68ce36e724 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -117,7 +117,7 @@ static void release_stateid(struct nfs4_stateid *stp, int flags);
  */
 
 /* recall_lock protects the del_recall_lru */
-spinlock_t recall_lock;
+spinlock_t recall_lock = SPIN_LOCK_UNLOCKED;
 static struct list_head del_recall_lru;
 
 static void
@@ -3179,23 +3179,13 @@ nfs4_check_open_reclaim(clientid_t *clid)
 	return nfs4_find_reclaim_client(clid) ? nfs_ok : nfserr_reclaim_bad;
 }
 
+/* initialization to perform at module load time: */
 
-/* 
- * Start and stop routines
- */
-
-static void
-__nfs4_state_start(void)
+void
+nfs4_state_init(void)
 {
 	int i;
-	time_t grace_time;
 
-	if (!nfs4_reclaim_init) {
-		for (i = 0; i < CLIENT_HASH_SIZE; i++)
-			INIT_LIST_HEAD(&reclaim_str_hashtbl[i]);
-		reclaim_str_hashtbl_size = 0;
-		nfs4_reclaim_init = 1;
-	}
 	for (i = 0; i < CLIENT_HASH_SIZE; i++) {
 		INIT_LIST_HEAD(&conf_id_hashtbl[i]);
 		INIT_LIST_HEAD(&conf_str_hashtbl[i]);
@@ -3217,19 +3207,28 @@ __nfs4_state_start(void)
 		INIT_LIST_HEAD(&lock_ownerid_hashtbl[i]);
 		INIT_LIST_HEAD(&lock_ownerstr_hashtbl[i]);
 	}
-	memset(&zerostateid, 0, sizeof(stateid_t));
 	memset(&onestateid, ~0, sizeof(stateid_t));
-
 	INIT_LIST_HEAD(&close_lru);
 	INIT_LIST_HEAD(&client_lru);
 	INIT_LIST_HEAD(&del_recall_lru);
-	spin_lock_init(&recall_lock);
+	for (i = 0; i < CLIENT_HASH_SIZE; i++)
+		INIT_LIST_HEAD(&reclaim_str_hashtbl[i]);
+	reclaim_str_hashtbl_size = 0;
+	nfs4_reclaim_init = 1;
+}
+
+/* initialization to perform when the nfsd service is started: */
+
+static void
+__nfs4_state_start(void)
+{
+	time_t grace_time;
+
 	boot_time = get_seconds();
 	grace_time = max(user_lease_time, lease_time);
 	lease_time = user_lease_time;
 	printk("NFSD: starting %ld-second grace period\n", grace_time);
 	grace_end = boot_time + grace_time;
-	INIT_WORK(&laundromat_work,laundromat_main, NULL);
 	laundry_wq = create_singlethread_workqueue("nfsd4");
 	queue_delayed_work(laundry_wq, &laundromat_work, NFSD_LEASE_TIME*HZ);
 }
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 161afdcb8f7d..3d56531a7a03 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -397,6 +397,7 @@ static int __init init_nfsd(void)
 	nfsd_cache_init();	/* RPC reply cache */
 	nfsd_export_init();	/* Exports table */
 	nfsd_lockd_init();	/* lockd->nfsd callbacks */
+	nfs4_state_init();	/* NFSv4 locking state */
 #ifdef CONFIG_NFSD_V4
 	nfsd_idmap_init();      /* Name to ID mapping */
 #endif /* CONFIG_NFSD_V4 */
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 3855fdc5af77..21c6e9d86e4f 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -145,11 +145,13 @@ int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *);
  * NFSv4 State
  */
 #ifdef CONFIG_NFSD_V4
+void nfs4_state_init(void);
 int nfs4_state_start(void);
 void nfs4_state_shutdown(void);
 time_t nfs4_lease_time(void);
 void nfs4_reset_lease(time_t leasetime);
 #else
+static inline void nfs4_state_init(void){};
 static inline int nfs4_state_start(void){return 0;}
 static inline void nfs4_state_shutdown(void){}
 static inline time_t nfs4_lease_time(void){return 0;}
-- 
cgit v1.2.3-59-g8ed1b


From bd0b1e954e3ba3e5d2cab941458cf98206471bd2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 23 Jun 2005 22:03:35 -0700
Subject: [PATCH] knfsd: nfsd4: idmap initialization

Adopt standard kernel style by defining a no-op function instead of putting
ifdef's in the code where the function is called.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfsctl.c           | 4 ----
 include/linux/nfsd_idmap.h | 5 +++++
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3d56531a7a03..3da43a3ed32c 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -398,9 +398,7 @@ static int __init init_nfsd(void)
 	nfsd_export_init();	/* Exports table */
 	nfsd_lockd_init();	/* lockd->nfsd callbacks */
 	nfs4_state_init();	/* NFSv4 locking state */
-#ifdef CONFIG_NFSD_V4
 	nfsd_idmap_init();      /* Name to ID mapping */
-#endif /* CONFIG_NFSD_V4 */
 	if (proc_mkdir("fs/nfs", NULL)) {
 		struct proc_dir_entry *entry;
 		entry = create_proc_entry("fs/nfs/exports", 0, NULL);
@@ -427,9 +425,7 @@ static void __exit exit_nfsd(void)
 	remove_proc_entry("fs/nfs", NULL);
 	nfsd_stat_shutdown();
 	nfsd_lockd_shutdown();
-#ifdef CONFIG_NFSD_V4
 	nfsd_idmap_shutdown();
-#endif /* CONFIG_NFSD_V4 */
 	unregister_filesystem(&nfsd_fs_type);
 }
 
diff --git a/include/linux/nfsd_idmap.h b/include/linux/nfsd_idmap.h
index 9bb7f30e923b..e82746fcad14 100644
--- a/include/linux/nfsd_idmap.h
+++ b/include/linux/nfsd_idmap.h
@@ -43,8 +43,13 @@
 /* XXX from linux/nfs_idmap.h */
 #define IDMAP_NAMESZ 128
 
+#ifdef CONFIG_NFSD_V4
 void nfsd_idmap_init(void);
 void nfsd_idmap_shutdown(void);
+#else
+static inline void nfsd_idmap_init(void) {};
+static inline void nfsd_idmap_shutdown(void) {};
+#endif
 
 int nfsd_map_name_to_uid(struct svc_rqst *, const char *, size_t, __u32 *);
 int nfsd_map_name_to_gid(struct svc_rqst *, const char *, size_t, __u32 *);
-- 
cgit v1.2.3-59-g8ed1b


From a55370a3c0106106a975c5a09cee800611d0cf50 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 23 Jun 2005 22:03:52 -0700
Subject: [PATCH] knfsd: nfsd4: reboot hash

For the purposes of reboot recovery we keep a directory with subdirectories
each having a name that is the ascii hex representation of the md5 sum of a
client identifier for an active client.

This adds the code to calculate that name.  We also use it for the purposes of
comparing clients, so if someone ever manages to find two client names that
are md5 collisions, then we'll return clid_inuse to the second.

Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/Kconfig                 |  2 +
 fs/nfsd/Makefile           |  2 +-
 fs/nfsd/nfs4recover.c      | 97 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs4state.c        | 80 ++++++++++++++++++--------------------
 include/linux/nfsd/state.h |  6 ++-
 5 files changed, 143 insertions(+), 44 deletions(-)
 create mode 100644 fs/nfsd/nfs4recover.c

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index a7c0cc3203cb..5c704d05627a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1413,6 +1413,8 @@ config NFSD_V4
 	bool "Provide NFSv4 server support (EXPERIMENTAL)"
 	depends on NFSD_V3 && EXPERIMENTAL
 	select NFSD_TCP
+	select CRYPTO_MD5
+	select CRYPTO
 	help
 	  If you would like to include the NFSv4 server as well as the NFSv2
 	  and NFSv3 servers, say Y here.  This feature is experimental, and
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 9f043f44c92f..ce341dc76d5e 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -10,5 +10,5 @@ nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
 nfsd-$(CONFIG_NFSD_V3)	+= nfs3proc.o nfs3xdr.o
 nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
 nfsd-$(CONFIG_NFSD_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
-			   nfs4acl.o nfs4callback.o
+			   nfs4acl.o nfs4callback.o nfs4recover.o
 nfsd-objs		:= $(nfsd-y)
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
new file mode 100644
index 000000000000..841a305d7948
--- /dev/null
+++ b/fs/nfsd/nfs4recover.c
@@ -0,0 +1,97 @@
+/*
+*  linux/fs/nfsd/nfs4recover.c
+*
+*  Copyright (c) 2004 The Regents of the University of Michigan.
+*  All rights reserved.
+*
+*  Andy Adamson <andros@citi.umich.edu>
+*
+*  Redistribution and use in source and binary forms, with or without
+*  modification, are permitted provided that the following conditions
+*  are met:
+*
+*  1. Redistributions of source code must retain the above copyright
+*     notice, this list of conditions and the following disclaimer.
+*  2. Redistributions in binary form must reproduce the above copyright
+*     notice, this list of conditions and the following disclaimer in the
+*     documentation and/or other materials provided with the distribution.
+*  3. Neither the name of the University nor the names of its
+*     contributors may be used to endorse or promote products derived
+*     from this software without specific prior written permission.
+*
+*  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+*  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+*  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+*  DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+*  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+*  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+*  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+*  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+*  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+*  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+*  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+*/
+
+
+#include <linux/sunrpc/svc.h>
+#include <linux/nfsd/nfsd.h>
+#include <linux/nfs4.h>
+#include <linux/nfsd/state.h>
+#include <linux/nfsd/xdr4.h>
+#include <asm/uaccess.h>
+#include <asm/scatterlist.h>
+#include <linux/crypto.h>
+
+
+#define NFSDDBG_FACILITY                NFSDDBG_PROC
+
+static void
+md5_to_hex(char *out, char *md5)
+{
+	int i;
+
+	for (i=0; i<16; i++) {
+		unsigned char c = md5[i];
+
+		*out++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1);
+		*out++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1);
+	}
+	*out = '\0';
+}
+
+int
+nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
+{
+	struct xdr_netobj cksum;
+	struct crypto_tfm *tfm;
+	struct scatterlist sg[1];
+	int status = nfserr_resource;
+
+	dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
+			clname->len, clname->data);
+	tfm = crypto_alloc_tfm("md5", 0);
+	if (tfm == NULL)
+		goto out;
+	cksum.len = crypto_tfm_alg_digestsize(tfm);
+	cksum.data = kmalloc(cksum.len, GFP_KERNEL);
+	if (cksum.data == NULL)
+ 		goto out;
+	crypto_digest_init(tfm);
+
+	sg[0].page = virt_to_page(clname->data);
+	sg[0].offset = offset_in_page(clname->data);
+	sg[0].length = clname->len;
+
+	crypto_digest_update(tfm, sg, 1);
+	crypto_digest_final(tfm, cksum.data);
+
+	md5_to_hex(dname, cksum.data);
+
+	kfree(cksum.data);
+	status = nfs_ok;
+out:
+	if (tfm)
+		crypto_free_tfm(tfm);
+	return status;
+}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 2a59d176e69a..0be0b37c84e9 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -231,8 +231,8 @@ unhash_delegation(struct nfs4_delegation *dp)
 
 #define clientid_hashval(id) \
 	((id) & CLIENT_HASH_MASK)
-#define clientstr_hashval(name, namelen) \
-	(opaque_hashval((name), (namelen)) & CLIENT_HASH_MASK)
+#define clientstr_hashval(name) \
+	(opaque_hashval((name), 8) & CLIENT_HASH_MASK)
 /*
  * reclaim_str_hashtbl[] holds known client info from previous reset/reboot
  * used in reboot/reset lease grace period processing
@@ -366,11 +366,12 @@ expire_client(struct nfs4_client *clp)
 }
 
 static struct nfs4_client *
-create_client(struct xdr_netobj name) {
+create_client(struct xdr_netobj name, char *recdir) {
 	struct nfs4_client *clp;
 
 	if (!(clp = alloc_client(name)))
 		goto out;
+	memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
 	atomic_set(&clp->cl_count, 1);
 	atomic_set(&clp->cl_callback.cb_set, 0);
 	clp->cl_callback.cb_parsed = 0;
@@ -403,11 +404,9 @@ copy_cred(struct svc_cred *target, struct svc_cred *source) {
 	get_group_info(target->cr_group_info);
 }
 
-static int
-cmp_name(struct xdr_netobj *n1, struct xdr_netobj *n2) {
-	if (!n1 || !n2)
-		return 0;
-	return((n1->len == n2->len) && !memcmp(n1->data, n2->data, n2->len));
+static inline int
+same_name(const char *n1, const char *n2) {
+	return 0 == memcmp(n1, n2, HEXDIR_LEN);
 }
 
 static int
@@ -479,8 +478,7 @@ move_to_confirmed(struct nfs4_client *clp)
 	list_del_init(&clp->cl_strhash);
 	list_del_init(&clp->cl_idhash);
 	list_add(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
-	strhashval = clientstr_hashval(clp->cl_name.data, 
-			clp->cl_name.len);
+	strhashval = clientstr_hashval(clp->cl_recdir);
 	list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
 	renew_client(clp);
 }
@@ -651,22 +649,27 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid)
 	unsigned int 		strhashval;
 	struct nfs4_client *	conf, * unconf, * new, * clp;
 	int 			status;
+	char                    dname[HEXDIR_LEN];
 	
 	status = nfserr_inval;
 	if (!check_name(clname))
 		goto out;
 
+	status = nfs4_make_rec_clidname(dname, &clname);
+	if (status)
+		goto out;
+
 	/* 
 	 * XXX The Duplicate Request Cache (DRC) has been checked (??)
 	 * We get here on a DRC miss.
 	 */
 
-	strhashval = clientstr_hashval(clname.data, clname.len);
+	strhashval = clientstr_hashval(dname);
 
 	conf = NULL;
 	nfs4_lock_state();
 	list_for_each_entry(clp, &conf_str_hashtbl[strhashval], cl_strhash) {
-		if (!cmp_name(&clp->cl_name, &clname))
+		if (!same_name(clp->cl_recdir, dname))
 			continue;
 		/* 
 		 * CASE 0:
@@ -686,7 +689,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid)
 	}
 	unconf = NULL;
 	list_for_each_entry(clp, &unconf_str_hashtbl[strhashval], cl_strhash) {
-		if (!cmp_name(&clp->cl_name, &clname))
+		if (!same_name(clp->cl_recdir, dname))
 			continue;
 		/* cl_name match from a previous SETCLIENTID operation */
 		unconf = clp;
@@ -700,7 +703,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid)
 		 */
 		if (unconf)
 			expire_client(unconf);
-		if (!(new = create_client(clname)))
+		new = create_client(clname, dname);
+		if (new == NULL)
 			goto out;
 		copy_verf(new, &clverifier);
 		new->cl_addr = ip_addr;
@@ -728,7 +732,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid)
 		     cmp_clid(&unconf->cl_clientid, &conf->cl_clientid)) {
 				expire_client(unconf);
 		}
-		if (!(new = create_client(clname)))
+		new = create_client(clname, dname);
+		if (new == NULL)
 			goto out;
 		copy_verf(new,&conf->cl_verifier);
 		new->cl_addr = ip_addr;
@@ -746,7 +751,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid)
 		 * using input clverifier, clname, and callback info
 		 * and generate a new cl_clientid and cl_confirm.
 		 */
-		if (!(new = create_client(clname)))
+		new = create_client(clname, dname);
+		if (new == NULL)
 			goto out;
 		copy_verf(new,&clverifier);
 		new->cl_addr = ip_addr;
@@ -772,7 +778,8 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_setclientid *setclid)
 		 * new cl_verifier and a new cl_confirm
 		 */
 		expire_client(unconf);
-		if (!(new = create_client(clname)))
+		new = create_client(clname, dname);
+		if (new == NULL)
 			goto out;
 		copy_verf(new,&clverifier);
 		new->cl_addr = ip_addr;
@@ -856,7 +863,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_setclientid_confi
 	if ((conf && unconf) && 
 	    (cmp_verf(&unconf->cl_confirm, &confirm)) &&
 	    (cmp_verf(&conf->cl_verifier, &unconf->cl_verifier)) &&
-	    (cmp_name(&conf->cl_name,&unconf->cl_name))  &&
+	    (same_name(conf->cl_recdir,unconf->cl_recdir))  &&
 	    (!cmp_verf(&conf->cl_confirm, &unconf->cl_confirm))) {
 		if (!cmp_creds(&conf->cl_cred, &unconf->cl_cred)) 
 			status = nfserr_clid_inuse;
@@ -876,7 +883,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_setclientid_confi
 	if ((conf && !unconf) || 
 	    ((conf && unconf) && 
 	     (!cmp_verf(&conf->cl_verifier, &unconf->cl_verifier) ||
-	      !cmp_name(&conf->cl_name, &unconf->cl_name)))) {
+	      !same_name(conf->cl_recdir, unconf->cl_recdir)))) {
 		if (!cmp_creds(&conf->cl_cred,&rqstp->rq_cred)) {
 			status = nfserr_clid_inuse;
 		} else {
@@ -3074,39 +3081,28 @@ out:
 }
 
 static inline struct nfs4_client_reclaim *
-alloc_reclaim(int namelen)
+alloc_reclaim(void)
 {
-	struct nfs4_client_reclaim *crp = NULL;
-
-	crp = kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL);
-	if (!crp)
-		return NULL;
-	crp->cr_name.data = kmalloc(namelen, GFP_KERNEL);
-	if (!crp->cr_name.data) {
-		kfree(crp);
-		return NULL;
-	}
-	return crp;
+	return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL);
 }
 
 /*
  * failure => all reset bets are off, nfserr_no_grace...
  */
 static int
-nfs4_client_to_reclaim(char *name, int namlen)
+nfs4_client_to_reclaim(char *name)
 {
 	unsigned int strhashval;
 	struct nfs4_client_reclaim *crp = NULL;
 
-	dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", namlen, name);
-	crp = alloc_reclaim(namlen);
+	dprintk("NFSD nfs4_client_to_reclaim NAME: %.*s\n", HEXDIR_LEN, name);
+	crp = alloc_reclaim();
 	if (!crp)
 		return 0;
-	strhashval = clientstr_hashval(name, namlen);
+	strhashval = clientstr_hashval(name);
 	INIT_LIST_HEAD(&crp->cr_strhash);
 	list_add(&crp->cr_strhash, &reclaim_str_hashtbl[strhashval]);
-	memcpy(crp->cr_name.data, name, namlen);
-	crp->cr_name.len = namlen;
+	memcpy(crp->cr_recdir, name, HEXDIR_LEN);
 	reclaim_str_hashtbl_size++;
 	return 1;
 }
@@ -3122,7 +3118,6 @@ nfs4_release_reclaim(void)
 			crp = list_entry(reclaim_str_hashtbl[i].next,
 			                struct nfs4_client_reclaim, cr_strhash);
 			list_del(&crp->cr_strhash);
-			kfree(crp->cr_name.data);
 			kfree(crp);
 			reclaim_str_hashtbl_size--;
 		}
@@ -3145,13 +3140,14 @@ nfs4_find_reclaim_client(clientid_t *clid)
 	if (clp == NULL)
 		return NULL;
 
-	dprintk("NFSD: nfs4_find_reclaim_client for %.*s\n",
-		            clp->cl_name.len, clp->cl_name.data);
+	dprintk("NFSD: nfs4_find_reclaim_client for %.*s with recdir %s\n",
+		            clp->cl_name.len, clp->cl_name.data,
+			    clp->cl_recdir);
 
 	/* find clp->cl_name in reclaim_str_hashtbl */
-	strhashval = clientstr_hashval(clp->cl_name.data, clp->cl_name.len);
+	strhashval = clientstr_hashval(clp->cl_recdir);
 	list_for_each_entry(crp, &reclaim_str_hashtbl[strhashval], cr_strhash) {
-		if (cmp_name(&crp->cr_name, &clp->cl_name)) {
+		if (same_name(crp->cr_recdir, clp->cl_recdir)) {
 			return crp;
 		}
 	}
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 296e6429fc3b..fdaa84addadb 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -109,6 +109,8 @@ struct nfs4_callback {
 	struct rpc_clnt *       cb_client;
 };
 
+#define HEXDIR_LEN     33 /* hex version of 16 byte md5 of cl_name plus '\0' */
+
 /*
  * struct nfs4_client - one per client.  Clientids live here.
  * 	o Each nfs4_client is hashed by clientid.
@@ -126,6 +128,7 @@ struct nfs4_client {
 	struct list_head	cl_del_perclnt; /* list: delegations */
 	struct list_head        cl_lru;         /* tail queue */
 	struct xdr_netobj	cl_name; 	/* id generated by client */
+	char                    cl_recdir[HEXDIR_LEN]; /* recovery dir */
 	nfs4_verifier		cl_verifier; 	/* generated by client */
 	time_t                  cl_time;        /* time of last lease renewal */
 	u32			cl_addr; 	/* client ipaddress */
@@ -143,7 +146,7 @@ struct nfs4_client {
  */
 struct nfs4_client_reclaim {
 	struct list_head	cr_strhash;	/* hash by cr_name */
-	struct xdr_netobj 	cr_name; 	/* id generated by client */
+	char			cr_recdir[HEXDIR_LEN]; /* recover dir */
 };
 
 static inline void
@@ -283,6 +286,7 @@ extern void nfs4_free_stateowner(struct kref *kref);
 extern void nfsd4_probe_callback(struct nfs4_client *clp);
 extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
 extern void nfs4_put_delegation(struct nfs4_delegation *dp);
+extern int nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
 
 static inline void
 nfs4_put_stateowner(struct nfs4_stateowner *so)
-- 
cgit v1.2.3-59-g8ed1b


From fd39ca9a808c6026989bc2188868a0574eb37108 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 23 Jun 2005 22:04:03 -0700
Subject: [PATCH] knfsd: nfsd4: make needlessly global code static

This patch contains the following possible cleanups:

- make needlessly global code static

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4acl.c          |  4 ++--
 fs/nfsd/nfs4callback.c     |  7 +++---
 fs/nfsd/nfs4idmap.c        | 12 +++++-----
 fs/nfsd/nfs4state.c        | 57 ++++++++++++++++++++++++----------------------
 fs/nfsd/nfs4xdr.c          |  4 ++--
 include/linux/nfsd/state.h |  8 -------
 6 files changed, 43 insertions(+), 49 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 11ebf6c4aa54..4a2105552ac4 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -125,7 +125,7 @@ static short ace2type(struct nfs4_ace *);
 static int _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, unsigned int);
 static struct posix_acl *_nfsv4_to_posix_one(struct nfs4_acl *, unsigned int);
 int nfs4_acl_add_ace(struct nfs4_acl *, u32, u32, u32, int, uid_t);
-int nfs4_acl_split(struct nfs4_acl *, struct nfs4_acl *);
+static int nfs4_acl_split(struct nfs4_acl *, struct nfs4_acl *);
 
 struct nfs4_acl *
 nfs4_acl_posix_to_nfsv4(struct posix_acl *pacl, struct posix_acl *dpacl,
@@ -775,7 +775,7 @@ out_err:
 	return pacl;
 }
 
-int
+static int
 nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl)
 {
 	struct list_head *h, *n;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 38c3e1c47d83..68bb245491f6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -54,7 +54,6 @@
 
 /* declarations */
 static void nfs4_cb_null(struct rpc_task *task);
-extern spinlock_t recall_lock;
 
 /* Index of predefined Linux callback client operations */
 
@@ -329,12 +328,12 @@ out:
         .p_bufsiz = MAX(NFS4_##argtype##_sz,NFS4_##restype##_sz) << 2,  \
 }
 
-struct rpc_procinfo     nfs4_cb_procedures[] = {
+static struct rpc_procinfo     nfs4_cb_procedures[] = {
     PROC(CB_NULL,      NULL,     enc_cb_null,     dec_cb_null),
     PROC(CB_RECALL,    COMPOUND,   enc_cb_recall,      dec_cb_recall),
 };
 
-struct rpc_version              nfs_cb_version4 = {
+static struct rpc_version       nfs_cb_version4 = {
         .number                 = 1,
         .nrprocs                = sizeof(nfs4_cb_procedures)/sizeof(nfs4_cb_procedures[0]),
         .procs                  = nfs4_cb_procedures
@@ -348,7 +347,7 @@ static struct rpc_version *	nfs_cb_version[] = {
 /*
  * Use the SETCLIENTID credential
  */
-struct rpc_cred *
+static struct rpc_cred *
 nfsd4_lookupcred(struct nfs4_client *clp, int taskflags)
 {
         struct auth_cred acred;
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 4ba540841cf6..5605a26efc57 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -104,7 +104,7 @@ ent_update(struct ent *new, struct ent *itm)
 	ent_init(new, itm);
 }
 
-void
+static void
 ent_put(struct cache_head *ch, struct cache_detail *cd)
 {
 	if (cache_put(ch, cd)) {
@@ -186,7 +186,7 @@ warn_no_idmapd(struct cache_detail *detail)
 static int         idtoname_parse(struct cache_detail *, char *, int);
 static struct ent *idtoname_lookup(struct ent *, int);
 
-struct cache_detail idtoname_cache = {
+static struct cache_detail idtoname_cache = {
 	.hash_size	= ENT_HASHMAX,
 	.hash_table	= idtoname_table,
 	.name		= "nfs4.idtoname",
@@ -277,7 +277,7 @@ nametoid_hash(struct ent *ent)
 	return hash_str(ent->name, ENT_HASHBITS);
 }
 
-void
+static void
 nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp,
     int *blen)
 {
@@ -317,9 +317,9 @@ nametoid_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h)
 }
 
 static struct ent *nametoid_lookup(struct ent *, int);
-int                nametoid_parse(struct cache_detail *, char *, int);
+static int         nametoid_parse(struct cache_detail *, char *, int);
 
-struct cache_detail nametoid_cache = {
+static struct cache_detail nametoid_cache = {
 	.hash_size	= ENT_HASHMAX,
 	.hash_table	= nametoid_table,
 	.name		= "nfs4.nametoid",
@@ -330,7 +330,7 @@ struct cache_detail nametoid_cache = {
 	.warn_no_listener = warn_no_idmapd,
 };
 
-int
+static int
 nametoid_parse(struct cache_detail *cd, char *buf, int buflen)
 {
 	struct ent ent, *res;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1b2f67f5eef6..8a5f777b1e96 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -54,18 +54,21 @@
 /* Globals */
 static time_t lease_time = 90;     /* default lease time */
 static time_t user_lease_time = 90;
-time_t boot_time;
+static time_t boot_time;
 static int in_grace = 1;
 static u32 current_clientid = 1;
 static u32 current_ownerid = 1;
 static u32 current_fileid = 1;
 static u32 current_delegid = 1;
 static u32 nfs4_init;
-stateid_t zerostateid;             /* bits all 0 */
-stateid_t onestateid;              /* bits all 1 */
+static stateid_t zerostateid;             /* bits all 0 */
+static stateid_t onestateid;              /* bits all 1 */
+
+#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
+#define ONE_STATEID(stateid)  (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
 
 /* forward declarations */
-struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
+static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
 static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
 static void release_stateid_lockowners(struct nfs4_stateid *open_stp);
 
@@ -77,10 +80,10 @@ static void release_stateid_lockowners(struct nfs4_stateid *open_stp);
  */
 static DECLARE_MUTEX(client_sema);
 
-kmem_cache_t *stateowner_slab = NULL;
-kmem_cache_t *file_slab = NULL;
-kmem_cache_t *stateid_slab = NULL;
-kmem_cache_t *deleg_slab = NULL;
+static kmem_cache_t *stateowner_slab = NULL;
+static kmem_cache_t *file_slab = NULL;
+static kmem_cache_t *stateid_slab = NULL;
+static kmem_cache_t *deleg_slab = NULL;
 
 void
 nfs4_lock_state(void)
@@ -116,7 +119,7 @@ static void release_stateid(struct nfs4_stateid *stp, int flags);
  */
 
 /* recall_lock protects the del_recall_lru */
-spinlock_t recall_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t recall_lock = SPIN_LOCK_UNLOCKED;
 static struct list_head del_recall_lru;
 
 static void
@@ -456,7 +459,7 @@ check_name(struct xdr_netobj name) {
 	return 1;
 }
 
-void
+static void
 add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
 {
 	unsigned int idhashval;
@@ -468,7 +471,7 @@ add_to_unconfirmed(struct nfs4_client *clp, unsigned int strhashval)
 	clp->cl_time = get_seconds();
 }
 
-void
+static void
 move_to_confirmed(struct nfs4_client *clp)
 {
 	unsigned int idhashval = clientid_hashval(clp->cl_clientid.cl_id);
@@ -567,7 +570,7 @@ parse_octet(unsigned int *lenp, char **addrp)
 }
 
 /* parse and set the setclientid ipv4 callback address */
-int
+static int
 parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigned short *cbportp)
 {
 	int temp = 0;
@@ -603,7 +606,7 @@ parse_ipv4(unsigned int addr_len, char *addr_val, unsigned int *cbaddrp, unsigne
 	return 1;
 }
 
-void
+static void
 gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
 {
 	struct nfs4_callback *cb = &clp->cl_callback;
@@ -1186,7 +1189,7 @@ release_stateid(struct nfs4_stateid *stp, int flags)
 	stp = NULL;
 }
 
-void
+static void
 move_to_close_lru(struct nfs4_stateowner *sop)
 {
 	dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
@@ -1196,7 +1199,7 @@ move_to_close_lru(struct nfs4_stateowner *sop)
 	sop->so_time = get_seconds();
 }
 
-void
+static void
 release_state_owner(struct nfs4_stateid *stp, int flag)
 {
 	struct nfs4_stateowner *sop = stp->st_stateowner;
@@ -1250,7 +1253,7 @@ find_file(struct inode *ino)
 #define TEST_ACCESS(x) ((x > 0 || x < 4)?1:0)
 #define TEST_DENY(x) ((x >= 0 || x < 5)?1:0)
 
-void
+static void
 set_access(unsigned int *access, unsigned long bmap) {
 	int i;
 
@@ -1261,7 +1264,7 @@ set_access(unsigned int *access, unsigned long bmap) {
 	}
 }
 
-void
+static void
 set_deny(unsigned int *deny, unsigned long bmap) {
 	int i;
 
@@ -1287,7 +1290,7 @@ test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) {
  * Called to check deny when READ with all zero stateid or
  * WRITE with all zero or all one stateid
  */
-int
+static int
 nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
 {
 	struct inode *ino = current_fh->fh_dentry->d_inode;
@@ -1442,7 +1445,7 @@ int nfsd_change_deleg_cb(struct file_lock **onlist, int arg)
 		return -EAGAIN;
 }
 
-struct lock_manager_operations nfsd_lease_mng_ops = {
+static struct lock_manager_operations nfsd_lease_mng_ops = {
 	.fl_break = nfsd_break_deleg_cb,
 	.fl_release_private = nfsd_release_deleg_cb,
 	.fl_copy_lock = nfsd_copy_lock_deleg_cb,
@@ -1915,7 +1918,7 @@ end_grace(void)
 	in_grace = 0;
 }
 
-time_t
+static time_t
 nfs4_laundromat(void)
 {
 	struct nfs4_client *clp;
@@ -1996,7 +1999,7 @@ laundromat_main(void *not_used)
 /* search ownerid_hashtbl[] and close_lru for stateid owner
  * (stateid->si_stateownerid)
  */
-struct nfs4_stateowner *
+static struct nfs4_stateowner *
 find_openstateowner_id(u32 st_id, int flags) {
 	struct nfs4_stateowner *local = NULL;
 
@@ -2170,7 +2173,7 @@ out:
 /* 
  * Checks for sequence id mutating operations. 
  */
-int
+static int
 nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, clientid_t *lockclid)
 {
 	int status;
@@ -2486,7 +2489,7 @@ static struct list_head lock_ownerid_hashtbl[LOCK_HASH_SIZE];
 static struct list_head	lock_ownerstr_hashtbl[LOCK_HASH_SIZE];
 static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE];
 
-struct nfs4_stateid *
+static struct nfs4_stateid *
 find_stateid(stateid_t *stid, int flags)
 {
 	struct nfs4_stateid *local = NULL;
@@ -2550,7 +2553,7 @@ nfs4_transform_lock_offset(struct file_lock *lock)
 		lock->fl_end = OFFSET_MAX;
 }
 
-int
+static int
 nfs4_verify_lock_stateowner(struct nfs4_stateowner *sop, unsigned int hashval)
 {
 	struct nfs4_stateowner *local = NULL;
@@ -2660,7 +2663,7 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
 	return sop;
 }
 
-struct nfs4_stateid *
+static struct nfs4_stateid *
 alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struct nfs4_stateid *open_stp)
 {
 	struct nfs4_stateid *stp;
@@ -2691,7 +2694,7 @@ out:
 	return stp;
 }
 
-int
+static int
 check_lock_length(u64 offset, u64 length)
 {
 	return ((length == 0)  || ((length != ~(u64)0) &&
@@ -3149,7 +3152,7 @@ nfs4_release_reclaim(void)
 
 /*
  * called from OPEN, CLAIM_PREVIOUS with a new clientid. */
-struct nfs4_client_reclaim *
+static struct nfs4_client_reclaim *
 nfs4_find_reclaim_client(clientid_t *clid)
 {
 	unsigned int strhashval;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index cfe978a72cea..91fb171d2ace 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -136,7 +136,7 @@ xdr_error:					\
 	}					\
 } while (0)
 
-u32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes)
+static u32 *read_buf(struct nfsd4_compoundargs *argp, int nbytes)
 {
 	/* We want more bytes than seem to be available.
 	 * Maybe we need a new page, maybe we have just run out
@@ -190,7 +190,7 @@ defer_free(struct nfsd4_compoundargs *argp,
 	return 0;
 }
 
-char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes)
+static char *savemem(struct nfsd4_compoundargs *argp, u32 *p, int nbytes)
 {
 	void *new = NULL;
 	if (p == argp->tmp) {
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index fdaa84addadb..0e18ae22127d 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -61,11 +61,6 @@ typedef struct {
 #define si_stateownerid   si_opaque.so_stateownerid
 #define si_fileid         si_opaque.so_fileid
 
-extern stateid_t zerostateid;
-extern stateid_t onestateid;
-
-#define ZERO_STATEID(stateid)       (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
-#define ONE_STATEID(stateid)        (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
 
 struct nfs4_cb_recall {
 	u32			cbr_ident;
@@ -271,12 +266,9 @@ struct nfs4_stateid {
 	((err) != nfserr_stale_stateid) &&      \
 	((err) != nfserr_bad_stateid))
 
-extern time_t nfs4_laundromat(void);
 extern int nfsd4_renew(clientid_t *clid);
 extern int nfs4_preprocess_stateid_op(struct svc_fh *current_fh, 
 		stateid_t *stateid, int flags, struct file **filp);
-extern int nfs4_share_conflict(struct svc_fh *current_fh, 
-		unsigned int deny_type);
 extern void nfs4_lock_state(void);
 extern void nfs4_unlock_state(void);
 extern int nfs4_in_grace(void);
-- 
cgit v1.2.3-59-g8ed1b


From ea1da636e956ad1591a74904f23d98bbc26a644b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 23 Jun 2005 22:04:17 -0700
Subject: [PATCH] knfsd: nfsd4: rename state list fields

Trivial renaming patch:

I can never remember, while looking at various lists relating the nfsd4 state
structures, which are the "heads" and which are items on other lists, or which
structures are actually on the various lists.  The following convention helps
me: given structures foo and bar, with foo containing the head of a list of
bars, use "bars" for the name of the head of the list contained in the struct
foo, and use "per_foo" for the entries in the struct bars.

Already done for struct nfs4_file; go ahead and do it for the other nfsd4
state structures.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4state.c        | 78 +++++++++++++++++++++++-----------------------
 include/linux/nfsd/state.h | 18 +++++------
 2 files changed, 48 insertions(+), 48 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 884115198116..22e76e3f06a5 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -154,8 +154,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 	dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL);
 	if (dp == NULL)
 		return dp;
-	INIT_LIST_HEAD(&dp->dl_del_perfile);
-	INIT_LIST_HEAD(&dp->dl_del_perclnt);
+	INIT_LIST_HEAD(&dp->dl_perfile);
+	INIT_LIST_HEAD(&dp->dl_perclnt);
 	INIT_LIST_HEAD(&dp->dl_recall_lru);
 	dp->dl_client = clp;
 	get_nfs4_file(fp);
@@ -176,8 +176,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
 		        current_fh->fh_handle.fh_size);
 	dp->dl_time = 0;
 	atomic_set(&dp->dl_count, 1);
-	list_add(&dp->dl_del_perfile, &fp->fi_delegations);
-	list_add(&dp->dl_del_perclnt, &clp->cl_del_perclnt);
+	list_add(&dp->dl_perfile, &fp->fi_delegations);
+	list_add(&dp->dl_perclnt, &clp->cl_delegations);
 	return dp;
 }
 
@@ -214,8 +214,8 @@ nfs4_close_delegation(struct nfs4_delegation *dp)
 static void
 unhash_delegation(struct nfs4_delegation *dp)
 {
-	list_del_init(&dp->dl_del_perfile);
-	list_del_init(&dp->dl_del_perclnt);
+	list_del_init(&dp->dl_perfile);
+	list_del_init(&dp->dl_perclnt);
 	spin_lock(&recall_lock);
 	list_del_init(&dp->dl_recall_lru);
 	spin_unlock(&recall_lock);
@@ -345,11 +345,11 @@ expire_client(struct nfs4_client *clp)
 
 	INIT_LIST_HEAD(&reaplist);
 	spin_lock(&recall_lock);
-	while (!list_empty(&clp->cl_del_perclnt)) {
-		dp = list_entry(clp->cl_del_perclnt.next, struct nfs4_delegation, dl_del_perclnt);
+	while (!list_empty(&clp->cl_delegations)) {
+		dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
 		dprintk("NFSD: expire client. dp %p, fp %p\n", dp,
 				dp->dl_flock);
-		list_del_init(&dp->dl_del_perclnt);
+		list_del_init(&dp->dl_perclnt);
 		list_move(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&recall_lock);
@@ -361,8 +361,8 @@ expire_client(struct nfs4_client *clp)
 	list_del(&clp->cl_idhash);
 	list_del(&clp->cl_strhash);
 	list_del(&clp->cl_lru);
-	while (!list_empty(&clp->cl_perclient)) {
-		sop = list_entry(clp->cl_perclient.next, struct nfs4_stateowner, so_perclient);
+	while (!list_empty(&clp->cl_openowners)) {
+		sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
 		release_stateowner(sop);
 	}
 	put_nfs4_client(clp);
@@ -380,8 +380,8 @@ create_client(struct xdr_netobj name, char *recdir) {
 	clp->cl_callback.cb_parsed = 0;
 	INIT_LIST_HEAD(&clp->cl_idhash);
 	INIT_LIST_HEAD(&clp->cl_strhash);
-	INIT_LIST_HEAD(&clp->cl_perclient);
-	INIT_LIST_HEAD(&clp->cl_del_perclnt);
+	INIT_LIST_HEAD(&clp->cl_openowners);
+	INIT_LIST_HEAD(&clp->cl_delegations);
 	INIT_LIST_HEAD(&clp->cl_lru);
 out:
 	return clp;
@@ -1074,13 +1074,13 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
 	INIT_LIST_HEAD(&sop->so_idhash);
 	INIT_LIST_HEAD(&sop->so_strhash);
 	INIT_LIST_HEAD(&sop->so_perclient);
-	INIT_LIST_HEAD(&sop->so_perfilestate);
-	INIT_LIST_HEAD(&sop->so_perlockowner);  /* not used */
+	INIT_LIST_HEAD(&sop->so_stateids);
+	INIT_LIST_HEAD(&sop->so_perstateid);  /* not used */
 	INIT_LIST_HEAD(&sop->so_close_lru);
 	sop->so_time = 0;
 	list_add(&sop->so_idhash, &ownerid_hashtbl[idhashval]);
 	list_add(&sop->so_strhash, &ownerstr_hashtbl[strhashval]);
-	list_add(&sop->so_perclient, &clp->cl_perclient);
+	list_add(&sop->so_perclient, &clp->cl_openowners);
 	sop->so_is_open_owner = 1;
 	sop->so_id = current_ownerid++;
 	sop->so_client = clp;
@@ -1098,10 +1098,10 @@ release_stateid_lockowners(struct nfs4_stateid *open_stp)
 {
 	struct nfs4_stateowner *lock_sop;
 
-	while (!list_empty(&open_stp->st_perlockowner)) {
-		lock_sop = list_entry(open_stp->st_perlockowner.next,
-				struct nfs4_stateowner, so_perlockowner);
-		/* list_del(&open_stp->st_perlockowner);  */
+	while (!list_empty(&open_stp->st_lockowners)) {
+		lock_sop = list_entry(open_stp->st_lockowners.next,
+				struct nfs4_stateowner, so_perstateid);
+		/* list_del(&open_stp->st_lockowners);  */
 		BUG_ON(lock_sop->so_is_open_owner);
 		release_stateowner(lock_sop);
 	}
@@ -1116,10 +1116,10 @@ unhash_stateowner(struct nfs4_stateowner *sop)
 	list_del(&sop->so_strhash);
 	if (sop->so_is_open_owner)
 		list_del(&sop->so_perclient);
-	list_del(&sop->so_perlockowner);
-	while (!list_empty(&sop->so_perfilestate)) {
-		stp = list_entry(sop->so_perfilestate.next, 
-			struct nfs4_stateid, st_perfilestate);
+	list_del(&sop->so_perstateid);
+	while (!list_empty(&sop->so_stateids)) {
+		stp = list_entry(sop->so_stateids.next,
+			struct nfs4_stateid, st_perstateowner);
 		if (sop->so_is_open_owner)
 			release_stateid(stp, OPEN_STATE);
 		else
@@ -1141,11 +1141,11 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
 	unsigned int hashval = stateid_hashval(sop->so_id, fp->fi_id);
 
 	INIT_LIST_HEAD(&stp->st_hash);
-	INIT_LIST_HEAD(&stp->st_perfilestate);
-	INIT_LIST_HEAD(&stp->st_perlockowner);
+	INIT_LIST_HEAD(&stp->st_perstateowner);
+	INIT_LIST_HEAD(&stp->st_lockowners);
 	INIT_LIST_HEAD(&stp->st_perfile);
 	list_add(&stp->st_hash, &stateid_hashtbl[hashval]);
-	list_add(&stp->st_perfilestate, &sop->so_perfilestate);
+	list_add(&stp->st_perstateowner, &sop->so_stateids);
 	list_add(&stp->st_perfile, &fp->fi_stateids);
 	stp->st_stateowner = sop;
 	get_nfs4_file(fp);
@@ -1167,7 +1167,7 @@ release_stateid(struct nfs4_stateid *stp, int flags)
 
 	list_del(&stp->st_hash);
 	list_del(&stp->st_perfile);
-	list_del(&stp->st_perfilestate);
+	list_del(&stp->st_perstateowner);
 	if (flags & OPEN_STATE) {
 		release_stateid_lockowners(stp);
 		stp->st_vfs_file = NULL;
@@ -1201,7 +1201,7 @@ release_state_owner(struct nfs4_stateid *stp, int flag)
 	 * released by the laundromat service after the lease period
 	 * to enable us to handle CLOSE replay
 	 */
-	if (sop->so_confirmed && list_empty(&sop->so_perfilestate))
+	if (sop->so_confirmed && list_empty(&sop->so_stateids))
 		move_to_close_lru(sop);
 }
 
@@ -1548,7 +1548,7 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
 {
 	struct nfs4_delegation *dp;
 
-	list_for_each_entry(dp, &fp->fi_delegations, dl_del_perfile) {
+	list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) {
 		if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid)
 			return dp;
 	}
@@ -1892,7 +1892,7 @@ nfsd4_renew(clientid_t *clid)
 	}
 	renew_client(clp);
 	status = nfserr_cb_path_down;
-	if (!list_empty(&clp->cl_del_perclnt)
+	if (!list_empty(&clp->cl_delegations)
 			&& !atomic_read(&clp->cl_callback.cb_set))
 		goto out;
 	status = nfs_ok;
@@ -2634,13 +2634,13 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
 	INIT_LIST_HEAD(&sop->so_idhash);
 	INIT_LIST_HEAD(&sop->so_strhash);
 	INIT_LIST_HEAD(&sop->so_perclient);
-	INIT_LIST_HEAD(&sop->so_perfilestate);
-	INIT_LIST_HEAD(&sop->so_perlockowner);
+	INIT_LIST_HEAD(&sop->so_stateids);
+	INIT_LIST_HEAD(&sop->so_perstateid);
 	INIT_LIST_HEAD(&sop->so_close_lru); /* not used */
 	sop->so_time = 0;
 	list_add(&sop->so_idhash, &lock_ownerid_hashtbl[idhashval]);
 	list_add(&sop->so_strhash, &lock_ownerstr_hashtbl[strhashval]);
-	list_add(&sop->so_perlockowner, &open_stp->st_perlockowner);
+	list_add(&sop->so_perstateid, &open_stp->st_lockowners);
 	sop->so_is_open_owner = 0;
 	sop->so_id = current_ownerid++;
 	sop->so_client = clp;
@@ -2664,11 +2664,11 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
 		goto out;
 	INIT_LIST_HEAD(&stp->st_hash);
 	INIT_LIST_HEAD(&stp->st_perfile);
-	INIT_LIST_HEAD(&stp->st_perfilestate);
-	INIT_LIST_HEAD(&stp->st_perlockowner); /* not used */
+	INIT_LIST_HEAD(&stp->st_perstateowner);
+	INIT_LIST_HEAD(&stp->st_lockowners); /* not used */
 	list_add(&stp->st_hash, &lockstateid_hashtbl[hashval]);
 	list_add(&stp->st_perfile, &fp->fi_stateids);
-	list_add(&stp->st_perfilestate, &sop->so_perfilestate);
+	list_add(&stp->st_perstateowner, &sop->so_stateids);
 	stp->st_stateowner = sop;
 	get_nfs4_file(fp);
 	stp->st_file = fp;
@@ -3081,8 +3081,8 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, struct nfsd4_release_lockowner *
 		/* check for any locks held by any stateid
 		 * associated with the (lock) stateowner */
 		status = nfserr_locks_held;
-		list_for_each_entry(stp, &local->so_perfilestate,
-				st_perfilestate) {
+		list_for_each_entry(stp, &local->so_stateids,
+				st_perstateowner) {
 			if (check_for_locks(stp->st_vfs_file, local))
 				goto out;
 		}
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 0e18ae22127d..f4f27b76ee64 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -72,8 +72,8 @@ struct nfs4_cb_recall {
 };
 
 struct nfs4_delegation {
-	struct list_head	dl_del_perfile; /* nfs4_file->fi_del_perfile */
-	struct list_head	dl_del_perclnt; /* nfs4_client->cl_del_perclnt*/
+	struct list_head	dl_perfile;
+	struct list_head	dl_perclnt;
 	struct list_head	dl_recall_lru;  /* delegation recalled */
 	atomic_t		dl_count;       /* ref count */
 	struct nfs4_client	*dl_client;
@@ -119,8 +119,8 @@ struct nfs4_callback {
 struct nfs4_client {
 	struct list_head	cl_idhash; 	/* hash by cl_clientid.id */
 	struct list_head	cl_strhash; 	/* hash by cl_name */
-	struct list_head	cl_perclient; 	/* list: stateowners */
-	struct list_head	cl_del_perclnt; /* list: delegations */
+	struct list_head	cl_openowners;
+	struct list_head	cl_delegations;
 	struct list_head        cl_lru;         /* tail queue */
 	struct xdr_netobj	cl_name; 	/* id generated by client */
 	char                    cl_recdir[HEXDIR_LEN]; /* recovery dir */
@@ -195,9 +195,9 @@ struct nfs4_stateowner {
 	struct kref		so_ref;
 	struct list_head        so_idhash;   /* hash by so_id */
 	struct list_head        so_strhash;   /* hash by op_name */
-	struct list_head        so_perclient; /* nfs4_client->cl_perclient */
-	struct list_head        so_perfilestate; /* list: nfs4_stateid */
-	struct list_head        so_perlockowner; /* nfs4_stateid->st_perlockowner */
+	struct list_head        so_perclient;
+	struct list_head        so_stateids;
+	struct list_head        so_perstateid; /* for lockowners only */
 	struct list_head	so_close_lru; /* tail queue */
 	time_t			so_time; /* time of placement on so_close_lru */
 	int			so_is_open_owner; /* 1=openowner,0=lockowner */
@@ -240,8 +240,8 @@ struct nfs4_file {
 struct nfs4_stateid {
 	struct list_head              st_hash; 
 	struct list_head              st_perfile;
-	struct list_head              st_perfilestate; 
-	struct list_head              st_perlockowner;
+	struct list_head              st_perstateowner;
+	struct list_head              st_lockowners;
 	struct nfs4_stateowner      * st_stateowner;
 	struct nfs4_file            * st_file;
 	stateid_t                     st_stateid;
-- 
cgit v1.2.3-59-g8ed1b


From cb36d6345752fa24827044c68e15f6708a40d9f6 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 23 Jun 2005 22:04:23 -0700
Subject: [PATCH] knfsd: nfsd4: remove cb_parsed

The cb_parsed field is only used by probe_callback, to determine whether the
callback information has been filled in by setclientid.  But there is no way
that probe_callback() can be called without that having already happened, so
that check is superfluous, as is cb_parsed.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4callback.c     | 4 +---
 fs/nfsd/nfs4state.c        | 5 +----
 include/linux/nfsd/state.h | 1 -
 3 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 68bb245491f6..583c0710e45e 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -386,9 +386,7 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 	char                    hostname[32];
 	int status;
 
-	dprintk("NFSD: probe_callback. cb_parsed %d cb_set %d\n",
-			cb->cb_parsed, atomic_read(&cb->cb_set));
-	if (!cb->cb_parsed || atomic_read(&cb->cb_set))
+	if (atomic_read(&cb->cb_set))
 		return;
 
 	/* Initialize address */
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 26d00465c28a..0b47a97e953d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -377,7 +377,6 @@ create_client(struct xdr_netobj name, char *recdir) {
 	memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
 	atomic_set(&clp->cl_count, 1);
 	atomic_set(&clp->cl_callback.cb_set, 0);
-	clp->cl_callback.cb_parsed = 0;
 	INIT_LIST_HEAD(&clp->cl_idhash);
 	INIT_LIST_HEAD(&clp->cl_strhash);
 	INIT_LIST_HEAD(&clp->cl_openowners);
@@ -620,14 +619,12 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se)
 		goto out_err;
 	cb->cb_prog = se->se_callback_prog;
 	cb->cb_ident = se->se_callback_ident;
-	cb->cb_parsed = 1;
 	return;
 out_err:
 	printk(KERN_INFO "NFSD: this client (clientid %08x/%08x) "
 		"will not receive delegations\n",
 		clp->cl_clientid.cl_boot, clp->cl_clientid.cl_id);
 
-	cb->cb_parsed = 0;
 	return;
 }
 
@@ -872,7 +869,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_setclientid_confi
 		else {
 			/* XXX: We just turn off callbacks until we can handle
 			  * change request correctly. */
-			conf->cl_callback.cb_parsed = 0;
+			atomic_set(&conf->cl_callback.cb_set, 0);
 			gen_confirm(conf);
 			expire_client(unconf);
 			status = nfs_ok;
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index f4f27b76ee64..83d29ec03a58 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -92,7 +92,6 @@ struct nfs4_delegation {
 /* client delegation callback info */
 struct nfs4_callback {
 	/* SETCLIENTID info */
-	u32			cb_parsed;  /* addr parsed */
 	u32                     cb_addr;
 	unsigned short          cb_port;
 	u32                     cb_prog;
-- 
cgit v1.2.3-59-g8ed1b


From 190e4fbf96037e5e526ba3210f2bcc2a3b6fe964 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 23 Jun 2005 22:04:25 -0700
Subject: [PATCH] knfsd: nfsd4: initialize recovery directory

NFSv4 clients are required to know what state they have on the server so that
they can reclaim it on server reboot.  However, it is possible for
pathological combinations of server reboots and network partitions to leave a
client in a state where it cannot know whether it has lost its state on the
server.

For this reason, rfc3530 requires that we store some information about clients
to stable storage.

So we maintain a directory /var/lib/nfs/v4recovery with a subdirectory for
each client with active state.  We leave open the possibility of including
files underneath each such subdirectory with information about the client, but
for now the subdirectories are empty.

We create a client subdirectory whenever a client makes its first non-reclaim
open_confirm.

We remove a client subdirectory whenever either
	a) its lease expires, or
	b) the grace period ends without it reclaiming anything.

When handling reclaims, we allow the reclaim if and only if the client doing
the reclaim has a subdirectory.

This patch adds just the code to scan the recovery directory on nfsd startup.

Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4recover.c      | 166 +++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs4state.c        |  18 ++++-
 include/linux/nfsd/state.h |   4 ++
 3 files changed, 186 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 841a305d7948..2dc9851a1d37 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -39,6 +39,9 @@
 #include <linux/nfs4.h>
 #include <linux/nfsd/state.h>
 #include <linux/nfsd/xdr4.h>
+#include <linux/param.h>
+#include <linux/file.h>
+#include <linux/namei.h>
 #include <asm/uaccess.h>
 #include <asm/scatterlist.h>
 #include <linux/crypto.h>
@@ -46,6 +49,27 @@
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
+/* Globals */
+char recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
+static struct nameidata rec_dir;
+static int rec_dir_init = 0;
+
+static void
+nfs4_save_user(uid_t *saveuid, gid_t *savegid)
+{
+	*saveuid = current->fsuid;
+	*savegid = current->fsgid;
+	current->fsuid = 0;
+	current->fsgid = 0;
+}
+
+static void
+nfs4_reset_user(uid_t saveuid, gid_t savegid)
+{
+	current->fsuid = saveuid;
+	current->fsgid = savegid;
+}
+
 static void
 md5_to_hex(char *out, char *md5)
 {
@@ -95,3 +119,145 @@ out:
 		crypto_free_tfm(tfm);
 	return status;
 }
+
+typedef int (recdir_func)(struct dentry *, struct dentry *);
+
+struct dentry_list {
+	struct dentry *dentry;
+	struct list_head list;
+};
+
+struct dentry_list_arg {
+	struct list_head dentries;
+	struct dentry *parent;
+};
+
+static int
+nfsd4_build_dentrylist(void *arg, const char *name, int namlen,
+		loff_t offset, ino_t ino, unsigned int d_type)
+{
+	struct dentry_list_arg *dla = arg;
+	struct list_head *dentries = &dla->dentries;
+	struct dentry *parent = dla->parent;
+	struct dentry *dentry;
+	struct dentry_list *child;
+
+	if (name && isdotent(name, namlen))
+		return nfs_ok;
+	dentry = lookup_one_len(name, parent, namlen);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+	child = kmalloc(sizeof(*child), GFP_KERNEL);
+	if (child == NULL)
+		return -ENOMEM;
+	child->dentry = dentry;
+	list_add(&child->list, dentries);
+	return 0;
+}
+
+static int
+nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
+{
+	struct file *filp;
+	struct dentry_list_arg dla = {
+		.parent = dir,
+	};
+	struct list_head *dentries = &dla.dentries;
+	struct dentry_list *child;
+	uid_t uid;
+	gid_t gid;
+	int status;
+
+	if (!rec_dir_init)
+		return 0;
+
+	nfs4_save_user(&uid, &gid);
+
+	filp = dentry_open(dget(dir), mntget(rec_dir.mnt),
+			O_RDWR);
+	status = PTR_ERR(filp);
+	if (IS_ERR(filp))
+		goto out;
+	INIT_LIST_HEAD(dentries);
+	status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla);
+	fput(filp);
+	while (!list_empty(dentries)) {
+		child = list_entry(dentries->next, struct dentry_list, list);
+		status = f(dir, child->dentry);
+		if (status)
+			goto out;
+		list_del(&child->list);
+		dput(child->dentry);
+		kfree(child);
+	}
+out:
+	while (!list_empty(dentries)) {
+		child = list_entry(dentries->next, struct dentry_list, list);
+		list_del(&child->list);
+		dput(child->dentry);
+		kfree(child);
+	}
+	nfs4_reset_user(uid, gid);
+	return status;
+}
+
+static int
+load_recdir(struct dentry *parent, struct dentry *child)
+{
+	if (child->d_name.len != HEXDIR_LEN - 1) {
+		printk("nfsd4: illegal name %s in recovery directory\n",
+				child->d_name.name);
+		/* Keep trying; maybe the others are OK: */
+		return nfs_ok;
+	}
+	nfs4_client_to_reclaim(child->d_name.name);
+	return nfs_ok;
+}
+
+int
+nfsd4_recdir_load(void) {
+	int status;
+
+	status = nfsd4_list_rec_dir(rec_dir.dentry, load_recdir);
+	if (status)
+		printk("nfsd4: failed loading clients from recovery"
+			" directory %s\n", rec_dir.dentry->d_name.name);
+	return status;
+}
+
+/*
+ * Hold reference to the recovery directory.
+ */
+
+void
+nfsd4_init_recdir(char *rec_dirname)
+{
+	uid_t			uid = 0;
+	gid_t			gid = 0;
+	int 			status;
+
+	printk("NFSD: Using %s as the NFSv4 state recovery directory\n",
+			rec_dirname);
+
+	BUG_ON(rec_dir_init);
+
+	nfs4_save_user(&uid, &gid);
+
+	status = path_lookup(rec_dirname, LOOKUP_FOLLOW, &rec_dir);
+	if (status == -ENOENT)
+		printk("NFSD: recovery directory %s doesn't exist\n",
+				rec_dirname);
+
+	if (!status)
+		rec_dir_init = 1;
+	nfs4_reset_user(uid, gid);
+}
+
+void
+nfsd4_shutdown_recdir(void)
+{
+	if (!rec_dir_init)
+		return;
+	rec_dir_init = 0;
+	path_release(&rec_dir);
+}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0b47a97e953d..6b9d23c39afe 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -71,6 +71,7 @@ static stateid_t onestateid;              /* bits all 1 */
 static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
 static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
 static void release_stateid_lockowners(struct nfs4_stateid *open_stp);
+extern char recovery_dirname[];
 
 /* Locking:
  *
@@ -3091,8 +3092,8 @@ alloc_reclaim(void)
 /*
  * failure => all reset bets are off, nfserr_no_grace...
  */
-static int
-nfs4_client_to_reclaim(char *name)
+int
+nfs4_client_to_reclaim(const char *name)
 {
 	unsigned int strhashval;
 	struct nfs4_client_reclaim *crp = NULL;
@@ -3202,6 +3203,17 @@ nfs4_state_init(void)
 	reclaim_str_hashtbl_size = 0;
 }
 
+static void
+nfsd4_load_reboot_recovery_data(void)
+{
+	int status;
+
+	nfsd4_init_recdir(recovery_dirname);
+	status = nfsd4_recdir_load();
+	if (status)
+		printk("NFSD: Failure reading reboot recovery data\n");
+}
+
 /* initialization to perform when the nfsd service is started: */
 
 static void
@@ -3228,6 +3240,7 @@ nfs4_state_start(void)
 	status = nfsd4_init_slabs();
 	if (status)
 		return status;
+	nfsd4_load_reboot_recovery_data();
 	__nfs4_state_start();
 	nfs4_init = 1;
 	return 0;
@@ -3286,6 +3299,7 @@ __nfs4_state_shutdown(void)
 	cancel_delayed_work(&laundromat_work);
 	flush_workqueue(laundry_wq);
 	destroy_workqueue(laundry_wq);
+	nfsd4_shutdown_recdir();
 	nfs4_init = 0;
 }
 
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 83d29ec03a58..19481ab122df 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -278,6 +278,10 @@ extern void nfsd4_probe_callback(struct nfs4_client *clp);
 extern void nfsd4_cb_recall(struct nfs4_delegation *dp);
 extern void nfs4_put_delegation(struct nfs4_delegation *dp);
 extern int nfs4_make_rec_clidname(char *clidname, struct xdr_netobj *clname);
+extern void nfsd4_init_recdir(char *recdir_name);
+extern int nfsd4_recdir_load(void);
+extern void nfsd4_shutdown_recdir(void);
+extern int nfs4_client_to_reclaim(const char *name);
 
 static inline void
 nfs4_put_stateowner(struct nfs4_stateowner *so)
-- 
cgit v1.2.3-59-g8ed1b


From c7b9a45927e74c81d6562153f7fde9d32da00159 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 23 Jun 2005 22:04:30 -0700
Subject: [PATCH] knfsd: nfsd4: reboot recovery

This patch adds the code to create and remove client subdirectories from the
recovery directory, as described in the previous patch comment.

Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4recover.c      | 169 +++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/nfs4state.c        |  16 +++++
 include/linux/nfsd/state.h |   5 ++
 3 files changed, 190 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 2dc9851a1d37..2805c5245eac 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -120,6 +120,70 @@ out:
 	return status;
 }
 
+static int
+nfsd4_rec_fsync(struct dentry *dentry)
+{
+	struct file *filp;
+	int status = nfs_ok;
+
+	dprintk("NFSD: nfs4_fsync_rec_dir\n");
+	filp = dentry_open(dget(dentry), mntget(rec_dir.mnt), O_RDWR);
+	if (IS_ERR(filp)) {
+		status = PTR_ERR(filp);
+		goto out;
+	}
+	if (filp->f_op && filp->f_op->fsync)
+		status = filp->f_op->fsync(filp, filp->f_dentry, 0);
+	fput(filp);
+out:
+	if (status)
+		printk("nfsd4: unable to sync recovery directory\n");
+	return status;
+}
+
+int
+nfsd4_create_clid_dir(struct nfs4_client *clp)
+{
+	char *dname = clp->cl_recdir;
+	struct dentry *dentry;
+	uid_t uid;
+	gid_t gid;
+	int status;
+
+	dprintk("NFSD: nfsd4_create_clid_dir for \"%s\"\n", dname);
+
+	if (!rec_dir_init || clp->cl_firststate)
+		return 0;
+
+	nfs4_save_user(&uid, &gid);
+
+	/* lock the parent */
+	down(&rec_dir.dentry->d_inode->i_sem);
+
+	dentry = lookup_one_len(dname, rec_dir.dentry, HEXDIR_LEN-1);
+	if (IS_ERR(dentry)) {
+		status = PTR_ERR(dentry);
+		goto out_unlock;
+	}
+	status = -EEXIST;
+	if (dentry->d_inode) {
+		dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
+		goto out_put;
+	}
+	status = vfs_mkdir(rec_dir.dentry->d_inode, dentry, S_IRWXU);
+out_put:
+	dput(dentry);
+out_unlock:
+	up(&rec_dir.dentry->d_inode->i_sem);
+	if (status == 0) {
+		clp->cl_firststate = 1;
+		status = nfsd4_rec_fsync(rec_dir.dentry);
+	}
+	nfs4_reset_user(uid, gid);
+	dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
+	return status;
+}
+
 typedef int (recdir_func)(struct dentry *, struct dentry *);
 
 struct dentry_list {
@@ -201,6 +265,111 @@ out:
 	return status;
 }
 
+static int
+nfsd4_remove_clid_file(struct dentry *dir, struct dentry *dentry)
+{
+	int status;
+
+	if (!S_ISREG(dir->d_inode->i_mode)) {
+		printk("nfsd4: non-file found in client recovery directory\n");
+		return -EINVAL;
+	}
+	down(&dir->d_inode->i_sem);
+	status = vfs_unlink(dir->d_inode, dentry);
+	up(&dir->d_inode->i_sem);
+	return status;
+}
+
+static int
+nfsd4_clear_clid_dir(struct dentry *dir, struct dentry *dentry)
+{
+	int status;
+
+	/* For now this directory should already be empty, but we empty it of
+	 * any regular files anyway, just in case the directory was created by
+	 * a kernel from the future.... */
+	nfsd4_list_rec_dir(dentry, nfsd4_remove_clid_file);
+	down(&dir->d_inode->i_sem);
+	status = vfs_rmdir(dir->d_inode, dentry);
+	up(&dir->d_inode->i_sem);
+	return status;
+}
+
+static int
+nfsd4_unlink_clid_dir(char *name, int namlen)
+{
+	struct dentry *dentry;
+	int status;
+
+	dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
+
+	dentry = lookup_one_len(name, rec_dir.dentry, namlen);
+	if (IS_ERR(dentry)) {
+		status = PTR_ERR(dentry);
+		return status;
+	}
+	status = -ENOENT;
+	if (!dentry->d_inode)
+		goto out;
+
+	status = nfsd4_clear_clid_dir(rec_dir.dentry, dentry);
+out:
+	dput(dentry);
+	return status;
+}
+
+void
+nfsd4_remove_clid_dir(struct nfs4_client *clp)
+{
+	uid_t uid;
+	gid_t gid;
+	int status;
+
+	if (!rec_dir_init || !clp->cl_firststate)
+		return;
+
+	nfs4_save_user(&uid, &gid);
+	status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
+	nfs4_reset_user(uid, gid);
+	if (status == 0)
+		status = nfsd4_rec_fsync(rec_dir.dentry);
+	if (status)
+		printk("NFSD: Failed to remove expired client state directory"
+				" %.*s\n", HEXDIR_LEN, clp->cl_recdir);
+	return;
+}
+
+static int
+purge_old(struct dentry *parent, struct dentry *child)
+{
+	int status;
+
+	if (nfs4_has_reclaimed_state(child->d_name.name))
+		return nfs_ok;
+
+	status = nfsd4_clear_clid_dir(parent, child);
+	if (status)
+		printk("failed to remove client recovery directory %s\n",
+				child->d_name.name);
+	/* Keep trying, success or failure: */
+	return nfs_ok;
+}
+
+void
+nfsd4_recdir_purge_old(void) {
+	int status;
+
+	if (!rec_dir_init)
+		return;
+	status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old);
+	if (status == 0)
+		status = nfsd4_rec_fsync(rec_dir.dentry);
+	if (status)
+		printk("nfsd4: failed to purge old clients from recovery"
+			" directory %s\n", rec_dir.dentry->d_name.name);
+	return;
+}
+
 static int
 load_recdir(struct dentry *parent, struct dentry *child)
 {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6b9d23c39afe..6cca358cd650 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -905,6 +905,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp, struct nfsd4_setclientid_confi
 			conf = find_confirmed_client_by_str(unconf->cl_recdir,
 									hash);
 			if (conf) {
+				nfsd4_remove_clid_dir(conf);
 				expire_client(conf);
 			}
 			move_to_confirmed(unconf);
@@ -1691,6 +1692,7 @@ nfs4_set_claim_prev(struct nfsd4_open *open, int *status)
 			*status = nfserr_reclaim_bad;
 		else {
 			open->op_stateowner->so_confirmed = 1;
+			open->op_stateowner->so_client->cl_firststate = 1;
 			open->op_stateowner->so_seqid--;
 		}
 	}
@@ -1903,6 +1905,7 @@ static void
 end_grace(void)
 {
 	dprintk("NFSD: end of grace period\n");
+	nfsd4_recdir_purge_old();
 	in_grace = 0;
 }
 
@@ -1932,6 +1935,7 @@ nfs4_laundromat(void)
 		}
 		dprintk("NFSD: purging unused client (clientid %08x)\n",
 			clp->cl_clientid.cl_id);
+		nfsd4_remove_clid_dir(clp);
 		expire_client(clp);
 	}
 	INIT_LIST_HEAD(&reaplist);
@@ -2320,6 +2324,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
 		         stp->st_stateid.si_stateownerid,
 		         stp->st_stateid.si_fileid,
 		         stp->st_stateid.si_generation);
+
+	nfsd4_create_clid_dir(sop->so_client);
 out:
 	if (oc->oc_stateowner)
 		nfs4_get_stateowner(oc->oc_stateowner);
@@ -3089,6 +3095,16 @@ alloc_reclaim(void)
 	return kmalloc(sizeof(struct nfs4_client_reclaim), GFP_KERNEL);
 }
 
+int
+nfs4_has_reclaimed_state(const char *name)
+{
+	unsigned int strhashval = clientstr_hashval(name);
+	struct nfs4_client *clp;
+
+	clp = find_confirmed_client_by_str(name, strhashval);
+	return clp ? 1 : 0;
+}
+
 /*
  * failure => all reset bets are off, nfserr_no_grace...
  */
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 19481ab122df..a84a3fa99be1 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -131,6 +131,7 @@ struct nfs4_client {
 	nfs4_verifier		cl_confirm;	/* generated by server */
 	struct nfs4_callback	cl_callback;    /* callback info */
 	atomic_t		cl_count;	/* ref count */
+	u32			cl_firststate;	/* recovery dir creation */
 };
 
 /* struct nfs4_client_reset
@@ -282,6 +283,10 @@ extern void nfsd4_init_recdir(char *recdir_name);
 extern int nfsd4_recdir_load(void);
 extern void nfsd4_shutdown_recdir(void);
 extern int nfs4_client_to_reclaim(const char *name);
+extern int nfs4_has_reclaimed_state(const char *name);
+extern void nfsd4_recdir_purge_old(void);
+extern int nfsd4_create_clid_dir(struct nfs4_client *clp);
+extern void nfsd4_remove_clid_dir(struct nfs4_client *clp);
 
 static inline void
 nfs4_put_stateowner(struct nfs4_stateowner *so)
-- 
cgit v1.2.3-59-g8ed1b


From 0964a3d3f1aa96468091924f6b0c391a46dc6d0b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 23 Jun 2005 22:04:32 -0700
Subject: [PATCH] knfsd: nfsd4 reboot dirname fix

Set the recovery directory via /proc/fs/nfsd/nfs4recoverydir.

It may be changed any time, but is used only on startup.

Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4recover.c     |  1 -
 fs/nfsd/nfs4state.c       | 37 +++++++++++++++++++++++++++++++++++--
 fs/nfsd/nfsctl.c          | 23 +++++++++++++++++++++++
 include/linux/nfsd/nfsd.h |  2 ++
 4 files changed, 60 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 2805c5245eac..095f1740f3ae 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -50,7 +50,6 @@
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
 /* Globals */
-char recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
 static struct nameidata rec_dir;
 static int rec_dir_init = 0;
 
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6cca358cd650..89e36526d7f2 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -48,6 +48,7 @@
 #include <linux/nfs4.h>
 #include <linux/nfsd/state.h>
 #include <linux/nfsd/xdr4.h>
+#include <linux/namei.h>
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
@@ -71,7 +72,8 @@ static stateid_t onestateid;              /* bits all 1 */
 static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
 static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
 static void release_stateid_lockowners(struct nfs4_stateid *open_stp);
-extern char recovery_dirname[];
+static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
+static void nfs4_set_recdir(char *recdir);
 
 /* Locking:
  *
@@ -3224,8 +3226,10 @@ nfsd4_load_reboot_recovery_data(void)
 {
 	int status;
 
-	nfsd4_init_recdir(recovery_dirname);
+	nfs4_lock_state();
+	nfsd4_init_recdir(user_recovery_dirname);
 	status = nfsd4_recdir_load();
+	nfs4_unlock_state();
 	if (status)
 		printk("NFSD: Failure reading reboot recovery data\n");
 }
@@ -3329,6 +3333,35 @@ nfs4_state_shutdown(void)
 	nfs4_unlock_state();
 }
 
+static void
+nfs4_set_recdir(char *recdir)
+{
+	nfs4_lock_state();
+	strcpy(user_recovery_dirname, recdir);
+	nfs4_unlock_state();
+}
+
+/*
+ * Change the NFSv4 recovery directory to recdir.
+ */
+int
+nfs4_reset_recoverydir(char *recdir)
+{
+	int status;
+	struct nameidata nd;
+
+	status = path_lookup(recdir, LOOKUP_FOLLOW, &nd);
+	if (status)
+		return status;
+	status = -ENOTDIR;
+	if (S_ISDIR(nd.dentry->d_inode->i_mode)) {
+		nfs4_set_recdir(recdir);
+		status = 0;
+	}
+	path_release(&nd);
+	return status;
+}
+
 /*
  * Called when leasetime is changed.
  *
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3da43a3ed32c..841c562991e8 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -51,6 +51,7 @@ enum {
 	NFSD_Fh,
 	NFSD_Threads,
 	NFSD_Leasetime,
+	NFSD_RecoveryDir,
 };
 
 /*
@@ -66,6 +67,7 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size);
 static ssize_t write_filehandle(struct file *file, char *buf, size_t size);
 static ssize_t write_threads(struct file *file, char *buf, size_t size);
 static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
+static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
 
 static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Svc] = write_svc,
@@ -78,6 +80,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = {
 	[NFSD_Fh] = write_filehandle,
 	[NFSD_Threads] = write_threads,
 	[NFSD_Leasetime] = write_leasetime,
+	[NFSD_RecoveryDir] = write_recoverydir,
 };
 
 static ssize_t nfsctl_transaction_write(struct file *file, const char __user *buf, size_t size, loff_t *pos)
@@ -349,6 +352,25 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size)
 	return strlen(buf);
 }
 
+static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
+{
+	char *mesg = buf;
+	char *recdir;
+	int len, status;
+
+	if (size > PATH_MAX || buf[size-1] != '\n')
+		return -EINVAL;
+	buf[size-1] = 0;
+
+	recdir = mesg;
+	len = qword_get(&mesg, recdir, size);
+	if (len <= 0)
+		return -EINVAL;
+
+	status = nfs4_reset_recoverydir(recdir);
+	return strlen(buf);
+}
+
 /*----------------------------------------------------------------------------*/
 /*
  *	populating the filesystem.
@@ -369,6 +391,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 		[NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
 #ifdef CONFIG_NFSD_V4
 		[NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
+		[NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR},
 #endif
 		/* last one */ {""}
 	};
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 21c6e9d86e4f..5791dfd30dd0 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -150,12 +150,14 @@ int nfs4_state_start(void);
 void nfs4_state_shutdown(void);
 time_t nfs4_lease_time(void);
 void nfs4_reset_lease(time_t leasetime);
+int nfs4_reset_recoverydir(char *recdir);
 #else
 static inline void nfs4_state_init(void){};
 static inline int nfs4_state_start(void){return 0;}
 static inline void nfs4_state_shutdown(void){}
 static inline time_t nfs4_lease_time(void){return 0;}
 static inline void nfs4_reset_lease(time_t leasetime){}
+static inline int nfs4_reset_recoverydir(char *recdir) {return 0;}
 #endif
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From c988d2b2845495373f666a381d354a7f80981d62 Mon Sep 17 00:00:00 2001
From: Matt Domsch <Matt_Domsch@dell.com>
Date: Thu, 23 Jun 2005 22:05:15 -0700
Subject: [PATCH] modules: add version and srcversion to sysfs

This patch adds version and srcversion files to
/sys/module/${modulename} containing the version and srcversion fields
of the module's modinfo section (if present).

/sys/module/e1000
|-- srcversion
`-- version

This patch differs slightly from the version posted in January, as it
now uses the new kstrdup() call in -mm.

Why put this in sysfs?

a) Tools like DKMS, which deal with changing out individual kernel
   modules without replacing the whole kernel, can behave smarter if they
   can tell the version of a given module.  The autoinstaller feature, for
   example, determines if your system has a "good" version of a driver
   (i.e.  if the one provided by DKMS has a newer version than that
   provided by the kernel package installed), and automatically compiles
   and installs a newer version if DKMS has it but your kernel doesn't yet
   have that version.

b) Because sysadmins manually, or with tools like DKMS, can switch out
   modules on the file system, you can't count on 'modinfo foo.ko', which
   looks at /lib/modules/${kernelver}/...  actually matching what is loaded
   into the kernel already.  Hence asking sysfs for this.

c) as the unbind-driver-from-device work takes shape, it will be
   possible to rebind a driver that's built-in (no .ko to modinfo for the
   version) to a newly loaded module.  sysfs will have the
   currently-built-in version info, for comparison.

d) tech support scripts can then easily grab the version info for what's
   running presently - a question I get often.

There has been renewed interest in this patch on linux-scsi by driver
authors.

As the idea originated from GregKH, I leave his Signed-off-by: intact,
though the implementation is nearly completely new.  Compiled and run on
x86 and x86_64.

From: Matthew Dobson <colpatch@us.ibm.com>

      build fix

From: Thierry Vignaud <tvignaud@mandriva.com>

      build fix

From: Matthew Dobson <colpatch@us.ibm.com>

      warning fix

Signed-off-by: Greg Kroah-Hartman <greg@kroah.com>
Signed-off-by: Matt Domsch <Matt_Domsch@dell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/module.h |  5 +++
 kernel/module.c        | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 0e432a0f4aee..f05372b7fe77 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -51,6 +51,9 @@ struct module_attribute {
         ssize_t (*show)(struct module_attribute *, struct module *, char *);
         ssize_t (*store)(struct module_attribute *, struct module *,
 			 const char *, size_t count);
+	void (*setup)(struct module *, const char *);
+	int (*test)(struct module *);
+	void (*free)(struct module *);
 };
 
 struct module_kobject
@@ -239,6 +242,8 @@ struct module
 	/* Sysfs stuff. */
 	struct module_kobject mkobj;
 	struct module_param_attrs *param_attrs;
+	const char *version;
+	const char *srcversion;
 
 	/* Exported symbols */
 	const struct kernel_symbol *syms;
diff --git a/kernel/module.c b/kernel/module.c
index a566745dde62..0494c89a0d26 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -35,6 +35,7 @@
 #include <linux/notifier.h>
 #include <linux/stop_machine.h>
 #include <linux/device.h>
+#include <linux/string.h>
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
 #include <asm/cacheflush.h>
@@ -370,6 +371,43 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_MODULE_UNLOAD
+#define MODINFO_ATTR(field)	\
+static void setup_modinfo_##field(struct module *mod, const char *s)  \
+{                                                                     \
+	mod->field = kstrdup(s, GFP_KERNEL);                          \
+}                                                                     \
+static ssize_t show_modinfo_##field(struct module_attribute *mattr,   \
+	                struct module *mod, char *buffer)             \
+{                                                                     \
+	return sprintf(buffer, "%s\n", mod->field);                   \
+}                                                                     \
+static int modinfo_##field##_exists(struct module *mod)               \
+{                                                                     \
+	return mod->field != NULL;                                    \
+}                                                                     \
+static void free_modinfo_##field(struct module *mod)                  \
+{                                                                     \
+        kfree(mod->field);                                            \
+        mod->field = NULL;                                            \
+}                                                                     \
+static struct module_attribute modinfo_##field = {                    \
+	.attr = { .name = __stringify(field), .mode = 0444,           \
+		  .owner = THIS_MODULE },                             \
+	.show = show_modinfo_##field,                                 \
+	.setup = setup_modinfo_##field,                               \
+	.test = modinfo_##field##_exists,                             \
+	.free = free_modinfo_##field,                                 \
+};
+
+MODINFO_ATTR(version);
+MODINFO_ATTR(srcversion);
+
+static struct module_attribute *modinfo_attrs[] = {
+	&modinfo_version,
+	&modinfo_srcversion,
+	NULL,
+};
+
 /* Init the unload section of the module. */
 static void module_unload_init(struct module *mod)
 {
@@ -1031,6 +1069,32 @@ static void module_remove_refcnt_attr(struct module *mod)
 }
 #endif
 
+#ifdef CONFIG_MODULE_UNLOAD
+static int module_add_modinfo_attrs(struct module *mod)
+{
+	struct module_attribute *attr;
+	int error = 0;
+	int i;
+
+	for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
+		if (!attr->test ||
+		    (attr->test && attr->test(mod)))
+			error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr);
+	}
+	return error;
+}
+
+static void module_remove_modinfo_attrs(struct module *mod)
+{
+	struct module_attribute *attr;
+	int i;
+
+	for (i = 0; (attr = modinfo_attrs[i]); i++) {
+		sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
+		attr->free(mod);
+	}
+}
+#endif
 
 static int mod_sysfs_setup(struct module *mod,
 			   struct kernel_param *kparam,
@@ -1056,6 +1120,12 @@ static int mod_sysfs_setup(struct module *mod,
 	if (err)
 		goto out_unreg;
 
+#ifdef CONFIG_MODULE_UNLOAD
+	err = module_add_modinfo_attrs(mod);
+	if (err)
+		goto out_unreg;
+#endif
+
 	return 0;
 
 out_unreg:
@@ -1066,6 +1136,9 @@ out:
 
 static void mod_kobject_remove(struct module *mod)
 {
+#ifdef CONFIG_MODULE_UNLOAD
+	module_remove_modinfo_attrs(mod);
+#endif
 	module_remove_refcnt_attr(mod);
 	module_param_sysfs_remove(mod);
 
@@ -1311,6 +1384,23 @@ static char *get_modinfo(Elf_Shdr *sechdrs,
 	return NULL;
 }
 
+#ifdef CONFIG_MODULE_UNLOAD
+static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
+			  unsigned int infoindex)
+{
+	struct module_attribute *attr;
+	int i;
+
+	for (i = 0; (attr = modinfo_attrs[i]); i++) {
+		if (attr->setup)
+			attr->setup(mod,
+				    get_modinfo(sechdrs,
+						infoindex,
+						attr->attr.name));
+	}
+}
+#endif
+
 #ifdef CONFIG_KALLSYMS
 int is_exported(const char *name, const struct module *mod)
 {
@@ -1615,6 +1705,11 @@ static struct module *load_module(void __user *umod,
 	/* Set up license info based on the info section */
 	set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
 
+#ifdef CONFIG_MODULE_UNLOAD
+	/* Set up MODINFO_ATTR fields */
+	setup_modinfo(mod, sechdrs, infoindex);
+#endif
+
 	/* Fix up syms, so that st_value is a pointer to location. */
 	err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
 			       mod);
-- 
cgit v1.2.3-59-g8ed1b


From 420edbcc09008342c7b2665453f6b370739aadb0 Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Thu, 23 Jun 2005 22:05:23 -0700
Subject: [PATCH] xip: bdev: execute in place

This is the block device related part.  The block device operation
direct_access now has a struct block_device as first parameter.

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/s390/block/dcssblk.c | 44 ++++++++++++++++++++++++++++++++++++++++----
 include/linux/fs.h           |  1 +
 2 files changed, 41 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 16ab8d363ac6..6bc27d52326f 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -35,14 +35,17 @@
 static int dcssblk_open(struct inode *inode, struct file *filp);
 static int dcssblk_release(struct inode *inode, struct file *filp);
 static int dcssblk_make_request(struct request_queue *q, struct bio *bio);
+static int dcssblk_direct_access(struct block_device *bdev, sector_t secnum,
+				 unsigned long *data);
 
 static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0";
 
 static int dcssblk_major;
 static struct block_device_operations dcssblk_devops = {
-	.owner   = THIS_MODULE,
-	.open    = dcssblk_open,
-	.release = dcssblk_release,
+	.owner   	= THIS_MODULE,
+	.open    	= dcssblk_open,
+	.release 	= dcssblk_release,
+	.direct_access 	= dcssblk_direct_access,
 };
 
 static ssize_t dcssblk_add_store(struct device * dev, struct device_attribute *attr, const char * buf,
@@ -641,6 +644,20 @@ dcssblk_make_request(request_queue_t *q, struct bio *bio)
 		/* Request beyond end of DCSS segment. */
 		goto fail;
 	}
+	/* verify data transfer direction */
+	if (dev_info->is_shared) {
+		switch (dev_info->segment_type) {
+		case SEG_TYPE_SR:
+		case SEG_TYPE_ER:
+		case SEG_TYPE_SC:
+			/* cannot write to these segments */
+			if (bio_data_dir(bio) == WRITE) {
+				PRINT_WARN("rejecting write to ro segment %s\n", dev_info->dev.bus_id);
+				goto fail;
+			}
+		}
+	}
+
 	index = (bio->bi_sector >> 3);
 	bio_for_each_segment(bvec, bio, i) {
 		page_addr = (unsigned long)
@@ -661,7 +678,26 @@ dcssblk_make_request(request_queue_t *q, struct bio *bio)
 	bio_endio(bio, bytes_done, 0);
 	return 0;
 fail:
-	bio_io_error(bio, bytes_done);
+	bio_io_error(bio, bio->bi_size);
+	return 0;
+}
+
+static int
+dcssblk_direct_access (struct block_device *bdev, sector_t secnum,
+			unsigned long *data)
+{
+	struct dcssblk_dev_info *dev_info;
+	unsigned long pgoff;
+
+	dev_info = bdev->bd_disk->private_data;
+	if (!dev_info)
+		return -ENODEV;
+	if (secnum % (PAGE_SIZE/512))
+		return -EINVAL;
+	pgoff = secnum / (PAGE_SIZE / 512);
+	if ((pgoff+1)*PAGE_SIZE-1 > dev_info->end - dev_info->start)
+		return -ERANGE;
+	*data = (unsigned long) (dev_info->start+pgoff*PAGE_SIZE);
 	return 0;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 83857d8070d3..929bf8d20c87 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -885,6 +885,7 @@ struct block_device_operations {
 	int (*ioctl) (struct inode *, struct file *, unsigned, unsigned long);
 	long (*unlocked_ioctl) (struct file *, unsigned, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned, unsigned long);
+	int (*direct_access) (struct block_device *, sector_t, unsigned long *);
 	int (*media_changed) (struct gendisk *);
 	int (*revalidate_disk) (struct gendisk *);
 	struct module *owner;
-- 
cgit v1.2.3-59-g8ed1b


From ceffc078528befc008c6f2c2c4decda79eabd534 Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Thu, 23 Jun 2005 22:05:25 -0700
Subject: [PATCH] xip: fs/mm: execute in place

- generic_file* file operations no longer have a xip/non-xip split
- filemap_xip.c implements a new set of fops that require the get_xip_page
  aop to work properly. All new fops are exported GPL-only (we don't want
  to see any code other than GPL modules use them)
- __xip_unmap now uses page_check_address, which is no longer static
  in rmap.c, and defined in linux/rmap.h
- mm/filemap.h is now much cleaner, plainly having just Linus'
  inline funcs moved here from filemap.c
- fix includes in filemap_xip to make it build cleanly on i386

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/open.c            |   4 +-
 include/linux/fs.h   |  18 ++
 include/linux/rmap.h |   6 +
 mm/Makefile          |   1 +
 mm/filemap.c         |  74 +------
 mm/filemap.h         |  94 +++++++++
 mm/filemap_xip.c     | 581 +++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/rmap.c            |   4 +-
 8 files changed, 707 insertions(+), 75 deletions(-)
 create mode 100644 mm/filemap.h
 create mode 100644 mm/filemap_xip.c

(limited to 'include/linux')

diff --git a/fs/open.c b/fs/open.c
index 8ec63f735918..3f4a4286fdc4 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -808,7 +808,9 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags)
 
 	/* NB: we're sure to have correct a_ops only after f_op->open */
 	if (f->f_flags & O_DIRECT) {
-		if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) {
+		if (!f->f_mapping->a_ops ||
+		    ((!f->f_mapping->a_ops->direct_IO) &&
+		    (!f->f_mapping->a_ops->get_xip_page))) {
 			fput(f);
 			f = ERR_PTR(-EINVAL);
 		}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 929bf8d20c87..79c0fafc0211 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -330,6 +330,8 @@ struct address_space_operations {
 	int (*releasepage) (struct page *, int);
 	ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs);
+	struct page* (*get_xip_page)(struct address_space *, sector_t,
+			int);
 };
 
 struct backing_dev_info;
@@ -1497,6 +1499,22 @@ extern loff_t remote_llseek(struct file *file, loff_t offset, int origin);
 extern int generic_file_open(struct inode * inode, struct file * filp);
 extern int nonseekable_open(struct inode * inode, struct file * filp);
 
+#ifdef CONFIG_FS_XIP
+extern ssize_t xip_file_aio_read(struct kiocb *iocb, char __user *buf,
+				 size_t count, loff_t pos);
+extern ssize_t xip_file_readv(struct file *filp, const struct iovec *iov,
+			      unsigned long nr_segs, loff_t *ppos);
+extern ssize_t xip_file_sendfile(struct file *in_file, loff_t *ppos,
+				 size_t count, read_actor_t actor,
+				 void *target);
+extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
+extern ssize_t xip_file_aio_write(struct kiocb *iocb, const char __user *buf,
+				  size_t count, loff_t pos);
+extern ssize_t xip_file_writev(struct file *file, const struct iovec *iov,
+			       unsigned long nr_segs, loff_t *ppos);
+extern int xip_truncate_page(struct address_space *mapping, loff_t from);
+#endif
+
 static inline void do_generic_file_read(struct file * filp, loff_t *ppos,
 					read_descriptor_t * desc,
 					read_actor_t actor)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 11b484e37ac9..e80fb7ee6efd 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -92,6 +92,12 @@ static inline void page_dup_rmap(struct page *page)
 int page_referenced(struct page *, int is_locked, int ignore_token);
 int try_to_unmap(struct page *);
 
+/*
+ * Called from mm/filemap_xip.c to unmap empty zero page
+ */
+pte_t *page_check_address(struct page *, struct mm_struct *, unsigned long);
+
+
 /*
  * Used by swapoff to help locate where page is expected in vma.
  */
diff --git a/mm/Makefile b/mm/Makefile
index 8f70ffd763c8..4cd69e3ce421 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -19,3 +19,4 @@ obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 
+obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/filemap.c b/mm/filemap.c
index a3598b542a31..7332194d7afd 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -28,6 +28,7 @@
 #include <linux/blkdev.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include "filemap.h"
 /*
  * FIXME: remove all knowledge of the buffer layer from the core VM
  */
@@ -1714,32 +1715,7 @@ int remove_suid(struct dentry *dentry)
 }
 EXPORT_SYMBOL(remove_suid);
 
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were sucessfully copied.  If a fault is encountered then clear the page
- * out to (offset+bytes) and return the number of bytes which were copied.
- */
-static inline size_t
-filemap_copy_from_user(struct page *page, unsigned long offset,
-			const char __user *buf, unsigned bytes)
-{
-	char *kaddr;
-	int left;
-
-	kaddr = kmap_atomic(page, KM_USER0);
-	left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
-	kunmap_atomic(kaddr, KM_USER0);
-
-	if (left != 0) {
-		/* Do it the slow way */
-		kaddr = kmap(page);
-		left = __copy_from_user(kaddr + offset, buf, bytes);
-		kunmap(page);
-	}
-	return bytes - left;
-}
-
-static size_t
+size_t
 __filemap_copy_from_user_iovec(char *vaddr, 
 			const struct iovec *iov, size_t base, size_t bytes)
 {
@@ -1766,52 +1742,6 @@ __filemap_copy_from_user_iovec(char *vaddr,
 	return copied - left;
 }
 
-/*
- * This has the same sideeffects and return value as filemap_copy_from_user().
- * The difference is that on a fault we need to memset the remainder of the
- * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
- * single-segment behaviour.
- */
-static inline size_t
-filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
-			const struct iovec *iov, size_t base, size_t bytes)
-{
-	char *kaddr;
-	size_t copied;
-
-	kaddr = kmap_atomic(page, KM_USER0);
-	copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
-						base, bytes);
-	kunmap_atomic(kaddr, KM_USER0);
-	if (copied != bytes) {
-		kaddr = kmap(page);
-		copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
-							base, bytes);
-		kunmap(page);
-	}
-	return copied;
-}
-
-static inline void
-filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
-{
-	const struct iovec *iov = *iovp;
-	size_t base = *basep;
-
-	while (bytes) {
-		int copy = min(bytes, iov->iov_len - base);
-
-		bytes -= copy;
-		base += copy;
-		if (iov->iov_len == base) {
-			iov++;
-			base = 0;
-		}
-	}
-	*iovp = iov;
-	*basep = base;
-}
-
 /*
  * Performs necessary checks before doing a write
  *
diff --git a/mm/filemap.h b/mm/filemap.h
new file mode 100644
index 000000000000..c2d0546a57eb
--- /dev/null
+++ b/mm/filemap.h
@@ -0,0 +1,94 @@
+/*
+ *	linux/mm/filemap.h
+ *
+ * Copyright (C) 1994-1999  Linus Torvalds
+ */
+
+#ifndef __FILEMAP_H
+#define __FILEMAP_H
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/uio.h>
+#include <linux/config.h>
+#include <asm/uaccess.h>
+
+extern size_t
+__filemap_copy_from_user_iovec(char *vaddr,
+			       const struct iovec *iov,
+			       size_t base,
+			       size_t bytes);
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were successfully copied.  If a fault is encountered then clear the page
+ * out to (offset+bytes) and return the number of bytes which were copied.
+ */
+static inline size_t
+filemap_copy_from_user(struct page *page, unsigned long offset,
+			const char __user *buf, unsigned bytes)
+{
+	char *kaddr;
+	int left;
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	if (left != 0) {
+		/* Do it the slow way */
+		kaddr = kmap(page);
+		left = __copy_from_user(kaddr + offset, buf, bytes);
+		kunmap(page);
+	}
+	return bytes - left;
+}
+
+/*
+ * This has the same side effects and return value as filemap_copy_from_user().
+ * The difference is that on a fault we need to memset the remainder of the
+ * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
+ * single-segment behaviour.
+ */
+static inline size_t
+filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
+			const struct iovec *iov, size_t base, size_t bytes)
+{
+	char *kaddr;
+	size_t copied;
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
+						base, bytes);
+	kunmap_atomic(kaddr, KM_USER0);
+	if (copied != bytes) {
+		kaddr = kmap(page);
+		copied = __filemap_copy_from_user_iovec(kaddr + offset, iov,
+							base, bytes);
+		kunmap(page);
+	}
+	return copied;
+}
+
+static inline void
+filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
+{
+	const struct iovec *iov = *iovp;
+	size_t base = *basep;
+
+	while (bytes) {
+		int copy = min(bytes, iov->iov_len - base);
+
+		bytes -= copy;
+		base += copy;
+		if (iov->iov_len == base) {
+			iov++;
+			base = 0;
+		}
+	}
+	*iovp = iov;
+	*basep = base;
+}
+#endif
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
new file mode 100644
index 000000000000..7d63acd48817
--- /dev/null
+++ b/mm/filemap_xip.c
@@ -0,0 +1,581 @@
+/*
+ *	linux/mm/filemap_xip.c
+ *
+ * Copyright (C) 2005 IBM Corporation
+ * Author: Carsten Otte <cotte@de.ibm.com>
+ *
+ * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/uio.h>
+#include <linux/rmap.h>
+#include <asm/tlbflush.h>
+#include "filemap.h"
+
+/*
+ * This is a file read routine for execute in place files, and uses
+ * the mapping->a_ops->get_xip_page() function for the actual low-level
+ * stuff.
+ *
+ * Note the struct file* is only passed to file_accessed().  It may be NULL.
+ */
+static void
+do_xip_mapping_read(struct address_space *mapping,
+		    struct file_ra_state *_ra,
+		    struct file *filp,
+		    loff_t *ppos,
+		    read_descriptor_t *desc,
+		    read_actor_t actor)
+{
+	struct inode *inode = mapping->host;
+	unsigned long index, end_index, offset;
+	loff_t isize;
+
+	BUG_ON(!mapping->a_ops->get_xip_page);
+
+	index = *ppos >> PAGE_CACHE_SHIFT;
+	offset = *ppos & ~PAGE_CACHE_MASK;
+
+	isize = i_size_read(inode);
+	if (!isize)
+		goto out;
+
+	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+	for (;;) {
+		struct page *page;
+		unsigned long nr, ret;
+
+		/* nr is the maximum number of bytes to copy from this page */
+		nr = PAGE_CACHE_SIZE;
+		if (index >= end_index) {
+			if (index > end_index)
+				goto out;
+			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+			if (nr <= offset) {
+				goto out;
+			}
+		}
+		nr = nr - offset;
+
+		page = mapping->a_ops->get_xip_page(mapping,
+			index*(PAGE_SIZE/512), 0);
+		if (!page)
+			goto no_xip_page;
+		if (unlikely(IS_ERR(page))) {
+			if (PTR_ERR(page) == -ENODATA) {
+				/* sparse */
+				page = virt_to_page(empty_zero_page);
+			} else {
+				desc->error = PTR_ERR(page);
+				goto out;
+			}
+		} else
+			BUG_ON(!PageUptodate(page));
+
+		/* If users can be writing to this page using arbitrary
+		 * virtual addresses, take care about potential aliasing
+		 * before reading the page on the kernel side.
+		 */
+		if (mapping_writably_mapped(mapping))
+			flush_dcache_page(page);
+
+		/*
+		 * Ok, we have the page, and it's up-to-date, so
+		 * now we can copy it to user space...
+		 *
+		 * The actor routine returns how many bytes were actually used..
+		 * NOTE! This may not be the same as how much of a user buffer
+		 * we filled up (we may be padding etc), so we can only update
+		 * "pos" here (the actor routine has to update the user buffer
+		 * pointers and the remaining count).
+		 */
+		ret = actor(desc, page, offset, nr);
+		offset += ret;
+		index += offset >> PAGE_CACHE_SHIFT;
+		offset &= ~PAGE_CACHE_MASK;
+
+		if (ret == nr && desc->count)
+			continue;
+		goto out;
+
+no_xip_page:
+		/* Did not get the page. Report it */
+		desc->error = -EIO;
+		goto out;
+	}
+
+out:
+	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
+	if (filp)
+		file_accessed(filp);
+}
+
+/*
+ * This is the "read()" routine for all filesystems
+ * that use the get_xip_page address space operation.
+ */
+static ssize_t
+__xip_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t *ppos)
+{
+	struct file *filp = iocb->ki_filp;
+	ssize_t retval;
+	unsigned long seg;
+	size_t count;
+
+	count = 0;
+	for (seg = 0; seg < nr_segs; seg++) {
+		const struct iovec *iv = &iov[seg];
+
+		/*
+		 * If any segment has a negative length, or the cumulative
+		 * length ever wraps negative then return -EINVAL.
+		 */
+		count += iv->iov_len;
+		if (unlikely((ssize_t)(count|iv->iov_len) < 0))
+			return -EINVAL;
+		if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
+			continue;
+		if (seg == 0)
+			return -EFAULT;
+		nr_segs = seg;
+		count -= iv->iov_len;	/* This segment is no good */
+		break;
+	}
+
+	retval = 0;
+	if (count) {
+		for (seg = 0; seg < nr_segs; seg++) {
+			read_descriptor_t desc;
+
+			desc.written = 0;
+			desc.arg.buf = iov[seg].iov_base;
+			desc.count = iov[seg].iov_len;
+			if (desc.count == 0)
+				continue;
+			desc.error = 0;
+			do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
+					    ppos, &desc, file_read_actor);
+			retval += desc.written;
+			if (!retval) {
+				retval = desc.error;
+				break;
+			}
+		}
+	}
+	return retval;
+}
+
+ssize_t
+xip_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count,
+		  loff_t pos)
+{
+	struct iovec local_iov = { .iov_base = buf, .iov_len = count };
+
+	BUG_ON(iocb->ki_pos != pos);
+	return __xip_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
+}
+EXPORT_SYMBOL_GPL(xip_file_aio_read);
+
+ssize_t
+xip_file_readv(struct file *filp, const struct iovec *iov,
+	       unsigned long nr_segs, loff_t *ppos)
+{
+	struct kiocb kiocb;
+
+	init_sync_kiocb(&kiocb, filp);
+	return __xip_file_aio_read(&kiocb, iov, nr_segs, ppos);
+}
+EXPORT_SYMBOL_GPL(xip_file_readv);
+
+ssize_t
+xip_file_sendfile(struct file *in_file, loff_t *ppos,
+	     size_t count, read_actor_t actor, void *target)
+{
+	read_descriptor_t desc;
+
+	if (!count)
+		return 0;
+
+	desc.written = 0;
+	desc.count = count;
+	desc.arg.data = target;
+	desc.error = 0;
+
+	do_xip_mapping_read(in_file->f_mapping, &in_file->f_ra, in_file,
+			    ppos, &desc, actor);
+	if (desc.written)
+		return desc.written;
+	return desc.error;
+}
+EXPORT_SYMBOL_GPL(xip_file_sendfile);
+
+/*
+ * __xip_unmap is invoked from xip_file_nopage and
+ * do_xip_file_write
+ *
+ * This function walks all vmas of the address_space and unmaps the
+ * empty_zero_page when found at pgoff. Should it go in rmap.c?
+ */
+static void
+__xip_unmap (struct address_space * mapping,
+		     unsigned long pgoff)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct prio_tree_iter iter;
+	unsigned long address;
+	pte_t *pte;
+	pte_t pteval;
+
+	spin_lock(&mapping->i_mmap_lock);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		mm = vma->vm_mm;
+		address = vma->vm_start +
+			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+		/*
+		 * On success page_check_address returns with the pte mapped
+		 * and mm->page_table_lock held; both are released below.
+		 */
+		pte = page_check_address(virt_to_page(empty_zero_page), mm,
+					 address);
+		if (!IS_ERR(pte)) {
+			/* Nuke the page table entry. */
+			flush_cache_page(vma, address, pte_pfn(*pte));
+			pteval = ptep_clear_flush(vma, address, pte);
+			BUG_ON(pte_dirty(pteval));
+			pte_unmap(pte);
+			spin_unlock(&mm->page_table_lock);
+		}
+	}
+	spin_unlock(&mapping->i_mmap_lock);
+}
+
+/*
+ * xip_file_nopage() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
+ *
+ * This function is derived from filemap_nopage, but used for execute in place
+ */
+static struct page *
+xip_file_nopage(struct vm_area_struct * area,
+		   unsigned long address,
+		   int *type)
+{
+	struct file *file = area->vm_file;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	struct page *page;
+	unsigned long size, pgoff, endoff;
+
+	pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
+		+ area->vm_pgoff;
+	endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT)
+		+ area->vm_pgoff;
+
+	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (pgoff >= size) {
+		return NULL;
+	}
+
+	page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
+	if (!IS_ERR(page)) {
+		BUG_ON(!PageUptodate(page));
+		return page;
+	}
+	if (PTR_ERR(page) != -ENODATA)
+		return NULL;
+
+	/* sparse block */
+	if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
+	    (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) &&
+	    (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
+		/* maybe shared writable, allocate new block */
+		page = mapping->a_ops->get_xip_page (mapping,
+			pgoff*(PAGE_SIZE/512), 1);
+		if (IS_ERR(page))
+			return NULL;
+		BUG_ON(!PageUptodate(page));
+		/* unmap page at pgoff from all other vmas */
+		__xip_unmap(mapping, pgoff);
+	} else {
+		/* not shared and writable, use empty_zero_page */
+		page = virt_to_page(empty_zero_page);
+	}
+
+	return page;
+}
+
+static struct vm_operations_struct xip_file_vm_ops = {
+	.nopage         = xip_file_nopage,
+};
+
+int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
+{
+	BUG_ON(!file->f_mapping->a_ops->get_xip_page);
+
+	file_accessed(file);
+	vma->vm_ops = &xip_file_vm_ops;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xip_file_mmap);
+
+static ssize_t
+do_xip_file_write(struct kiocb *iocb, const struct iovec *iov,
+		  unsigned long nr_segs, loff_t pos, loff_t *ppos,
+		  size_t count)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space * mapping = file->f_mapping;
+	struct address_space_operations *a_ops = mapping->a_ops;
+	struct inode 	*inode = mapping->host;
+	long		status = 0;
+	struct page	*page;
+	size_t		bytes;
+	const struct iovec *cur_iov = iov; /* current iovec */
+	size_t		iov_base = 0;	   /* offset in the current iovec */
+	char __user	*buf;
+	ssize_t		written = 0;
+
+	BUG_ON(!mapping->a_ops->get_xip_page);
+
+	buf = iov->iov_base;
+	do {
+		unsigned long index;
+		unsigned long offset;
+		size_t copied;
+
+		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+		index = pos >> PAGE_CACHE_SHIFT;
+		bytes = PAGE_CACHE_SIZE - offset;
+		if (bytes > count)
+			bytes = count;
+
+		/*
+		 * Bring in the user page that we will copy from _first_.
+		 * Otherwise there's a nasty deadlock on copying from the
+		 * same page as we're writing to, without it being marked
+		 * up-to-date.
+		 */
+		fault_in_pages_readable(buf, bytes);
+
+		page = a_ops->get_xip_page(mapping,
+						    index*(PAGE_SIZE/512), 0);
+		if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) {
+			/* sparse block: ask the filesystem to allocate it */
+			page = a_ops->get_xip_page(mapping,
+				index*(PAGE_SIZE/512), 1);
+			/* unmap the zero page at index from all vmas */
+			if (!IS_ERR(page))
+				__xip_unmap(mapping, index);
+
+		}
+
+		if (IS_ERR(page)) {
+			status = PTR_ERR(page);
+			break;
+		}
+
+		BUG_ON(!PageUptodate(page));
+
+		if (likely(nr_segs == 1))
+			copied = filemap_copy_from_user(page, offset,
+							buf, bytes);
+		else
+			copied = filemap_copy_from_user_iovec(page, offset,
+						cur_iov, iov_base, bytes);
+		flush_dcache_page(page);
+		if (likely(copied > 0)) {
+			status = copied;
+
+			if (status >= 0) {
+				written += status;
+				count -= status;
+				pos += status;
+				buf += status;
+				if (unlikely(nr_segs > 1))
+					filemap_set_next_iovec(&cur_iov,
+							&iov_base, status);
+			}
+		}
+		if (unlikely(copied != bytes))
+			if (status >= 0)
+				status = -EFAULT;
+		if (status < 0)
+			break;
+	} while (count);
+	*ppos = pos;
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold i_sem.
+	 */
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		mark_inode_dirty(inode);
+	}
+
+	return written ? written : status;
+}
+
+static ssize_t
+xip_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
+				unsigned long nr_segs, loff_t *ppos)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space * mapping = file->f_mapping;
+	size_t ocount;		/* original count */
+	size_t count;		/* after file limit checks */
+	struct inode 	*inode = mapping->host;
+	unsigned long	seg;
+	loff_t		pos;
+	ssize_t		written;
+	ssize_t		err;
+
+	ocount = 0;
+	for (seg = 0; seg < nr_segs; seg++) {
+		const struct iovec *iv = &iov[seg];
+
+		/*
+		 * If any segment has a negative length, or the cumulative
+		 * length ever wraps negative then return -EINVAL.
+		 */
+		ocount += iv->iov_len;
+		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
+			return -EINVAL;
+		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
+			continue;
+		if (seg == 0)
+			return -EFAULT;
+		nr_segs = seg;
+		ocount -= iv->iov_len;	/* This segment is no good */
+		break;
+	}
+
+	count = ocount;
+	pos = *ppos;
+
+	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+	written = 0;
+
+	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+	if (err)
+		goto out;
+
+	if (count == 0)
+		goto out;
+
+	err = remove_suid(file->f_dentry);
+	if (err)
+		goto out;
+
+	inode_update_time(inode, 1);
+
+	/* use execute in place to copy directly to disk */
+	written = do_xip_file_write (iocb, iov,
+				  nr_segs, pos, ppos, count);
+ out:
+	return written ? written : err;
+}
+
+static ssize_t
+__xip_file_write_nolock(struct file *file, const struct iovec *iov,
+			unsigned long nr_segs, loff_t *ppos)
+{
+	struct kiocb kiocb;
+
+	init_sync_kiocb(&kiocb, file);
+	return xip_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
+}
+
+ssize_t
+xip_file_aio_write(struct kiocb *iocb, const char __user *buf,
+		       size_t count, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
+	struct iovec local_iov = { .iov_base = (void __user *)buf,
+				   .iov_len = count };
+
+	BUG_ON(iocb->ki_pos != pos);
+
+	down(&inode->i_sem);
+	ret = xip_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
+	up(&inode->i_sem);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(xip_file_aio_write);
+
+ssize_t xip_file_writev(struct file *file, const struct iovec *iov,
+			unsigned long nr_segs, loff_t *ppos)
+{
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
+
+	down(&inode->i_sem);
+	ret = __xip_file_write_nolock(file, iov, nr_segs, ppos);
+	up(&inode->i_sem);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(xip_file_writev);
+
+/*
+ * truncate a page used for execute in place
+ * functionality is analogous to block_truncate_page but uses get_xip_page
+ * to get the page instead of the page cache
+ */
+int
+xip_truncate_page(struct address_space *mapping, loff_t from)
+{
+	pgoff_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned blocksize;
+	unsigned length;
+	struct page *page;
+	void *kaddr;
+	int err;
+
+	BUG_ON(!mapping->a_ops->get_xip_page);
+
+	blocksize = 1 << mapping->host->i_blkbits;
+	length = offset & (blocksize - 1);
+
+	/* Block boundary? Nothing to do */
+	if (!length)
+		return 0;
+
+	length = blocksize - length;
+
+	page = mapping->a_ops->get_xip_page(mapping,
+					    index*(PAGE_SIZE/512), 0);
+	err = -ENOMEM;
+	if (!page)
+		goto out;
+	if (unlikely(IS_ERR(page))) {
+		if (PTR_ERR(page) == -ENODATA) {
+			/* Hole? No need to truncate */
+			return 0;
+		} else {
+			err = PTR_ERR(page);
+			goto out;
+		}
+	} else
+		BUG_ON(!PageUptodate(page));
+	kaddr = kmap_atomic(page, KM_USER0);
+	memset(kaddr + offset, 0, length);
+	kunmap_atomic(kaddr, KM_USER0);
+
+	flush_dcache_page(page);
+	err = 0;
+out:
+	return err;
+}
+EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/rmap.c b/mm/rmap.c
index 89770bd25f31..08ac5c7fa91f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -247,8 +247,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
  *
  * On success returns with mapped pte and locked mm->page_table_lock.
  */
-static pte_t *page_check_address(struct page *page, struct mm_struct *mm,
-					unsigned long address)
+pte_t *page_check_address(struct page *page, struct mm_struct *mm,
+			  unsigned long address)
 {
 	pgd_t *pgd;
 	pud_t *pud;
-- 
cgit v1.2.3-59-g8ed1b


From 6d79125bba55ee82701f1c7d4ebbc1aa20ecbe4e Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Thu, 23 Jun 2005 22:05:26 -0700
Subject: [PATCH] xip: ext2: execute in place

These are the ext2 related parts.  Ext2 now uses the xip_* file operations
along with the get_xip_page aop when mounted with -o xip.

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/Kconfig              | 17 +++++++++++
 fs/ext2/Makefile        |  1 +
 fs/ext2/ext2.h          |  2 ++
 fs/ext2/file.c          | 18 +++++++++++
 fs/ext2/inode.c         | 31 ++++++++++++++++---
 fs/ext2/namei.c         | 12 ++++++--
 fs/ext2/super.c         | 27 ++++++++++++++++-
 fs/ext2/xip.c           | 80 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext2/xip.h           | 25 ++++++++++++++++
 include/linux/ext2_fs.h | 25 ++++++++--------
 include/linux/fs.h      |  5 ++++
 11 files changed, 223 insertions(+), 20 deletions(-)
 create mode 100644 fs/ext2/xip.c
 create mode 100644 fs/ext2/xip.h

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index 5c704d05627a..8157f2e2d515 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -50,6 +50,23 @@ config EXT2_FS_SECURITY
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
 
+config EXT2_FS_XIP
+	bool "Ext2 execute in place support"
+	depends on EXT2_FS
+	help
+	  Execute in place can be used on memory-backed block devices. If you
+	  enable this option, you can select to mount block devices which are
+	  capable of this feature without using the page cache.
+
+	  If you do not use a block device that is capable of using this,
+	  or if unsure, say N.
+
+config FS_XIP
+# execute in place
+	bool
+	depends on EXT2_FS_XIP
+	default y
+
 config EXT3_FS
 	tristate "Ext3 journalling file system support"
 	help
diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index ee240a14e70f..c5d02da73bc3 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -10,3 +10,4 @@ ext2-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
 ext2-$(CONFIG_EXT2_FS_XATTR)	 += xattr.o xattr_user.o xattr_trusted.o
 ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o
 ext2-$(CONFIG_EXT2_FS_SECURITY)	 += xattr_security.o
+ext2-$(CONFIG_EXT2_FS_XIP)	 += xip.o
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 8f0fd726c3f1..eed521d22cf0 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -147,9 +147,11 @@ extern struct file_operations ext2_dir_operations;
 /* file.c */
 extern struct inode_operations ext2_file_inode_operations;
 extern struct file_operations ext2_file_operations;
+extern struct file_operations ext2_xip_file_operations;
 
 /* inode.c */
 extern struct address_space_operations ext2_aops;
+extern struct address_space_operations ext2_aops_xip;
 extern struct address_space_operations ext2_nobh_aops;
 
 /* namei.c */
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index f5e86141ec54..2b3d572365af 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -55,6 +55,24 @@ struct file_operations ext2_file_operations = {
 	.sendfile	= generic_file_sendfile,
 };
 
+#ifdef CONFIG_EXT2_FS_XIP
+struct file_operations ext2_xip_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.aio_read	= xip_file_aio_read,
+	.aio_write	= xip_file_aio_write,
+	.ioctl		= ext2_ioctl,
+	.mmap		= xip_file_mmap,
+	.open		= generic_file_open,
+	.release	= ext2_release_file,
+	.fsync		= ext2_sync_file,
+	.readv		= xip_file_readv,
+	.writev		= xip_file_writev,
+	.sendfile	= xip_file_sendfile,
+};
+#endif
+
 struct inode_operations ext2_file_inode_operations = {
 	.truncate	= ext2_truncate,
 #ifdef CONFIG_EXT2_FS_XATTR
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index a50d9db4b6e4..53dceb0c6593 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -33,6 +33,7 @@
 #include <linux/mpage.h>
 #include "ext2.h"
 #include "acl.h"
+#include "xip.h"
 
 MODULE_AUTHOR("Remy Card and others");
 MODULE_DESCRIPTION("Second Extended Filesystem");
@@ -594,6 +595,16 @@ out:
 	if (err)
 		goto cleanup;
 
+	if (ext2_use_xip(inode->i_sb)) {
+		/*
+		 * we need to clear the block
+		 */
+		err = ext2_clear_xip_target (inode,
+			le32_to_cpu(chain[depth-1].key));
+		if (err)
+			goto cleanup;
+	}
+
 	if (ext2_splice_branch(inode, iblock, chain, partial, left) < 0)
 		goto changed;
 
@@ -691,6 +702,11 @@ struct address_space_operations ext2_aops = {
 	.writepages		= ext2_writepages,
 };
 
+struct address_space_operations ext2_aops_xip = {
+	.bmap			= ext2_bmap,
+	.get_xip_page		= ext2_get_xip_page,
+};
+
 struct address_space_operations ext2_nobh_aops = {
 	.readpage		= ext2_readpage,
 	.readpages		= ext2_readpages,
@@ -910,7 +926,9 @@ void ext2_truncate (struct inode * inode)
 	iblock = (inode->i_size + blocksize-1)
 					>> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
 
-	if (test_opt(inode->i_sb, NOBH))
+	if (mapping_is_xip(inode->i_mapping))
+		xip_truncate_page(inode->i_mapping, inode->i_size);
+	else if (test_opt(inode->i_sb, NOBH))
 		nobh_truncate_page(inode->i_mapping, inode->i_size);
 	else
 		block_truncate_page(inode->i_mapping,
@@ -1110,11 +1128,16 @@ void ext2_read_inode (struct inode * inode)
 
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &ext2_file_inode_operations;
-		inode->i_fop = &ext2_file_operations;
-		if (test_opt(inode->i_sb, NOBH))
+		if (ext2_use_xip(inode->i_sb)) {
+			inode->i_mapping->a_ops = &ext2_aops_xip;
+			inode->i_fop = &ext2_xip_file_operations;
+		} else if (test_opt(inode->i_sb, NOBH)) {
 			inode->i_mapping->a_ops = &ext2_nobh_aops;
-		else
+			inode->i_fop = &ext2_file_operations;
+		} else {
 			inode->i_mapping->a_ops = &ext2_aops;
+			inode->i_fop = &ext2_file_operations;
+		}
 	} else if (S_ISDIR(inode->i_mode)) {
 		inode->i_op = &ext2_dir_inode_operations;
 		inode->i_fop = &ext2_dir_operations;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 3176b3d3ffa8..c5513953c825 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -34,6 +34,7 @@
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
+#include "xip.h"
 
 /*
  * Couple of helper functions - make the code slightly cleaner.
@@ -127,11 +128,16 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, int mode, st
 	int err = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		inode->i_op = &ext2_file_inode_operations;
-		inode->i_fop = &ext2_file_operations;
-		if (test_opt(inode->i_sb, NOBH))
+		if (ext2_use_xip(inode->i_sb)) {
+			inode->i_mapping->a_ops = &ext2_aops_xip;
+			inode->i_fop = &ext2_xip_file_operations;
+		} else if (test_opt(inode->i_sb, NOBH)) {
 			inode->i_mapping->a_ops = &ext2_nobh_aops;
-		else
+			inode->i_fop = &ext2_file_operations;
+		} else {
 			inode->i_mapping->a_ops = &ext2_aops;
+			inode->i_fop = &ext2_file_operations;
+		}
 		mark_inode_dirty(inode);
 		err = ext2_add_nondir(dentry, inode);
 	}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 661c3d98d946..876e391f2871 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -31,6 +31,7 @@
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
+#include "xip.h"
 
 static void ext2_sync_super(struct super_block *sb,
 			    struct ext2_super_block *es);
@@ -257,7 +258,7 @@ enum {
 	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
 	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
 	Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_nobh,
-	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
+	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_xip,
 	Opt_ignore, Opt_err,
 };
 
@@ -286,6 +287,7 @@ static match_table_t tokens = {
 	{Opt_nouser_xattr, "nouser_xattr"},
 	{Opt_acl, "acl"},
 	{Opt_noacl, "noacl"},
+	{Opt_xip, "xip"},
 	{Opt_ignore, "grpquota"},
 	{Opt_ignore, "noquota"},
 	{Opt_ignore, "quota"},
@@ -397,6 +399,13 @@ static int parse_options (char * options,
 			printk("EXT2 (no)acl options not supported\n");
 			break;
 #endif
+		case Opt_xip:
+#ifdef CONFIG_EXT2_FS_XIP
+			set_opt (sbi->s_mount_opt, XIP);
+#else
+			printk("EXT2 xip option not supported\n");
+#endif
+			break;
 		case Opt_ignore:
 			break;
 		default:
@@ -640,6 +649,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
 		 MS_POSIXACL : 0);
 
+	ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset
+				    EXT2_MOUNT_XIP if not */
+
 	if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
 	    (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
 	     EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
@@ -668,6 +680,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 
 	blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
 
+	if ((ext2_use_xip(sb)) && ((blocksize != PAGE_SIZE) ||
+				  (sb->s_blocksize != blocksize))) {
+		if (!silent)
+			printk("XIP: Unsupported blocksize\n");
+		goto failed_mount;
+	}
+
 	/* If the blocksize doesn't match, re-read the thing.. */
 	if (sb->s_blocksize != blocksize) {
 		brelse(bh);
@@ -916,6 +935,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 {
 	struct ext2_sb_info * sbi = EXT2_SB(sb);
 	struct ext2_super_block * es;
+	unsigned long old_mount_opt = sbi->s_mount_opt;
 
 	/*
 	 * Allow the "check" option to be passed as a remount option.
@@ -927,6 +947,11 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 		((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
 
 	es = sbi->s_es;
+	if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
+	    (old_mount_opt & EXT2_MOUNT_XIP)) &&
+	    invalidate_inodes(sb))
+		ext2_warning(sb, __FUNCTION__, "busy inodes while remounting "\
+			     "xip remain in cache (no functional problem)");
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		return 0;
 	if (*flags & MS_RDONLY) {
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
new file mode 100644
index 000000000000..d44431d1a338
--- /dev/null
+++ b/fs/ext2/xip.c
@@ -0,0 +1,80 @@
+/*
+ *  linux/fs/ext2/xip.c
+ *
+ * Copyright (C) 2005 IBM Corporation
+ * Author: Carsten Otte (cotte@de.ibm.com)
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/buffer_head.h>
+#include <linux/ext2_fs_sb.h>
+#include <linux/ext2_fs.h>
+#include "ext2.h"
+#include "xip.h"
+
+static inline int
+__inode_direct_access(struct inode *inode, sector_t sector, unsigned long *data) {
+	BUG_ON(!inode->i_sb->s_bdev->bd_disk->fops->direct_access);
+	return inode->i_sb->s_bdev->bd_disk->fops
+		->direct_access(inode->i_sb->s_bdev,sector,data);
+}
+
+int
+ext2_clear_xip_target(struct inode *inode, int block) {
+	sector_t sector = block*(PAGE_SIZE/512);
+	unsigned long data;
+	int rc;
+
+	rc = __inode_direct_access(inode, sector, &data);
+	if (rc)
+		return rc;
+	clear_page((void*)data);
+	return 0;
+}
+
+void ext2_xip_verify_sb(struct super_block *sb)
+{
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+
+	if ((sbi->s_mount_opt & EXT2_MOUNT_XIP)) {
+		if ((sb->s_bdev == NULL) ||
+			sb->s_bdev->bd_disk == NULL ||
+			sb->s_bdev->bd_disk->fops == NULL ||
+			sb->s_bdev->bd_disk->fops->direct_access == NULL) {
+			sbi->s_mount_opt &= (~EXT2_MOUNT_XIP);
+			ext2_warning(sb, __FUNCTION__,
+				"ignoring xip option - not supported by bdev");
+		}
+	}
+}
+
+struct page*
+ext2_get_xip_page(struct address_space *mapping, sector_t blockno,
+		   int create)
+{
+	int rc;
+	unsigned long data;
+	struct buffer_head tmp;
+
+	tmp.b_state = 0;
+	tmp.b_blocknr = 0;
+	rc = ext2_get_block(mapping->host, blockno/(PAGE_SIZE/512) , &tmp,
+				create);
+	if (rc)
+		return ERR_PTR(rc);
+	if (tmp.b_blocknr == 0) {
+		/* SPARSE block */
+		BUG_ON(create);
+		return ERR_PTR(-ENODATA);
+	}
+
+	rc = __inode_direct_access
+		(mapping->host,tmp.b_blocknr*(PAGE_SIZE/512) ,&data);
+	if (rc)
+		return ERR_PTR(rc);
+
+	SetPageUptodate(virt_to_page(data));
+	return virt_to_page(data);
+}
diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h
new file mode 100644
index 000000000000..aa85331d6c56
--- /dev/null
+++ b/fs/ext2/xip.h
@@ -0,0 +1,25 @@
+/*
+ *  linux/fs/ext2/xip.h
+ *
+ * Copyright (C) 2005 IBM Corporation
+ * Author: Carsten Otte (cotte@de.ibm.com)
+ */
+
+#ifdef CONFIG_EXT2_FS_XIP
+extern void ext2_xip_verify_sb (struct super_block *);
+extern int ext2_clear_xip_target (struct inode *, int);
+
+static inline int ext2_use_xip (struct super_block *sb)
+{
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+	return (sbi->s_mount_opt & EXT2_MOUNT_XIP);
+}
+struct page* ext2_get_xip_page (struct address_space *, sector_t, int);
+#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_page)
+#else
+#define mapping_is_xip(map)			0
+#define ext2_xip_verify_sb(sb)			do { } while (0)
+#define ext2_use_xip(sb)			0
+#define ext2_clear_xip_target(inode, chain)	0
+#define ext2_get_xip_page			NULL
+#endif
diff --git a/include/linux/ext2_fs.h b/include/linux/ext2_fs.h
index fab43527e597..a657130ba03a 100644
--- a/include/linux/ext2_fs.h
+++ b/include/linux/ext2_fs.h
@@ -300,18 +300,19 @@ struct ext2_inode {
 /*
  * Mount flags
  */
-#define EXT2_MOUNT_CHECK		0x0001	/* Do mount-time checks */
-#define EXT2_MOUNT_OLDALLOC		0x0002  /* Don't use the new Orlov allocator */
-#define EXT2_MOUNT_GRPID		0x0004	/* Create files with directory's group */
-#define EXT2_MOUNT_DEBUG		0x0008	/* Some debugging messages */
-#define EXT2_MOUNT_ERRORS_CONT		0x0010	/* Continue on errors */
-#define EXT2_MOUNT_ERRORS_RO		0x0020	/* Remount fs ro on errors */
-#define EXT2_MOUNT_ERRORS_PANIC		0x0040	/* Panic on errors */
-#define EXT2_MOUNT_MINIX_DF		0x0080	/* Mimics the Minix statfs */
-#define EXT2_MOUNT_NOBH			0x0100	/* No buffer_heads */
-#define EXT2_MOUNT_NO_UID32		0x0200  /* Disable 32-bit UIDs */
-#define EXT2_MOUNT_XATTR_USER		0x4000	/* Extended user attributes */
-#define EXT2_MOUNT_POSIX_ACL		0x8000	/* POSIX Access Control Lists */
+#define EXT2_MOUNT_CHECK		0x000001  /* Do mount-time checks */
+#define EXT2_MOUNT_OLDALLOC		0x000002  /* Don't use the new Orlov allocator */
+#define EXT2_MOUNT_GRPID		0x000004  /* Create files with directory's group */
+#define EXT2_MOUNT_DEBUG		0x000008  /* Some debugging messages */
+#define EXT2_MOUNT_ERRORS_CONT		0x000010  /* Continue on errors */
+#define EXT2_MOUNT_ERRORS_RO		0x000020  /* Remount fs ro on errors */
+#define EXT2_MOUNT_ERRORS_PANIC		0x000040  /* Panic on errors */
+#define EXT2_MOUNT_MINIX_DF		0x000080  /* Mimics the Minix statfs */
+#define EXT2_MOUNT_NOBH			0x000100  /* No buffer_heads */
+#define EXT2_MOUNT_NO_UID32		0x000200  /* Disable 32-bit UIDs */
+#define EXT2_MOUNT_XATTR_USER		0x004000  /* Extended user attributes */
+#define EXT2_MOUNT_POSIX_ACL		0x008000  /* POSIX Access Control Lists */
+#define EXT2_MOUNT_XIP			0x010000  /* Execute in place */
 
 #define clear_opt(o, opt)		o &= ~EXT2_MOUNT_##opt
 #define set_opt(o, opt)			o |= EXT2_MOUNT_##opt
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 79c0fafc0211..7e0501895f35 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1513,6 +1513,11 @@ extern ssize_t xip_file_aio_write(struct kiocb *iocb, const char __user *buf,
 extern ssize_t xip_file_writev(struct file *file, const struct iovec *iov,
 			       unsigned long nr_segs, loff_t *ppos);
 extern int xip_truncate_page(struct address_space *mapping, loff_t from);
+#else
+static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
+{
+	return 0;
+}
 #endif
 
 static inline void do_generic_file_read(struct file * filp, loff_t *ppos,
-- 
cgit v1.2.3-59-g8ed1b


From eb6fe0c388e43b02e261f0fdee60e42f6298d7f7 Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Thu, 23 Jun 2005 22:05:28 -0700
Subject: [PATCH] xip: reduce code duplication

This patch reworks filemap_xip.c with the goal to reduce code duplication
from mm/filemap.c.  It applies against 2.6.12-rc6-mm1.  Instead of
implementing the aio functions, this one implements the synchronous
read/write functions only.  For readv and writev, the generic fallback is
used.  For aio, we rely on the application doing the fallback.  Since our
"synchronous" function does memcpy immediately anyway, there is no
performance difference between using the fallbacks or implementing each
operation.

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext2/file.c     |   8 +-
 include/linux/fs.h |  12 +--
 mm/filemap.h       |   2 +-
 mm/filemap_xip.c   | 246 ++++++++++++-----------------------------------------
 4 files changed, 63 insertions(+), 205 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 2b3d572365af..a484412fc782 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -58,17 +58,13 @@ struct file_operations ext2_file_operations = {
 #ifdef CONFIG_EXT2_FS_XIP
 struct file_operations ext2_xip_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= do_sync_read,
-	.write		= do_sync_write,
-	.aio_read	= xip_file_aio_read,
-	.aio_write	= xip_file_aio_write,
+	.read		= xip_file_read,
+	.write		= xip_file_write,
 	.ioctl		= ext2_ioctl,
 	.mmap		= xip_file_mmap,
 	.open		= generic_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_sync_file,
-	.readv		= xip_file_readv,
-	.writev		= xip_file_writev,
 	.sendfile	= xip_file_sendfile,
 };
 #endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7e0501895f35..3ae8e37bdfc8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1500,18 +1500,14 @@ extern int generic_file_open(struct inode * inode, struct file * filp);
 extern int nonseekable_open(struct inode * inode, struct file * filp);
 
 #ifdef CONFIG_FS_XIP
-extern ssize_t xip_file_aio_read(struct kiocb *iocb, char __user *buf,
-				 size_t count, loff_t pos);
-extern ssize_t xip_file_readv(struct file *filp, const struct iovec *iov,
-			      unsigned long nr_segs, loff_t *ppos);
+extern ssize_t xip_file_read(struct file *filp, char __user *buf, size_t len,
+			     loff_t *ppos);
 extern ssize_t xip_file_sendfile(struct file *in_file, loff_t *ppos,
 				 size_t count, read_actor_t actor,
 				 void *target);
 extern int xip_file_mmap(struct file * file, struct vm_area_struct * vma);
-extern ssize_t xip_file_aio_write(struct kiocb *iocb, const char __user *buf,
-				  size_t count, loff_t pos);
-extern ssize_t xip_file_writev(struct file *file, const struct iovec *iov,
-			       unsigned long nr_segs, loff_t *ppos);
+extern ssize_t xip_file_write(struct file *filp, const char __user *buf,
+			      size_t len, loff_t *ppos);
 extern int xip_truncate_page(struct address_space *mapping, loff_t from);
 #else
 static inline int xip_truncate_page(struct address_space *mapping, loff_t from)
diff --git a/mm/filemap.h b/mm/filemap.h
index c2d0546a57eb..13793ba0ce17 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -15,7 +15,7 @@
 #include <linux/config.h>
 #include <asm/uaccess.h>
 
-extern size_t
+size_t
 __filemap_copy_from_user_iovec(char *vaddr,
 			       const struct iovec *iov,
 			       size_t base,
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 7d63acd48817..3b6e384b98a6 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -114,83 +114,28 @@ out:
 		file_accessed(filp);
 }
 
-/*
- * This is the "read()" routine for all filesystems
- * that uses the get_xip_page address space operation.
- */
-static ssize_t
-__xip_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t *ppos)
-{
-	struct file *filp = iocb->ki_filp;
-	ssize_t retval;
-	unsigned long seg;
-	size_t count;
-
-	count = 0;
-	for (seg = 0; seg < nr_segs; seg++) {
-		const struct iovec *iv = &iov[seg];
-
-		/*
-		 * If any segment has a negative length, or the cumulative
-		 * length ever wraps negative then return -EINVAL.
-		 */
-		count += iv->iov_len;
-		if (unlikely((ssize_t)(count|iv->iov_len) < 0))
-			return -EINVAL;
-		if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
-			continue;
-		if (seg == 0)
-			return -EFAULT;
-		nr_segs = seg;
-		count -= iv->iov_len;	/* This segment is no good */
-		break;
-	}
-
-	retval = 0;
-	if (count) {
-		for (seg = 0; seg < nr_segs; seg++) {
-			read_descriptor_t desc;
-
-			desc.written = 0;
-			desc.arg.buf = iov[seg].iov_base;
-			desc.count = iov[seg].iov_len;
-			if (desc.count == 0)
-				continue;
-			desc.error = 0;
-			do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
-					    ppos, &desc, file_read_actor);
-			retval += desc.written;
-			if (!retval) {
-				retval = desc.error;
-				break;
-			}
-		}
-	}
-	return retval;
-}
-
 ssize_t
-xip_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count,
-		  loff_t pos)
+xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 {
-	struct iovec local_iov = { .iov_base = buf, .iov_len = count };
+	read_descriptor_t desc;
 
-	BUG_ON(iocb->ki_pos != pos);
-	return __xip_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
-}
-EXPORT_SYMBOL_GPL(xip_file_aio_read);
+	if (!access_ok(VERIFY_WRITE, buf, len))
+		return -EFAULT;
 
-ssize_t
-xip_file_readv(struct file *filp, const struct iovec *iov,
-	       unsigned long nr_segs, loff_t *ppos)
-{
-	struct kiocb kiocb;
+	desc.written = 0;
+	desc.arg.buf = buf;
+	desc.count = len;
+	desc.error = 0;
 
-	init_sync_kiocb(&kiocb, filp);
-	return __xip_file_aio_read(&kiocb, iov, nr_segs, ppos);
+	do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
+			    ppos, &desc, file_read_actor);
+
+	if (desc.written)
+		return desc.written;
+	else
+		return desc.error;
 }
-EXPORT_SYMBOL_GPL(xip_file_readv);
+EXPORT_SYMBOL_GPL(xip_file_read);
 
 ssize_t
 xip_file_sendfile(struct file *in_file, loff_t *ppos,
@@ -326,25 +271,19 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
 EXPORT_SYMBOL_GPL(xip_file_mmap);
 
 static ssize_t
-do_xip_file_write(struct kiocb *iocb, const struct iovec *iov,
-		  unsigned long nr_segs, loff_t pos, loff_t *ppos,
-		  size_t count)
+__xip_file_write(struct file *filp, const char __user *buf,
+		  size_t count, loff_t pos, loff_t *ppos)
 {
-	struct file *file = iocb->ki_filp;
-	struct address_space * mapping = file->f_mapping;
+	struct address_space * mapping = filp->f_mapping;
 	struct address_space_operations *a_ops = mapping->a_ops;
 	struct inode 	*inode = mapping->host;
 	long		status = 0;
 	struct page	*page;
 	size_t		bytes;
-	const struct iovec *cur_iov = iov; /* current iovec */
-	size_t		iov_base = 0;	   /* offset in the current iovec */
-	char __user	*buf;
 	ssize_t		written = 0;
 
 	BUG_ON(!mapping->a_ops->get_xip_page);
 
-	buf = iov->iov_base;
 	do {
 		unsigned long index;
 		unsigned long offset;
@@ -365,15 +304,14 @@ do_xip_file_write(struct kiocb *iocb, const struct iovec *iov,
 		fault_in_pages_readable(buf, bytes);
 
 		page = a_ops->get_xip_page(mapping,
-						    index*(PAGE_SIZE/512), 0);
+					   index*(PAGE_SIZE/512), 0);
 		if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) {
 			/* we allocate a new page unmap it */
 			page = a_ops->get_xip_page(mapping,
-				index*(PAGE_SIZE/512), 1);
+						   index*(PAGE_SIZE/512), 1);
 			if (!IS_ERR(page))
-			/* unmap page at pgoff from all other vmas */
-			__xip_unmap(mapping, index);
-
+				/* unmap page at pgoff from all other vmas */
+				__xip_unmap(mapping, index);
 		}
 
 		if (IS_ERR(page)) {
@@ -383,12 +321,7 @@ do_xip_file_write(struct kiocb *iocb, const struct iovec *iov,
 
 		BUG_ON(!PageUptodate(page));
 
-		if (likely(nr_segs == 1))
-			copied = filemap_copy_from_user(page, offset,
-							buf, bytes);
-		else
-			copied = filemap_copy_from_user_iovec(page, offset,
-						cur_iov, iov_base, bytes);
+		copied = filemap_copy_from_user(page, offset, buf, bytes);
 		flush_dcache_page(page);
 		if (likely(copied > 0)) {
 			status = copied;
@@ -398,9 +331,6 @@ do_xip_file_write(struct kiocb *iocb, const struct iovec *iov,
 				count -= status;
 				pos += status;
 				buf += status;
-				if (unlikely(nr_segs > 1))
-					filemap_set_next_iovec(&cur_iov,
-							&iov_base, status);
 			}
 		}
 		if (unlikely(copied != bytes))
@@ -422,110 +352,52 @@ do_xip_file_write(struct kiocb *iocb, const struct iovec *iov,
 	return written ? written : status;
 }
 
-static ssize_t
-xip_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
-				unsigned long nr_segs, loff_t *ppos)
+ssize_t
+xip_file_write(struct file *filp, const char __user *buf, size_t len,
+	       loff_t *ppos)
 {
-	struct file *file = iocb->ki_filp;
-	struct address_space * mapping = file->f_mapping;
-	size_t ocount;		/* original count */
-	size_t count;		/* after file limit checks */
-	struct inode 	*inode = mapping->host;
-	unsigned long	seg;
-	loff_t		pos;
-	ssize_t		written;
-	ssize_t		err;
+	struct address_space *mapping = filp->f_mapping;
+	struct inode *inode = mapping->host;
+	size_t count;
+	loff_t pos;
+	ssize_t ret;
 
-	ocount = 0;
-	for (seg = 0; seg < nr_segs; seg++) {
-		const struct iovec *iv = &iov[seg];
+	down(&inode->i_sem);
 
-		/*
-		 * If any segment has a negative length, or the cumulative
-		 * length ever wraps negative then return -EINVAL.
-		 */
-		ocount += iv->iov_len;
-		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
-			return -EINVAL;
-		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
-			continue;
-		if (seg == 0)
-			return -EFAULT;
-		nr_segs = seg;
-		ocount -= iv->iov_len;	/* This segment is no good */
-		break;
+	if (!access_ok(VERIFY_READ, buf, len)) {
+		ret=-EFAULT;
+		goto out_up;
 	}
 
-	count = ocount;
 	pos = *ppos;
+	count = len;
 
 	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
 
-	written = 0;
-
-	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
-	if (err)
-		goto out;
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = mapping->backing_dev_info;
 
+	ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
+	if (ret)
+		goto out_backing;
 	if (count == 0)
-		goto out;
+		goto out_backing;
 
-	err = remove_suid(file->f_dentry);
-	if (err)
-		goto out;
+	ret = remove_suid(filp->f_dentry);
+	if (ret)
+		goto out_backing;
 
 	inode_update_time(inode, 1);
 
-	/* use execute in place to copy directly to disk */
-	written = do_xip_file_write (iocb, iov,
-				  nr_segs, pos, ppos, count);
- out:
-	return written ? written : err;
-}
-
-static ssize_t
-__xip_file_write_nolock(struct file *file, const struct iovec *iov,
-			unsigned long nr_segs, loff_t *ppos)
-{
-	struct kiocb kiocb;
-
-	init_sync_kiocb(&kiocb, file);
-	return xip_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
-}
-
-ssize_t
-xip_file_aio_write(struct kiocb *iocb, const char __user *buf,
-		       size_t count, loff_t pos)
-{
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	ssize_t ret;
-	struct iovec local_iov = { .iov_base = (void __user *)buf,
-				   .iov_len = count };
+	ret = __xip_file_write (filp, buf, count, pos, ppos);
 
-	BUG_ON(iocb->ki_pos != pos);
-
-	down(&inode->i_sem);
-	ret = xip_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
+ out_backing:
+	current->backing_dev_info = NULL;
+ out_up:
 	up(&inode->i_sem);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(xip_file_aio_write);
-
-ssize_t xip_file_writev(struct file *file, const struct iovec *iov,
-			unsigned long nr_segs, loff_t *ppos)
-{
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	ssize_t ret;
-
-	down(&inode->i_sem);
-	ret = __xip_file_write_nolock(file, iov, nr_segs, ppos);
-	up(&inode->i_sem);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(xip_file_writev);
+EXPORT_SYMBOL_GPL(xip_file_write);
 
 /*
  * truncate a page used for execute in place
@@ -541,7 +413,6 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
 	unsigned length;
 	struct page *page;
 	void *kaddr;
-	int err;
 
 	BUG_ON(!mapping->a_ops->get_xip_page);
 
@@ -556,17 +427,14 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
 
 	page = mapping->a_ops->get_xip_page(mapping,
 					    index*(PAGE_SIZE/512), 0);
-	err = -ENOMEM;
 	if (!page)
-		goto out;
+		return -ENOMEM;
 	if (unlikely(IS_ERR(page))) {
-		if (PTR_ERR(page) == -ENODATA) {
+		if (PTR_ERR(page) == -ENODATA)
 			/* Hole? No need to truncate */
 			return 0;
-		} else {
-			err = PTR_ERR(page);
-			goto out;
-		}
+		else
+			return PTR_ERR(page);
 	} else
 		BUG_ON(!PageUptodate(page));
 	kaddr = kmap_atomic(page, KM_USER0);
@@ -574,8 +442,6 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
 	kunmap_atomic(kaddr, KM_USER0);
 
 	flush_dcache_page(page);
-	err = 0;
-out:
-	return err;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(xip_truncate_page);
-- 
cgit v1.2.3-59-g8ed1b


From 52c1da39534fb382c061de58b65f678ad74b59f5 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Thu, 23 Jun 2005 22:05:33 -0700
Subject: [PATCH] make various things static

Another rollup of patches which give various symbols static scope

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/media/common/saa7146_fops.c |  2 +-
 drivers/media/video/tvaudio.c       | 22 +++++++++++-----------
 drivers/scsi/hosts.c                |  2 +-
 drivers/scsi/scsi.c                 |  6 ++++--
 drivers/scsi/scsi_debug.c           |  2 +-
 drivers/scsi/scsi_lib.c             |  2 +-
 drivers/scsi/scsi_priv.h            |  4 ----
 drivers/scsi/scsi_sysfs.c           |  4 ++--
 fs/namespace.c                      |  2 +-
 fs/reiserfs/stree.c                 |  2 +-
 include/linux/irq.h                 |  1 -
 include/linux/namespace.h           |  1 -
 include/net/sctp/sm.h               |  6 ------
 kernel/irq/spurious.c               |  2 +-
 kernel/module.c                     |  2 +-
 kernel/power/swsusp.c               |  2 +-
 net/sctp/sm_statefuns.c             | 16 ++++++++++++++--
 17 files changed, 40 insertions(+), 38 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/common/saa7146_fops.c b/drivers/media/common/saa7146_fops.c
index cb826c9adfe7..c04fd11526e0 100644
--- a/drivers/media/common/saa7146_fops.c
+++ b/drivers/media/common/saa7146_fops.c
@@ -403,7 +403,7 @@ static struct file_operations video_fops =
 	.llseek		= no_llseek,
 };
 
-void vv_callback(struct saa7146_dev *dev, unsigned long status)
+static void vv_callback(struct saa7146_dev *dev, unsigned long status)
 {
 	u32 isr = status;
 	
diff --git a/drivers/media/video/tvaudio.c b/drivers/media/video/tvaudio.c
index 5430b25b910d..9a493bea76d8 100644
--- a/drivers/media/video/tvaudio.c
+++ b/drivers/media/video/tvaudio.c
@@ -1236,17 +1236,17 @@ static int ta8874z_checkit(struct CHIPSTATE *chip)
 /* audio chip descriptions - struct CHIPDESC                              */
 
 /* insmod options to enable/disable individual audio chips */
-int tda8425  = 1;
-int tda9840  = 1;
-int tda9850  = 1;
-int tda9855  = 1;
-int tda9873  = 1;
-int tda9874a = 1;
-int tea6300  = 0;  // address clash with msp34xx
-int tea6320  = 0;  // address clash with msp34xx
-int tea6420  = 1;
-int pic16c54 = 1;
-int ta8874z  = 0;  // address clash with tda9840
+static int tda8425  = 1;
+static int tda9840  = 1;
+static int tda9850  = 1;
+static int tda9855  = 1;
+static int tda9873  = 1;
+static int tda9874a = 1;
+static int tea6300  = 0;  // address clash with msp34xx
+static int tea6320  = 0;  // address clash with msp34xx
+static int tea6420  = 1;
+static int pic16c54 = 1;
+static int ta8874z  = 0;  // address clash with tda9840
 
 module_param(tda8425, int, 0444);
 module_param(tda9840, int, 0444);
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c
index ba347576d99b..d7a38b6713f9 100644
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -56,7 +56,7 @@ static struct class shost_class = {
  * @shost:	pointer to struct Scsi_Host
  * recovery:	recovery requested to run.
  **/
-void scsi_host_cancel(struct Scsi_Host *shost, int recovery)
+static void scsi_host_cancel(struct Scsi_Host *shost, int recovery)
 {
 	struct scsi_device *sdev;
 
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 5578ae9a9e45..1cb5f7d4f278 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -68,6 +68,8 @@
 #include "scsi_priv.h"
 #include "scsi_logging.h"
 
+static void scsi_done(struct scsi_cmnd *cmd);
+static int scsi_retry_command(struct scsi_cmnd *cmd);
 
 /*
  * Definitions and constants.
@@ -741,7 +743,7 @@ static DEFINE_PER_CPU(struct list_head, scsi_done_q);
  *
  * This function is interrupt context safe.
  */
-void scsi_done(struct scsi_cmnd *cmd)
+static void scsi_done(struct scsi_cmnd *cmd)
 {
 	/*
 	 * We don't have to worry about this one timing out any more.
@@ -836,7 +838,7 @@ static void scsi_softirq(struct softirq_action *h)
  *              level drivers should not become re-entrant as a result of
  *              this.
  */
-int scsi_retry_command(struct scsi_cmnd *cmd)
+static int scsi_retry_command(struct scsi_cmnd *cmd)
 {
 	/*
 	 * Restore the SCSI command state.
diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c
index e0208886b45e..322b5a41a36f 100644
--- a/drivers/scsi/scsi_debug.c
+++ b/drivers/scsi/scsi_debug.c
@@ -1783,7 +1783,7 @@ static void __exit scsi_debug_exit(void)
 device_initcall(scsi_debug_init);
 module_exit(scsi_debug_exit);
 
-void pseudo_0_release(struct device * dev)
+static void pseudo_0_release(struct device * dev)
 {
 	if (SCSI_DEBUG_OPT_NOISE & scsi_debug_opts)
 		printk(KERN_INFO "scsi_debug: pseudo_0_release() called\n");
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9f996499fa9d..621dee8b8cb2 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -44,7 +44,7 @@ struct scsi_host_sg_pool {
 #endif
 
 #define SP(x) { x, "sgpool-" #x } 
-struct scsi_host_sg_pool scsi_sg_pools[] = { 
+static struct scsi_host_sg_pool scsi_sg_pools[] = {
 	SP(8),
 	SP(16),
 	SP(32),
diff --git a/drivers/scsi/scsi_priv.h b/drivers/scsi/scsi_priv.h
index c01580df4476..96d4f745975c 100644
--- a/drivers/scsi/scsi_priv.h
+++ b/drivers/scsi/scsi_priv.h
@@ -61,8 +61,6 @@ extern void scsi_exit_hosts(void);
 extern int scsi_dispatch_cmd(struct scsi_cmnd *cmd);
 extern int scsi_setup_command_freelist(struct Scsi_Host *shost);
 extern void scsi_destroy_command_freelist(struct Scsi_Host *shost);
-extern void scsi_done(struct scsi_cmnd *cmd);
-extern int scsi_retry_command(struct scsi_cmnd *cmd);
 extern int scsi_insert_special_req(struct scsi_request *sreq, int);
 extern void scsi_init_cmd_from_req(struct scsi_cmnd *cmd,
 		struct scsi_request *sreq);
@@ -136,7 +134,6 @@ extern void scsi_exit_sysctl(void);
 #endif /* CONFIG_SYSCTL */
 
 /* scsi_sysfs.c */
-extern void scsi_device_dev_release(struct device *);
 extern int scsi_sysfs_add_sdev(struct scsi_device *);
 extern int scsi_sysfs_add_host(struct Scsi_Host *);
 extern int scsi_sysfs_register(void);
@@ -145,7 +142,6 @@ extern void scsi_sysfs_device_initialize(struct scsi_device *);
 extern int scsi_sysfs_target_initialize(struct scsi_device *);
 extern struct scsi_transport_template blank_transport_template;
 
-extern struct class sdev_class;
 extern struct bus_type scsi_bus_type;
 
 /* 
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
index 93b41100a6d8..beed7fbe1cbe 100644
--- a/drivers/scsi/scsi_sysfs.c
+++ b/drivers/scsi/scsi_sysfs.c
@@ -150,7 +150,7 @@ static void scsi_device_cls_release(struct class_device *class_dev)
 	put_device(&sdev->sdev_gendev);
 }
 
-void scsi_device_dev_release(struct device *dev)
+static void scsi_device_dev_release(struct device *dev)
 {
 	struct scsi_device *sdev;
 	struct device *parent;
@@ -185,7 +185,7 @@ void scsi_device_dev_release(struct device *dev)
 		put_device(parent);
 }
 
-struct class sdev_class = {
+static struct class sdev_class = {
 	.name		= "scsi_device",
 	.release	= scsi_device_cls_release,
 };
diff --git a/fs/namespace.c b/fs/namespace.c
index 3b93e5d750eb..208c079e9fdb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -337,7 +337,7 @@ int may_umount(struct vfsmount *mnt)
 
 EXPORT_SYMBOL(may_umount);
 
-void umount_tree(struct vfsmount *mnt)
+static void umount_tree(struct vfsmount *mnt)
 {
 	struct vfsmount *p;
 	LIST_HEAD(kill);
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index c47f8fd31a2d..63158491e152 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -223,7 +223,7 @@ extern struct tree_balance * cur_tb;
 const struct reiserfs_key  MIN_KEY = {0, 0, {{0, 0},}};
 
 /* Maximal possible key. It is never in the tree. */
-const struct reiserfs_key  MAX_KEY = {
+static const struct reiserfs_key  MAX_KEY = {
 	__constant_cpu_to_le32(0xffffffff),
 	__constant_cpu_to_le32(0xffffffff),
 	{{__constant_cpu_to_le32(0xffffffff),
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 7fc1022be9ee..12277799c007 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -88,7 +88,6 @@ extern fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
 				       struct irqaction *action);
 extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs);
 extern void note_interrupt(unsigned int irq, irq_desc_t *desc, int action_ret);
-extern void report_bad_irq(unsigned int irq, irq_desc_t *desc, int action_ret);
 extern int can_request_irq(unsigned int irq, unsigned long irqflags);
 
 extern void init_irq_proc(void);
diff --git a/include/linux/namespace.h b/include/linux/namespace.h
index 9eca1558d72f..697991b69f9b 100644
--- a/include/linux/namespace.h
+++ b/include/linux/namespace.h
@@ -12,7 +12,6 @@ struct namespace {
 	struct rw_semaphore	sem;
 };
 
-extern void umount_tree(struct vfsmount *);
 extern int copy_namespace(int, struct task_struct *);
 extern void __put_namespace(struct namespace *namespace);
 
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index a53e08a45e32..88d9fe5975d5 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -131,7 +131,6 @@ sctp_state_fn_t sctp_sf_do_ecne;
 sctp_state_fn_t sctp_sf_ootb;
 sctp_state_fn_t sctp_sf_pdiscard;
 sctp_state_fn_t sctp_sf_violation;
-sctp_state_fn_t sctp_sf_violation_chunklen;
 sctp_state_fn_t sctp_sf_discard_chunk;
 sctp_state_fn_t sctp_sf_do_5_2_1_siminit;
 sctp_state_fn_t sctp_sf_do_5_2_2_dupinit;
@@ -259,11 +258,6 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
 void sctp_chunk_assign_tsn(struct sctp_chunk *);
 void sctp_chunk_assign_ssn(struct sctp_chunk *);
 
-sctp_disposition_t  sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
-					   __u16 error,
-					   const struct sctp_association *asoc,
-					   struct sctp_transport *transport);
-
 /* Prototypes for statetable processing. */
 
 int sctp_do_sm(sctp_event_t event_type, sctp_subtype_t subtype,
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index f6297c306905..ba039e827d58 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -45,7 +45,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
 	}
 }
 
-void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
 {
 	static int count = 100;
 
diff --git a/kernel/module.c b/kernel/module.c
index 0494c89a0d26..068e271ab3a5 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -730,7 +730,7 @@ static int obsparm_copy_string(const char *val, struct kernel_param *kp)
 	return 0;
 }
 
-int set_obsolete(const char *val, struct kernel_param *kp)
+static int set_obsolete(const char *val, struct kernel_param *kp)
 {
 	unsigned int min, max;
 	unsigned int size, maxsize;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 90b3b68dee3f..53f9f8720ee4 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -81,7 +81,7 @@ static int nr_copy_pages_check;
 extern char resume_file[];
 
 /* Local variables that should not be affected by save */
-unsigned int nr_copy_pages __nosavedata = 0;
+static unsigned int nr_copy_pages __nosavedata = 0;
 
 /* Suspend pagedir is allocated before final copy, therefore it
    must be freed after resume 
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 058189684c7c..86073df418f5 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -92,6 +92,17 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep,
 					     sctp_cmd_seq_t *commands);
 static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk);
 
+static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
+					   __u16 error,
+					   const struct sctp_association *asoc,
+					   struct sctp_transport *transport);
+
+static sctp_disposition_t sctp_sf_violation_chunklen(
+				     const struct sctp_endpoint *ep,
+				     const struct sctp_association *asoc,
+				     const sctp_subtype_t type,
+				     void *arg,
+				     sctp_cmd_seq_t *commands);
 
 /* Small helper function that checks if the chunk length
  * is of the appropriate length.  The 'required_length' argument
@@ -2328,7 +2339,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_abort(const struct sctp_endpoint *ep,
  *
  * This is common code called by several sctp_sf_*_abort() functions above.
  */
-sctp_disposition_t  sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
+static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands,
 					   __u16 error,
 					   const struct sctp_association *asoc,
 					   struct sctp_transport *transport)
@@ -3687,7 +3698,8 @@ sctp_disposition_t sctp_sf_violation(const struct sctp_endpoint *ep,
  *
  * Generate an  ABORT chunk and terminate the association.
  */
-sctp_disposition_t sctp_sf_violation_chunklen(const struct sctp_endpoint *ep,
+static sctp_disposition_t sctp_sf_violation_chunklen(
+				     const struct sctp_endpoint *ep,
 				     const struct sctp_association *asoc,
 				     const sctp_subtype_t type,
 				     void *arg,
-- 
cgit v1.2.3-59-g8ed1b


From 75043cb5b386e5a01fd03b88f647dd992de02f97 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@mail.ru>
Date: Fri, 24 Jun 2005 20:52:52 +0000
Subject: [PATCH] fs/qnx4/*: fix sparse warnings

This patch fixes sparse warnings in the qnx4fs (and might even make
qnx4fs work on big-endian boxes)

Signed-off-by: Alexey Dobriyan <adobriyan@mail.ru>
Signed-off-by: Domen Puncer <domen@coderock.org>
Signed-off-by: Anders Larsen <al@alarsen.net>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/qnx4/dir.c            |  2 +-
 fs/qnx4/inode.c          |  4 ++--
 include/linux/qnx4_fs.h  | 18 +++++++++---------
 include/linux/qnxtypes.h | 16 ++++++++--------
 4 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index cd66147cca04..7a8f5595c26f 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -61,7 +61,7 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
 						ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
 					else {
 						le  = (struct qnx4_link_info*)de;
-						ino = ( le->dl_inode_blk - 1 ) *
+						ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) *
 							QNX4_INODES_PER_BLOCK +
 							le->dl_inode_ndx;
 					}
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index aa92d6b76a9a..b79162a35478 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -236,7 +236,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
 	struct buffer_head *bh = NULL;
 	struct qnx4_xblk *xblk = NULL;
 	struct qnx4_inode_entry *qnx4_inode = qnx4_raw_inode(inode);
-	qnx4_nxtnt_t nxtnt = le16_to_cpu(qnx4_inode->di_num_xtnts);
+	u16 nxtnt = le16_to_cpu(qnx4_inode->di_num_xtnts);
 
 	if ( iblock < le32_to_cpu(qnx4_inode->di_first_xtnt.xtnt_size) ) {
 		// iblock is in the first extent. This is easy.
@@ -372,7 +372,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 		printk("qnx4: unable to read the superblock\n");
 		goto outnobh;
 	}
-	if ( le32_to_cpu( *(__u32*)bh->b_data ) != QNX4_SUPER_MAGIC ) {
+	if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) {
 		if (!silent)
 			printk("qnx4: wrong fsid in superblock.\n");
 		goto out;
diff --git a/include/linux/qnx4_fs.h b/include/linux/qnx4_fs.h
index 22ba580b0ae8..fc610bb0f733 100644
--- a/include/linux/qnx4_fs.h
+++ b/include/linux/qnx4_fs.h
@@ -46,11 +46,11 @@ struct qnx4_inode_entry {
 	char		di_fname[QNX4_SHORT_NAME_MAX];
 	qnx4_off_t	di_size;
 	qnx4_xtnt_t	di_first_xtnt;
-	__u32		di_xblk;
-	__s32		di_ftime;
-	__s32		di_mtime;
-	__s32		di_atime;
-	__s32		di_ctime;
+	__le32		di_xblk;
+	__le32		di_ftime;
+	__le32		di_mtime;
+	__le32		di_atime;
+	__le32		di_ctime;
 	qnx4_nxtnt_t	di_num_xtnts;
 	qnx4_mode_t	di_mode;
 	qnx4_muid_t	di_uid;
@@ -63,18 +63,18 @@ struct qnx4_inode_entry {
 
 struct qnx4_link_info {
 	char		dl_fname[QNX4_NAME_MAX];
-	__u32		dl_inode_blk;
+	__le32		dl_inode_blk;
 	__u8		dl_inode_ndx;
 	__u8		dl_spare[10];
 	__u8		dl_status;
 };
 
 struct qnx4_xblk {
-	__u32		xblk_next_xblk;
-	__u32		xblk_prev_xblk;
+	__le32		xblk_next_xblk;
+	__le32		xblk_prev_xblk;
 	__u8		xblk_num_xtnts;
 	__u8		xblk_spare[3];
-	__s32		xblk_num_blocks;
+	__le32		xblk_num_blocks;
 	qnx4_xtnt_t	xblk_xtnts[QNX4_MAX_XTNTS_PER_XBLK];
 	char		xblk_signature[8];
 	qnx4_xtnt_t	xblk_first_xtnt;
diff --git a/include/linux/qnxtypes.h b/include/linux/qnxtypes.h
index fb518e318c7c..a3eb1137857b 100644
--- a/include/linux/qnxtypes.h
+++ b/include/linux/qnxtypes.h
@@ -12,18 +12,18 @@
 #ifndef _QNX4TYPES_H
 #define _QNX4TYPES_H
 
-typedef __u16 qnx4_nxtnt_t;
+typedef __le16 qnx4_nxtnt_t;
 typedef __u8  qnx4_ftype_t;
 
 typedef struct {
-	__u32 xtnt_blk;
-	__u32 xtnt_size;
+	__le32 xtnt_blk;
+	__le32 xtnt_size;
 } qnx4_xtnt_t;
 
-typedef __u16 qnx4_mode_t;
-typedef __u16 qnx4_muid_t;
-typedef __u16 qnx4_mgid_t;
-typedef __u32 qnx4_off_t;
-typedef __u16 qnx4_nlink_t;
+typedef __le16 qnx4_mode_t;
+typedef __le16 qnx4_muid_t;
+typedef __le16 qnx4_mgid_t;
+typedef __le32 qnx4_off_t;
+typedef __le16 qnx4_nlink_t;
 
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From e70c9d5e61c6cb2272c866fc1303e62975006752 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor_core@ameritech.net>
Date: Sat, 25 Jun 2005 14:54:25 -0700
Subject: [PATCH] I8K: use standard DMI interface

I8K: Change to use the stock DMI infrastructure instead of homegrown
     parsing code. The driver now requires the box's DMI data to match
     the list of supported models, so the driver can be safely compiled
     in by default without fear of it poking into random SMM BIOS
     code. DMI checks can be ignored with the i8k.ignore_dmi option.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/kernel-parameters.txt |   3 +
 arch/i386/kernel/dmi_scan.c         |   6 +-
 drivers/char/i8k.c                  | 302 +++++++-----------------------------
 include/linux/dmi.h                 |   1 +
 4 files changed, 60 insertions(+), 252 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4924d387a657..86db43fd6b0f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -548,6 +548,9 @@ running once the system is up.
 
 	i810=		[HW,DRM]
 
+	i8k.ignore_dmi	[HW] Continue probing hardware even if DMI data
+			indicates that the driver is running on unsupported
+			hardware.
 	i8k.force	[HW] Activate i8k driver even if SMM BIOS signature
 			does not match list of supported models.
 	i8k.power_status
diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c
index 6ed7e28f306c..3facd20212bb 100644
--- a/arch/i386/kernel/dmi_scan.c
+++ b/arch/i386/kernel/dmi_scan.c
@@ -414,6 +414,7 @@ static void __init dmi_decode(struct dmi_header *dm)
 			dmi_save_ident(dm, DMI_PRODUCT_VERSION, 6);
 			dmi_printk(("Serial Number: %s\n",
 				dmi_string(dm, data[7])));
+			dmi_save_ident(dm, DMI_PRODUCT_SERIAL, 7);
 			break;
 		case 2:
 			dmi_printk(("Board Vendor: %s\n",
@@ -470,7 +471,6 @@ fail:		d++;
 
 	return count;
 }
-
 EXPORT_SYMBOL(dmi_check_system);
 
 /**
@@ -480,8 +480,8 @@ EXPORT_SYMBOL(dmi_check_system);
  *	Returns one DMI data value, can be used to perform
  *	complex DMI data checks.
  */
-char * dmi_get_system_info(int field)
+char *dmi_get_system_info(int field)
 {
 	return dmi_ident[field];
 }
-
+EXPORT_SYMBOL(dmi_get_system_info);
diff --git a/drivers/char/i8k.c b/drivers/char/i8k.c
index bf5e43beca62..81d2f675fb77 100644
--- a/drivers/char/i8k.c
+++ b/drivers/char/i8k.c
@@ -20,7 +20,7 @@
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/proc_fs.h>
-#include <linux/apm_bios.h>
+#include <linux/dmi.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
@@ -52,18 +52,7 @@
 
 #define I8K_TEMPERATURE_BUG	1
 
-#define DELL_SIGNATURE		"Dell Computer"
-
-static char *supported_models[] = {
-	"Inspiron",
-	"Latitude",
-	NULL
-};
-
-static char system_vendor[48] = "?";
-static char product_name[48] = "?";
-static char bios_version[4] = "?";
-static char serial_number[16] = "?";
+static char bios_version[4];
 
 MODULE_AUTHOR("Massimo Dal Zotto (dz@debian.org)");
 MODULE_DESCRIPTION("Driver for accessing SMM BIOS on Dell laptops");
@@ -73,6 +62,10 @@ static int force;
 module_param(force, bool, 0);
 MODULE_PARM_DESC(force, "Force loading without checking for supported models");
 
+static int ignore_dmi;
+module_param(ignore_dmi, bool, 0);
+MODULE_PARM_DESC(ignore_dmi, "Continue probing hardware even if DMI data does not match");
+
 static int restricted;
 module_param(restricted, bool, 0);
 MODULE_PARM_DESC(restricted, "Allow fan control if SYS_ADMIN capability set");
@@ -99,11 +92,10 @@ typedef struct {
 	unsigned int edi __attribute__ ((packed));
 } SMMRegisters;
 
-typedef struct {
-	u8 type;
-	u8 length;
-	u16 handle;
-} DMIHeader;
+static inline char *i8k_get_dmi_data(int field)
+{
+	return dmi_get_system_info(field) ? : "N/A";
+}
 
 /*
  * Call the System Management Mode BIOS. Code provided by Jonathan Buzzard.
@@ -162,15 +154,6 @@ static int i8k_get_bios_version(void)
 	return regs.eax;
 }
 
-/*
- * Read the machine id.
- */
-static int i8k_get_serial_number(unsigned char *buff)
-{
-	strlcpy(buff, serial_number, sizeof(serial_number));
-	return 0;
-}
-
 /*
  * Read the Fn key status.
  */
@@ -328,7 +311,7 @@ static int i8k_get_dell_signature(void)
 static int i8k_ioctl(struct inode *ip, struct file *fp, unsigned int cmd,
 		     unsigned long arg)
 {
-	int val;
+	int val = 0;
 	int speed;
 	unsigned char buff[16];
 	int __user *argp = (int __user *)arg;
@@ -343,7 +326,7 @@ static int i8k_ioctl(struct inode *ip, struct file *fp, unsigned int cmd,
 
 	case I8K_MACHINE_ID:
 		memset(buff, 0, 16);
-		val = i8k_get_serial_number(buff);
+		strlcpy(buff, i8k_get_dmi_data(DMI_PRODUCT_SERIAL), sizeof(buff));
 		break;
 
 	case I8K_FN_STATUS:
@@ -451,10 +434,10 @@ static int i8k_get_info(char *buffer, char **start, off_t fpos, int length)
 	n = sprintf(buffer, "%s %s %s %d %d %d %d %d %d %d\n",
 		    I8K_PROC_FMT,
 		    bios_version,
-		    serial_number,
+		    dmi_get_system_info(DMI_PRODUCT_SERIAL) ? : "N/A",
 		    cpu_temp,
-		    left_fan,
-		    right_fan, left_speed, right_speed, ac_power, fn_key);
+		    left_fan, right_fan, left_speed, right_speed,
+		    ac_power, fn_key);
 
 	return n;
 }
@@ -486,201 +469,23 @@ static ssize_t i8k_read(struct file *f, char __user * buffer, size_t len,
 	return len;
 }
 
-static char *__init string_trim(char *s, int size)
-{
-	int len;
-	char *p;
-
-	if ((len = strlen(s)) > size) {
-		len = size;
-	}
-
-	for (p = s + len - 1; len && (*p == ' '); len--, p--) {
-		*p = '\0';
-	}
-
-	return s;
-}
-
-/* DMI code, stolen from arch/i386/kernel/dmi_scan.c */
-
-/*
- * |<-- dmi->length -->|
- * |                   |
- * |dmi header    s=N  | string1,\0, ..., stringN,\0, ..., \0
- *                |                       |
- *                +-----------------------+
- */
-static char *__init dmi_string(DMIHeader * dmi, u8 s)
-{
-	u8 *p;
-
-	if (!s) {
-		return "";
-	}
-	s--;
-
-	p = (u8 *) dmi + dmi->length;
-	while (s > 0) {
-		p += strlen(p);
-		p++;
-		s--;
-	}
-
-	return p;
-}
-
-static void __init dmi_decode(DMIHeader * dmi)
-{
-	u8 *data = (u8 *) dmi;
-	char *p;
-
-#ifdef I8K_DEBUG
-	int i;
-	printk("%08x ", (int)data);
-	for (i = 0; i < data[1] && i < 64; i++) {
-		printk("%02x ", data[i]);
-	}
-	printk("\n");
-#endif
-
-	switch (dmi->type) {
-	case 0:		/* BIOS Information */
-		p = dmi_string(dmi, data[5]);
-		if (*p) {
-			strlcpy(bios_version, p, sizeof(bios_version));
-			string_trim(bios_version, sizeof(bios_version));
-		}
-		break;
-	case 1:		/* System Information */
-		p = dmi_string(dmi, data[4]);
-		if (*p) {
-			strlcpy(system_vendor, p, sizeof(system_vendor));
-			string_trim(system_vendor, sizeof(system_vendor));
-		}
-		p = dmi_string(dmi, data[5]);
-		if (*p) {
-			strlcpy(product_name, p, sizeof(product_name));
-			string_trim(product_name, sizeof(product_name));
-		}
-		p = dmi_string(dmi, data[7]);
-		if (*p) {
-			strlcpy(serial_number, p, sizeof(serial_number));
-			string_trim(serial_number, sizeof(serial_number));
-		}
-		break;
-	}
-}
-
-static int __init dmi_table(u32 base, int len, int num,
-			    void (*fn) (DMIHeader *))
-{
-	u8 *buf;
-	u8 *data;
-	DMIHeader *dmi;
-	int i = 1;
-
-	buf = ioremap(base, len);
-	if (buf == NULL) {
-		return -1;
-	}
-	data = buf;
-
-	/*
-	 * Stop when we see al the items the table claimed to have
-	 * or we run off the end of the table (also happens)
-	 */
-	while ((i < num) && ((data - buf) < len)) {
-		dmi = (DMIHeader *) data;
-		/*
-		 * Avoid misparsing crud if the length of the last
-		 * record is crap
-		 */
-		if ((data - buf + dmi->length) >= len) {
-			break;
-		}
-		fn(dmi);
-		data += dmi->length;
-		/*
-		 * Don't go off the end of the data if there is
-		 * stuff looking like string fill past the end
-		 */
-		while (((data - buf) < len) && (*data || data[1])) {
-			data++;
-		}
-		data += 2;
-		i++;
-	}
-	iounmap(buf);
-
-	return 0;
-}
-
-static int __init dmi_iterate(void (*decode) (DMIHeader *))
-{
-	unsigned char buf[20];
-	void __iomem *p = ioremap(0xe0000, 0x20000), *q;
-
-	if (!p)
-		return -1;
-
-	for (q = p; q < p + 0x20000; q += 16) {
-		memcpy_fromio(buf, q, 20);
-		if (memcmp(buf, "_DMI_", 5) == 0) {
-			u16 num  = buf[13] << 8 | buf[12];
-			u16 len  = buf[7] << 8 | buf[6];
-			u32 base = buf[11] << 24 | buf[10] << 16 | buf[9] << 8 | buf[8];
-#ifdef I8K_DEBUG
-			printk(KERN_INFO "DMI %d.%d present.\n",
-			       buf[14] >> 4, buf[14] & 0x0F);
-			printk(KERN_INFO "%d structures occupying %d bytes.\n",
-			       buf[13] << 8 | buf[12], buf[7] << 8 | buf[6]);
-			printk(KERN_INFO "DMI table at 0x%08X.\n",
-			       buf[11] << 24 | buf[10] << 16 | buf[9] << 8 |
-			       buf[8]);
-#endif
-			if (dmi_table(base, len, num, decode) == 0) {
-				iounmap(p);
-				return 0;
-			}
-		}
-	}
-	iounmap(p);
-	return -1;
-}
-
-/* end of DMI code */
-
-/*
- * Get DMI information.
- */
-static int __init i8k_dmi_probe(void)
-{
-	char **p;
-
-	if (dmi_iterate(dmi_decode) != 0) {
-		printk(KERN_INFO "i8k: unable to get DMI information\n");
-		return -ENODEV;
-	}
-
-	if (strncmp(system_vendor, DELL_SIGNATURE, strlen(DELL_SIGNATURE)) != 0) {
-		printk(KERN_INFO "i8k: not running on a Dell system\n");
-		return -ENODEV;
-	}
-
-	for (p = supported_models;; p++) {
-		if (!*p) {
-			printk(KERN_INFO "i8k: unsupported model: %s\n",
-			       product_name);
-			return -ENODEV;
-		}
-		if (strncmp(product_name, *p, strlen(*p)) == 0) {
-			break;
-		}
-	}
-
-	return 0;
-}
+static struct dmi_system_id __initdata i8k_dmi_table[] = {
+	{
+		.ident = "Dell Inspiron",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron"),
+		},
+	},
+	{
+		.ident = "Dell Latitude",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Latitude"),
+		},
+	},
+	{ }
+};
 
 /*
  * Probe for the presence of a supported laptop.
@@ -689,23 +494,30 @@ static int __init i8k_probe(void)
 {
 	char buff[4];
 	int version;
-	int smm_found = 0;
 
 	/*
 	 * Get DMI information
 	 */
-	if (i8k_dmi_probe() != 0) {
+	if (!dmi_check_system(i8k_dmi_table)) {
+		if (!ignore_dmi && !force)
+			return -ENODEV;
+
+		printk(KERN_INFO "i8k: not running on a supported Dell system.\n");
 		printk(KERN_INFO "i8k: vendor=%s, model=%s, version=%s\n",
-		       system_vendor, product_name, bios_version);
+			i8k_get_dmi_data(DMI_SYS_VENDOR),
+			i8k_get_dmi_data(DMI_PRODUCT_NAME),
+			i8k_get_dmi_data(DMI_BIOS_VERSION));
 	}
 
+	strlcpy(bios_version, i8k_get_dmi_data(DMI_BIOS_VERSION), sizeof(bios_version));
+
 	/*
 	 * Get SMM Dell signature
 	 */
 	if (i8k_get_dell_signature() != 0) {
-		printk(KERN_INFO "i8k: unable to get SMM Dell signature\n");
-	} else {
-		smm_found = 1;
+		printk(KERN_ERR "i8k: unable to get SMM Dell signature\n");
+		if (!force)
+			return -ENODEV;
 	}
 
 	/*
@@ -713,9 +525,8 @@ static int __init i8k_probe(void)
 	 */
 	version = i8k_get_bios_version();
 	if (version <= 0) {
-		printk(KERN_INFO "i8k: unable to get SMM BIOS version\n");
+		printk(KERN_WARNING "i8k: unable to get SMM BIOS version\n");
 	} else {
-		smm_found = 1;
 		buff[0] = (version >> 16) & 0xff;
 		buff[1] = (version >> 8) & 0xff;
 		buff[2] = (version) & 0xff;
@@ -723,21 +534,15 @@ static int __init i8k_probe(void)
 		/*
 		 * If DMI BIOS version is unknown use SMM BIOS version.
 		 */
-		if (bios_version[0] == '?') {
-			strcpy(bios_version, buff);
-		}
+		if (!dmi_get_system_info(DMI_BIOS_VERSION))
+			strlcpy(bios_version, buff, sizeof(bios_version));
+
 		/*
 		 * Check if the two versions match.
 		 */
-		if (strncmp(buff, bios_version, sizeof(bios_version)) != 0) {
-			printk(KERN_INFO
-			       "i8k: BIOS version mismatch: %s != %s\n", buff,
-			       bios_version);
-		}
-	}
-
-	if (!smm_found && !force) {
-		return -ENODEV;
+		if (strncmp(buff, bios_version, sizeof(bios_version)) != 0)
+			printk(KERN_WARNING "i8k: BIOS version mismatch: %s != %s\n",
+				buff, bios_version);
 	}
 
 	return 0;
@@ -751,9 +556,8 @@ int __init i8k_init(void)
 	struct proc_dir_entry *proc_i8k;
 
 	/* Are we running on an supported laptop? */
-	if (i8k_probe() != 0) {
+	if (i8k_probe())
 		return -ENODEV;
-	}
 
 	/* Register the proc entry */
 	proc_i8k = create_proc_info_entry("i8k", 0, NULL, i8k_get_info);
diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index d2bcf556088b..5e93e6dce9a4 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -9,6 +9,7 @@ enum dmi_field {
 	DMI_SYS_VENDOR,
 	DMI_PRODUCT_NAME,
 	DMI_PRODUCT_VERSION,
+	DMI_PRODUCT_SERIAL,
 	DMI_BOARD_VENDOR,
 	DMI_BOARD_NAME,
 	DMI_BOARD_VERSION,
-- 
cgit v1.2.3-59-g8ed1b


From 52a119feaad92d44a0e97d01b22afbcbaf3fc079 Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Sat, 25 Jun 2005 14:54:57 -0700
Subject: [PATCH] make smp_prepare_cpu to a weak function

I really wish smp_prepare_cpu() would disappear eventually.  In the interim
this is ideally a weak function, so we don't end up changing several places
to define this dummy in headers.

Today, since the dummy declaration is done only in drivers/base/cpu.c but
the function is called in kernel/power/smp.c, I get an undefined reference in
my cpu hotplug code for x86_64 under development.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/base/cpu.c     | 9 +++++----
 include/asm-i386/smp.h | 3 ---
 include/linux/cpu.h    | 1 +
 3 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index bdd7e9f55c81..0bf2dc11cdb8 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -16,9 +16,10 @@ struct sysdev_class cpu_sysdev_class = {
 EXPORT_SYMBOL(cpu_sysdev_class);
 
 #ifdef CONFIG_HOTPLUG_CPU
-#ifndef __HAVE_ARCH_SMP_PREPARE_CPU
-#define smp_prepare_cpu(cpu) (0)
-#endif
+int __attribute__((weak)) smp_prepare_cpu (int cpu)
+{
+	return 0;
+}
 
 static ssize_t show_online(struct sys_device *dev, char *buf)
 {
@@ -41,7 +42,7 @@ static ssize_t store_online(struct sys_device *dev, const char *buf,
 		break;
 	case '1':
 		ret = smp_prepare_cpu(cpu->sysdev.id);
-		if (ret == 0)
+		if (!ret)
 			ret = cpu_up(cpu->sysdev.id);
 		break;
 	default:
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index c9996eda5408..edad9b4712fa 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -51,9 +51,6 @@ extern u8 x86_cpu_to_apicid[];
 #ifdef CONFIG_HOTPLUG_CPU
 extern void cpu_exit_clear(void);
 extern void cpu_uninit(void);
-
-#define __HAVE_ARCH_SMP_PREPARE_CPU
-extern int smp_prepare_cpu(int cpu);
 #endif
 
 /*
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index fe0298e5dae1..e8904c0da686 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -69,6 +69,7 @@ extern struct semaphore cpucontrol;
 	register_cpu_notifier(&fn##_nb);			\
 }
 int cpu_down(unsigned int cpu);
+extern int __attribute__((weak)) smp_prepare_cpu(int cpu);
 #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu))
 #else
 #define lock_cpu_hotplug()	do { } while (0)
-- 
cgit v1.2.3-59-g8ed1b


From e6982c671c560da4a0bc5c908cbcbec12bd5991d Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Sat, 25 Jun 2005 14:54:58 -0700
Subject: [PATCH] x86_64: Change init sections for CPU hotplug support

This patch adds __cpuinit and __cpuinitdata sections that need to exist past
boot to support cpu hotplug.

Caveat: This is done *only* for EM64T CPU Hotplug support, on request from
Andi Kleen.  Much of the generic hotplug code in the kernel, and all of the
other archs that support CPU hotplug today (i386, ia64, ppc64, s390 and
parisc), don't mark sections with __cpuinit, but only mark them as __devinit
and __devinitdata.

If someone is motivated to change generic code, we need to make sure all
existing hotplug code does not break on other archs that don't use __cpuinit
and __cpuinitdata.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Acked-by: Andi Kleen <ak@muc.de>
Acked-by: Zwane Mwaikambo <zwane@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/apic.c      |  8 ++++----
 arch/x86_64/kernel/i387.c      |  2 +-
 arch/x86_64/kernel/mce.c       |  8 ++++----
 arch/x86_64/kernel/mce_intel.c |  4 ++--
 arch/x86_64/kernel/nmi.c       |  4 ++--
 arch/x86_64/kernel/process.c   |  2 +-
 arch/x86_64/kernel/setup.c     | 18 +++++++++---------
 arch/x86_64/kernel/setup64.c   |  6 +++---
 arch/x86_64/kernel/smpboot.c   | 15 +++++----------
 arch/x86_64/mm/numa.c          |  2 +-
 include/linux/init.h           | 12 ++++++++++++
 11 files changed, 44 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index f8e6cc4fecd4..2a6c893ccf60 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -285,7 +285,7 @@ void __init init_bsp_APIC(void)
 	apic_write_around(APIC_LVT1, value);
 }
 
-void __init setup_local_APIC (void)
+void __cpuinit setup_local_APIC (void)
 {
 	unsigned int value, ver, maxlvt;
 
@@ -534,7 +534,7 @@ static struct sys_device device_lapic = {
 	.cls		= &lapic_sysclass,
 };
 
-static void __init apic_pm_activate(void)
+static void __cpuinit apic_pm_activate(void)
 {
 	apic_pm_state.active = 1;
 }
@@ -774,14 +774,14 @@ void __init setup_boot_APIC_clock (void)
 	local_irq_enable();
 }
 
-void __init setup_secondary_APIC_clock(void)
+void __cpuinit setup_secondary_APIC_clock(void)
 {
 	local_irq_disable(); /* FIXME: Do we need this? --RR */
 	setup_APIC_timer(calibration_result);
 	local_irq_enable();
 }
 
-void __init disable_APIC_timer(void)
+void __cpuinit disable_APIC_timer(void)
 {
 	if (using_apic_timer) {
 		unsigned long v;
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c
index ba139cac57ce..d9b22b633e39 100644
--- a/arch/x86_64/kernel/i387.c
+++ b/arch/x86_64/kernel/i387.c
@@ -42,7 +42,7 @@ void mxcsr_feature_mask_init(void)
  * Called at bootup to set up the initial FPU state that is later cloned
  * into all processes.
  */
-void __init fpu_init(void)
+void __cpuinit fpu_init(void)
 {
 	unsigned long oldcr0 = read_cr0();
 	extern void __bad_fxsave_alignment(void);
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 3a89d735a4f6..7ab15c8ab95f 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -327,7 +327,7 @@ static void mce_init(void *dummy)
 }
 
 /* Add per CPU specific workarounds here */
-static void __init mce_cpu_quirks(struct cpuinfo_x86 *c) 
+static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
 { 
 	/* This should be disabled by the BIOS, but isn't always */
 	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
@@ -337,7 +337,7 @@ static void __init mce_cpu_quirks(struct cpuinfo_x86 *c)
 	}
 }			
 
-static void __init mce_cpu_features(struct cpuinfo_x86 *c)
+static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
 {
 	switch (c->x86_vendor) {
 	case X86_VENDOR_INTEL:
@@ -352,7 +352,7 @@ static void __init mce_cpu_features(struct cpuinfo_x86 *c)
  * Called for each booted CPU to set up machine checks.
  * Must be called with preempt off. 
  */
-void __init mcheck_init(struct cpuinfo_x86 *c)
+void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
 {
 	static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;
 
@@ -542,7 +542,7 @@ ACCESSOR(bank4ctl,bank[4],mce_restart())
 ACCESSOR(tolerant,tolerant,)
 ACCESSOR(check_interval,check_interval,mce_restart())
 
-static __init int mce_init_device(void)
+static __cpuinit int mce_init_device(void)
 {
 	int err;
 	if (!mce_available(&boot_cpu_data))
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
index 4db9a640069f..0be0a7959814 100644
--- a/arch/x86_64/kernel/mce_intel.c
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -42,7 +42,7 @@ done:
 	irq_exit();
 }
 
-static void __init intel_init_thermal(struct cpuinfo_x86 *c)
+static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 	int tm2 = 0;
@@ -93,7 +93,7 @@ static void __init intel_init_thermal(struct cpuinfo_x86 *c)
 	return;
 }
 
-void __init mce_intel_feature_init(struct cpuinfo_x86 *c)
+void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
 	intel_init_thermal(c);
 }
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 31c0f2e6ac91..4e44d6e6b7e5 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -98,7 +98,7 @@ static unsigned int nmi_p4_cccr_val;
 	(P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT|	\
 	 P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
 
-static __init inline int nmi_known_cpu(void)
+static __cpuinit inline int nmi_known_cpu(void)
 {
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_AMD:
@@ -110,7 +110,7 @@ static __init inline int nmi_known_cpu(void)
 }
 
 /* Run after command line and cpu_init init, but before all other checks */
-void __init nmi_watchdog_default(void)
+void __cpuinit nmi_watchdog_default(void)
 {
 	if (nmi_watchdog != NMI_DEFAULT)
 		return;
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index e59d1f9d6163..15fa35a81a72 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -204,7 +204,7 @@ static void mwait_idle(void)
 	}
 }
 
-void __init select_idle_routine(const struct cpuinfo_x86 *c)
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 {
 	static int printed;
 	if (cpu_has(c, X86_FEATURE_MWAIT)) {
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 000015dd5a8b..ca9a915a6d4a 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -676,7 +676,7 @@ void __init setup_arch(char **cmdline_p)
 #endif
 }
 
-static int __init get_model_name(struct cpuinfo_x86 *c)
+static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
 {
 	unsigned int *v;
 
@@ -692,7 +692,7 @@ static int __init get_model_name(struct cpuinfo_x86 *c)
 }
 
 
-static void __init display_cacheinfo(struct cpuinfo_x86 *c)
+static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
 {
 	unsigned int n, dummy, eax, ebx, ecx, edx;
 
@@ -803,7 +803,7 @@ static int __init init_amd(struct cpuinfo_x86 *c)
 	return r;
 }
 
-static void __init detect_ht(struct cpuinfo_x86 *c)
+static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
 	u32 	eax, ebx, ecx, edx;
@@ -864,7 +864,7 @@ static void __init detect_ht(struct cpuinfo_x86 *c)
 /*
  * find out the number of processor cores on the die
  */
-static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c)
+static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
 {
 	unsigned int eax;
 
@@ -882,7 +882,7 @@ static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c)
 		return 1;
 }
 
-static void __init init_intel(struct cpuinfo_x86 *c)
+static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 {
 	/* Cache sizes */
 	unsigned n;
@@ -902,7 +902,7 @@ static void __init init_intel(struct cpuinfo_x86 *c)
  	c->x86_num_cores = intel_num_cpu_cores(c);
 }
 
-void __init get_cpu_vendor(struct cpuinfo_x86 *c)
+void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 {
 	char *v = c->x86_vendor_id;
 
@@ -923,7 +923,7 @@ struct cpu_model_info {
 /* Do some early cpuid on the boot CPU to get some parameter that are
    needed before check_bugs. Everything advanced is in identify_cpu
    below. */
-void __init early_identify_cpu(struct cpuinfo_x86 *c)
+void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
 {
 	u32 tfms;
 
@@ -977,7 +977,7 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c)
 /*
  * This does the hard work of actually picking apart the CPU stuff...
  */
-void __init identify_cpu(struct cpuinfo_x86 *c)
+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 {
 	int i;
 	u32 xlvl;
@@ -1054,7 +1054,7 @@ void __init identify_cpu(struct cpuinfo_x86 *c)
 }
  
 
-void __init print_cpu_info(struct cpuinfo_x86 *c)
+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 {
 	if (c->x86_model_id[0])
 		printk("%s", c->x86_model_id);
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 678b7ac33b8b..f2b9c6bc999a 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -29,7 +29,7 @@
 
 char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
 
-cpumask_t cpu_initialized __initdata = CPU_MASK_NONE;
+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
 
 struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; 
 
@@ -171,7 +171,7 @@ void syscall_init(void)
 	wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); 
 }
 
-void __init check_efer(void)
+void __cpuinit check_efer(void)
 {
 	unsigned long efer;
 
@@ -188,7 +188,7 @@ void __init check_efer(void)
  * 'CPU state barrier', nothing should get across.
  * A lot of state is already set up in PDA init.
  */
-void __init cpu_init (void)
+void __cpuinit cpu_init (void)
 {
 #ifdef CONFIG_SMP
 	int cpu = stack_smp_processor_id();
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index f1ec0f345941..bc98a6722cba 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -58,11 +58,6 @@
 #include <asm/proto.h>
 #include <asm/nmi.h>
 
-/* Change for real CPU hotplug. Note other files need to be fixed
-   first too. */
-#define __cpuinit __init
-#define __cpuinitdata __initdata
-
 /* Number of siblings per CPU package */
 int smp_num_siblings = 1;
 /* Package ID of each logical CPU */
@@ -823,7 +818,7 @@ static __cpuinit void smp_cleanup_boot(void)
  *
  * RED-PEN audit/test this more. I bet there is more state messed up here.
  */
-static __cpuinit void disable_smp(void)
+static __init void disable_smp(void)
 {
 	cpu_present_map = cpumask_of_cpu(0);
 	cpu_possible_map = cpumask_of_cpu(0);
@@ -838,7 +833,7 @@ static __cpuinit void disable_smp(void)
 /*
  * Handle user cpus=... parameter.
  */
-static __cpuinit void enforce_max_cpus(unsigned max_cpus)
+static __init void enforce_max_cpus(unsigned max_cpus)
 {
 	int i, k;
 	k = 0;
@@ -855,7 +850,7 @@ static __cpuinit void enforce_max_cpus(unsigned max_cpus)
 /*
  * Various sanity checks.
  */
-static int __cpuinit smp_sanity_check(unsigned max_cpus)
+static int __init smp_sanity_check(unsigned max_cpus)
 {
 	if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
 		printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
@@ -913,7 +908,7 @@ static int __cpuinit smp_sanity_check(unsigned max_cpus)
  * Prepare for SMP bootup.  The MP table or ACPI has been read
  * earlier.  Just do some sanity checking here and enable APIC mode.
  */
-void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
+void __init smp_prepare_cpus(unsigned int max_cpus)
 {
 	int i;
 
@@ -1019,7 +1014,7 @@ int __cpuinit __cpu_up(unsigned int cpu)
 /*
  * Finish the SMP boot.
  */
-void __cpuinit smp_cpus_done(unsigned int max_cpus)
+void __init smp_cpus_done(unsigned int max_cpus)
 {
 	zap_low_mappings();
 	smp_cleanup_boot();
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 84cde796ecb1..ac61c186eb02 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -251,7 +251,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
 }
 
-__init void numa_add_cpu(int cpu)
+__cpuinit void numa_add_cpu(int cpu)
 {
 	/* BP is initialized elsewhere */
 	if (cpu) 
diff --git a/include/linux/init.h b/include/linux/init.h
index 05c83e0521ca..59008c3826cf 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -229,6 +229,18 @@ void __init parse_early_param(void);
 #define __devexitdata __exitdata
 #endif
 
+#ifdef CONFIG_HOTPLUG_CPU
+#define __cpuinit
+#define __cpuinitdata
+#define __cpuexit
+#define __cpuexitdata
+#else
+#define __cpuinit	__init
+#define __cpuinitdata __initdata
+#define __cpuexit __exit
+#define __cpuexitdata	__exitdata
+#endif
+
 /* Functions marked as __devexit may be discarded at kernel link time, depending
    on config options.  Newer versions of binutils detect references from
    retained sections to discarded sections and flag an error.  Pointers to
-- 
cgit v1.2.3-59-g8ed1b


From 5a72e04df5470df0ec646029d31e5528167ab1a7 Mon Sep 17 00:00:00 2001
From: Li Shaohua <shaohua.li@intel.com>
Date: Sat, 25 Jun 2005 14:55:06 -0700
Subject: [PATCH] suspend/resume SMP support

Using CPU hotplug to support suspend/resume SMP.  Both S3 and S4 use
disable/enable_nonboot_cpus API.  The S4 part is based on Pavel's original S4
SMP patch.

Signed-off-by: Li Shaohua<shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpu/mcheck/k7.c      |  2 +-
 arch/i386/kernel/cpu/mcheck/mce.c     |  2 +-
 arch/i386/kernel/cpu/mcheck/p4.c      |  4 +-
 arch/i386/kernel/cpu/mcheck/p6.c      |  2 +-
 arch/i386/kernel/cpu/mcheck/winchip.c |  2 +-
 drivers/acpi/Kconfig                  |  2 +-
 include/linux/suspend.h               |  2 +-
 kernel/power/Kconfig                  |  6 ++-
 kernel/power/Makefile                 |  6 +--
 kernel/power/disk.c                   | 35 +++++++-------
 kernel/power/main.c                   | 16 ++++---
 kernel/power/smp.c                    | 89 +++++++++++++----------------------
 kernel/power/swsusp.c                 |  2 +
 13 files changed, 80 insertions(+), 90 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/cpu/mcheck/k7.c b/arch/i386/kernel/cpu/mcheck/k7.c
index 8df52e86c4d2..c4abe7657397 100644
--- a/arch/i386/kernel/cpu/mcheck/k7.c
+++ b/arch/i386/kernel/cpu/mcheck/k7.c
@@ -69,7 +69,7 @@ static fastcall void k7_machine_check(struct pt_regs * regs, long error_code)
 
 
 /* AMD K7 machine check is Intel like */
-void __init amd_mcheck_init(struct cpuinfo_x86 *c)
+void __devinit amd_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 	int i;
diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c
index 7218a7341fbc..2cf25d2ba0f1 100644
--- a/arch/i386/kernel/cpu/mcheck/mce.c
+++ b/arch/i386/kernel/cpu/mcheck/mce.c
@@ -16,7 +16,7 @@
 
 #include "mce.h"
 
-int mce_disabled __initdata = 0;
+int mce_disabled __devinitdata = 0;
 int nr_mce_banks;
 
 EXPORT_SYMBOL_GPL(nr_mce_banks);	/* non-fatal.o */
diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c
index 8b16ceb929b4..0abccb6fdf9e 100644
--- a/arch/i386/kernel/cpu/mcheck/p4.c
+++ b/arch/i386/kernel/cpu/mcheck/p4.c
@@ -78,7 +78,7 @@ fastcall void smp_thermal_interrupt(struct pt_regs *regs)
 }
 
 /* P4/Xeon Thermal regulation detect and init */
-static void __init intel_init_thermal(struct cpuinfo_x86 *c)
+static void __devinit intel_init_thermal(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 	unsigned int cpu = smp_processor_id();
@@ -232,7 +232,7 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
 }
 
 
-void __init intel_p4_mcheck_init(struct cpuinfo_x86 *c)
+void __devinit intel_p4_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 	int i;
diff --git a/arch/i386/kernel/cpu/mcheck/p6.c b/arch/i386/kernel/cpu/mcheck/p6.c
index 46640f8c2494..f01b73f947e1 100644
--- a/arch/i386/kernel/cpu/mcheck/p6.c
+++ b/arch/i386/kernel/cpu/mcheck/p6.c
@@ -80,7 +80,7 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code)
 }
 
 /* Set up machine check reporting for processors with Intel style MCE */
-void __init intel_p6_mcheck_init(struct cpuinfo_x86 *c)
+void __devinit intel_p6_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 	int i;
diff --git a/arch/i386/kernel/cpu/mcheck/winchip.c b/arch/i386/kernel/cpu/mcheck/winchip.c
index 753fa7acb984..7bae68fa168f 100644
--- a/arch/i386/kernel/cpu/mcheck/winchip.c
+++ b/arch/i386/kernel/cpu/mcheck/winchip.c
@@ -23,7 +23,7 @@ static fastcall void winchip_machine_check(struct pt_regs * regs, long error_cod
 }
 
 /* Set up machine check reporting on the Winchip C6 series */
-void __init winchip_mcheck_init(struct cpuinfo_x86 *c)
+void __devinit winchip_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 lo, hi;
 	machine_check_vector = winchip_machine_check;
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index 670fdb5142d1..86c52520ed34 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -55,7 +55,7 @@ if ACPI_INTERPRETER
 
 config ACPI_SLEEP
 	bool "Sleep States (EXPERIMENTAL)"
-	depends on X86
+	depends on X86 && (!SMP || SUSPEND_SMP)
 	depends on EXPERIMENTAL && PM
 	default y
 	---help---
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 2bf0d5fabcdb..f2e96fdfaae0 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -58,7 +58,7 @@ static inline int software_suspend(void)
 }
 #endif
 
-#ifdef CONFIG_SMP
+#ifdef CONFIG_SUSPEND_SMP
 extern void disable_nonboot_cpus(void);
 extern void enable_nonboot_cpus(void);
 #else
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 696387ffe49c..fdb377636505 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -28,7 +28,7 @@ config PM_DEBUG
 
 config SOFTWARE_SUSPEND
 	bool "Software Suspend (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && PM && SWAP
+	depends on EXPERIMENTAL && PM && SWAP && (SUSPEND_SMP || !SMP)
 	---help---
 	  Enable the possibility of suspending the machine.
 	  It doesn't need APM.
@@ -72,3 +72,7 @@ config PM_STD_PARTITION
 	  suspended image to. It will simply pick the first available swap 
 	  device.
 
+config SUSPEND_SMP
+	bool
+	depends on HOTPLUG_CPU && X86 && PM
+	default y
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index fbdc634135a7..2f438d0eaa13 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -3,9 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y)
 EXTRA_CFLAGS	+=	-DDEBUG
 endif
 
-swsusp-smp-$(CONFIG_SMP)	+= smp.o
-
 obj-y				:= main.o process.o console.o pm.o
-obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o $(swsusp-smp-y) disk.o
+obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o
+
+obj-$(CONFIG_SUSPEND_SMP)	+= smp.o
 
 obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 02b6764034dc..fb8de63c2919 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -117,8 +117,8 @@ static void finish(void)
 {
 	device_resume();
 	platform_finish();
-	enable_nonboot_cpus();
 	thaw_processes();
+	enable_nonboot_cpus();
 	pm_restore_console();
 }
 
@@ -131,28 +131,35 @@ static int prepare_processes(void)
 
 	sys_sync();
 
+	disable_nonboot_cpus();
+
 	if (freeze_processes()) {
 		error = -EBUSY;
-		return error;
+		goto thaw;
 	}
 
 	if (pm_disk_mode == PM_DISK_PLATFORM) {
 		if (pm_ops && pm_ops->prepare) {
 			if ((error = pm_ops->prepare(PM_SUSPEND_DISK)))
-				return error;
+				goto thaw;
 		}
 	}
 
 	/* Free memory before shutting down devices. */
 	free_some_memory();
-
 	return 0;
+thaw:
+	thaw_processes();
+	enable_nonboot_cpus();
+	pm_restore_console();
+	return error;
 }
 
 static void unprepare_processes(void)
 {
-	enable_nonboot_cpus();
+	platform_finish();
 	thaw_processes();
+	enable_nonboot_cpus();
 	pm_restore_console();
 }
 
@@ -160,15 +167,9 @@ static int prepare_devices(void)
 {
 	int error;
 
-	disable_nonboot_cpus();
-	if ((error = device_suspend(PMSG_FREEZE))) {
+	if ((error = device_suspend(PMSG_FREEZE)))
 		printk("Some devices failed to suspend\n");
-		platform_finish();
-		enable_nonboot_cpus();
-		return error;
-	}
-
-	return 0;
+	return error;
 }
 
 /**
@@ -185,9 +186,9 @@ int pm_suspend_disk(void)
 	int error;
 
 	error = prepare_processes();
-	if (!error) {
-		error = prepare_devices();
-	}
+	if (error)
+		return error;
+	error = prepare_devices();
 
 	if (error) {
 		unprepare_processes();
@@ -250,7 +251,7 @@ static int software_resume(void)
 
 	if ((error = prepare_processes())) {
 		swsusp_close();
-		goto Cleanup;
+		goto Done;
 	}
 
 	pr_debug("PM: Reading swsusp image.\n");
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 4cdebc972ff2..c94cb9e95090 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -55,6 +55,13 @@ static int suspend_prepare(suspend_state_t state)
 
 	pm_prepare_console();
 
+	disable_nonboot_cpus();
+
+	if (num_online_cpus() != 1) {
+		error = -EPERM;
+		goto Enable_cpu;
+	}
+
 	if (freeze_processes()) {
 		error = -EAGAIN;
 		goto Thaw;
@@ -75,6 +82,8 @@ static int suspend_prepare(suspend_state_t state)
 		pm_ops->finish(state);
  Thaw:
 	thaw_processes();
+ Enable_cpu:
+	enable_nonboot_cpus();
 	pm_restore_console();
 	return error;
 }
@@ -113,6 +122,7 @@ static void suspend_finish(suspend_state_t state)
 	if (pm_ops && pm_ops->finish)
 		pm_ops->finish(state);
 	thaw_processes();
+	enable_nonboot_cpus();
 	pm_restore_console();
 }
 
@@ -150,12 +160,6 @@ static int enter_state(suspend_state_t state)
 		goto Unlock;
 	}
 
-	/* Suspend is hard to get right on SMP. */
-	if (num_online_cpus() != 1) {
-		error = -EPERM;
-		goto Unlock;
-	}
-
 	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
 	if ((error = suspend_prepare(state)))
 		goto Unlock;
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index 457c2302ed42..bbe23079c62c 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -13,73 +13,52 @@
 #include <linux/interrupt.h>
 #include <linux/suspend.h>
 #include <linux/module.h>
+#include <linux/cpu.h>
 #include <asm/atomic.h>
 #include <asm/tlbflush.h>
 
-static atomic_t cpu_counter, freeze;
-
-
-static void smp_pause(void * data)
-{
-	struct saved_context ctxt;
-	__save_processor_state(&ctxt);
-	printk("Sleeping in:\n");
-	dump_stack();
-	atomic_inc(&cpu_counter);
-	while (atomic_read(&freeze)) {
-		/* FIXME: restore takes place at random piece inside this.
-		   This should probably be written in assembly, and
-		   preserve general-purpose registers, too
-
-		   What about stack? We may need to move to new stack here.
-
-		   This should better be ran with interrupts disabled.
-		 */
-		cpu_relax();
-		barrier();
-	}
-	atomic_dec(&cpu_counter);
-	__restore_processor_state(&ctxt);
-}
-
-static cpumask_t oldmask;
+/* This is protected by pm_sem semaphore */
+static cpumask_t frozen_cpus;
 
 void disable_nonboot_cpus(void)
 {
-	oldmask = current->cpus_allowed;
-	set_cpus_allowed(current, cpumask_of_cpu(0));
-	printk("Freezing CPUs (at %d)", raw_smp_processor_id());
-	current->state = TASK_INTERRUPTIBLE;
-	schedule_timeout(HZ);
-	printk("...");
-	BUG_ON(raw_smp_processor_id() != 0);
-
-	/* FIXME: for this to work, all the CPUs must be running
-	 * "idle" thread (or we deadlock). Is that guaranteed? */
+	int cpu, error;
 
-	atomic_set(&cpu_counter, 0);
-	atomic_set(&freeze, 1);
-	smp_call_function(smp_pause, NULL, 0, 0);
-	while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) {
-		cpu_relax();
-		barrier();
+	error = 0;
+	cpus_clear(frozen_cpus);
+	printk("Freezing cpus ...\n");
+	for_each_online_cpu(cpu) {
+		if (cpu == 0)
+			continue;
+		error = cpu_down(cpu);
+		if (!error) {
+			cpu_set(cpu, frozen_cpus);
+			printk("CPU%d is down\n", cpu);
+			continue;
+		}
+		printk("Error taking cpu %d down: %d\n", cpu, error);
 	}
-	printk("ok\n");
+	BUG_ON(smp_processor_id() != 0);
+	if (error)
+		panic("cpus not sleeping");
 }
 
 void enable_nonboot_cpus(void)
 {
-	printk("Restarting CPUs");
-	atomic_set(&freeze, 0);
-	while (atomic_read(&cpu_counter)) {
-		cpu_relax();
-		barrier();
-	}
-	printk("...");
-	set_cpus_allowed(current, oldmask);
-	schedule();
-	printk("ok\n");
+	int cpu, error;
 
+	printk("Thawing cpus ...\n");
+	for_each_cpu_mask(cpu, frozen_cpus) {
+		error = smp_prepare_cpu(cpu);
+		if (!error)
+			error = cpu_up(cpu);
+		if (!error) {
+			printk("CPU%d is up\n", cpu);
+			continue;
+		}
+		printk("Error taking cpu %d up: %d\n", cpu, error);
+		panic("Not enough cpus");
+	}
+	cpus_clear(frozen_cpus);
 }
 
-
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 53f9f8720ee4..339b5c3735bd 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1193,8 +1193,10 @@ static const char * sanity_check(void)
 		return "version";
 	if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
 		return "machine";
+#if 0
 	if(swsusp_info.cpus != num_online_cpus())
 		return "number of cpus";
+#endif
 	return NULL;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 620b03276488c3cf103caf1e326bd21f00d3df84 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Sat, 25 Jun 2005 14:55:11 -0700
Subject: [PATCH] properly stop devices before poweroff

Without this patch, Linux provokes emergency disk shutdowns and
similar nastiness. It was in SuSE kernels for some time, IIRC.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pm.h | 33 +++++++++++++++++++++------------
 kernel/sys.c       |  3 +++
 2 files changed, 24 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm.h b/include/linux/pm.h
index ed2b76e75199..14479325e3f3 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -103,7 +103,8 @@ extern int pm_active;
 /*
  * Register a device with power management
  */
-struct pm_dev __deprecated *pm_register(pm_dev_t type, unsigned long id, pm_callback callback);
+struct pm_dev __deprecated *
+pm_register(pm_dev_t type, unsigned long id, pm_callback callback);
 
 /*
  * Unregister a device with power management
@@ -190,17 +191,18 @@ typedef u32 __bitwise pm_message_t;
 /*
  * There are 4 important states driver can be in:
  * ON     -- driver is working
- * FREEZE -- stop operations and apply whatever policy is applicable to a suspended driver
- *           of that class, freeze queues for block like IDE does, drop packets for
- *           ethernet, etc... stop DMA engine too etc... so a consistent image can be
- *           saved; but do not power any hardware down.
- * SUSPEND - like FREEZE, but hardware is doing as much powersaving as possible. Roughly
- *           pci D3.
+ * FREEZE -- stop operations and apply whatever policy is applicable to a
+ *           suspended driver of that class, freeze queues for block like IDE
+ *           does, drop packets for ethernet, etc... stop DMA engine too etc...
+ *           so a consistent image can be saved; but do not power any hardware
+ *           down.
+ * SUSPEND - like FREEZE, but hardware is doing as much powersaving as
+ *           possible. Roughly pci D3.
  *
- * Unfortunately, current drivers only recognize numeric values 0 (ON) and 3 (SUSPEND).
- * We'll need to fix the drivers. So yes, putting 3 to all diferent defines is intentional,
- * and will go away as soon as drivers are fixed. Also note that typedef is neccessary,
- * we'll probably want to switch to
+ * Unfortunately, current drivers only recognize numeric values 0 (ON) and 3
+ * (SUSPEND).  We'll need to fix the drivers. So yes, putting 3 to all different
+ * defines is intentional, and will go away as soon as drivers are fixed.  Also
+ * note that typedef is necessary, we'll probably want to switch to
  *   typedef struct pm_message_t { int event; int flags; } pm_message_t
  * or something similar soon.
  */
@@ -222,11 +224,18 @@ struct dev_pm_info {
 
 extern void device_pm_set_parent(struct device * dev, struct device * parent);
 
-extern int device_suspend(pm_message_t state);
 extern int device_power_down(pm_message_t state);
 extern void device_power_up(void);
 extern void device_resume(void);
 
+#ifdef CONFIG_PM
+extern int device_suspend(pm_message_t state);
+#else
+static inline int device_suspend(pm_message_t state)
+{
+	return 0;
+}
+#endif
 
 #endif /* __KERNEL__ */
 
diff --git a/kernel/sys.c b/kernel/sys.c
index da24bc1292db..dac10161ca23 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -405,6 +405,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 	case LINUX_REBOOT_CMD_HALT:
 		notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
 		system_state = SYSTEM_HALT;
+		device_suspend(PMSG_SUSPEND);
 		device_shutdown();
 		printk(KERN_EMERG "System halted.\n");
 		machine_halt();
@@ -415,6 +416,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 	case LINUX_REBOOT_CMD_POWER_OFF:
 		notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
 		system_state = SYSTEM_POWER_OFF;
+		device_suspend(PMSG_SUSPEND);
 		device_shutdown();
 		printk(KERN_EMERG "Power down.\n");
 		machine_power_off();
@@ -431,6 +433,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 
 		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
 		system_state = SYSTEM_RESTART;
+		device_suspend(PMSG_FREEZE);
 		device_shutdown();
 		printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
 		machine_restart(buffer);
-- 
cgit v1.2.3-59-g8ed1b


From b2b18660066997420b716c1881a6be8b82700d97 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@us.ibm.com>
Date: Sat, 25 Jun 2005 14:55:38 -0700
Subject: [PATCH] RCU: clean up a few remaining synchronize_kernel() calls

2.6.12-rc6-mm1 has a few remaining synchronize_kernel()s, some (but not
all) in comments.  This patch changes these synchronize_kernel() calls (and
comments) to synchronize_rcu() or synchronize_sched() as follows:

- arch/x86_64/kernel/mce.c mce_read(): change to synchronize_sched() to
  handle races with machine-check exceptions (synchronize_rcu() would not cut
  it given RCU implementations intended for hardcore realtime use).

- drivers/input/serio/i8042.c i8042_stop(): change to synchronize_sched() to
  handle races with i8042_interrupt() interrupt handler.  Again,
  synchronize_rcu() would not cut it given RCU implementations intended for
  hardcore realtime use.

- include/*/kdebug.h comments: change to synchronize_sched() to handle races
  with NMIs.  As before, synchronize_rcu() would not cut it...

- include/linux/list.h comment: change to synchronize_rcu(), since this
  comment is for list_del_rcu().

- security/keys/key.c unregister_key_type(): change to synchronize_rcu(),
  since this is interacting with RCU read side.

- security/keys/process_keys.c install_session_keyring(): change to
  synchronize_rcu(), since this is interacting with RCU read side.

Signed-off-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/mce.c     | 2 +-
 drivers/input/serio/i8042.c  | 2 +-
 include/asm-i386/kdebug.h    | 2 +-
 include/asm-ppc64/kdebug.h   | 2 +-
 include/asm-sparc64/kdebug.h | 2 +-
 include/asm-x86_64/kdebug.h  | 2 +-
 include/linux/list.h         | 2 +-
 security/keys/key.c          | 2 +-
 security/keys/process_keys.c | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 7ab15c8ab95f..21e70625a495 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -411,7 +411,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff
 	memset(mcelog.entry, 0, next * sizeof(struct mce));
 	mcelog.next = 0;
 
-	synchronize_kernel();	
+	synchronize_sched();
 
 	/* Collect entries that were still getting written before the synchronize. */
 
diff --git a/drivers/input/serio/i8042.c b/drivers/input/serio/i8042.c
index 5900de3c3f4f..a9bf549c8dc5 100644
--- a/drivers/input/serio/i8042.c
+++ b/drivers/input/serio/i8042.c
@@ -396,7 +396,7 @@ static void i8042_stop(struct serio *serio)
 	struct i8042_port *port = serio->port_data;
 
 	port->exists = 0;
-	synchronize_kernel();
+	synchronize_sched();
 	port->serio = NULL;
 }
 
diff --git a/include/asm-i386/kdebug.h b/include/asm-i386/kdebug.h
index de6498b0d493..b3f8d5f59d5d 100644
--- a/include/asm-i386/kdebug.h
+++ b/include/asm-i386/kdebug.h
@@ -18,7 +18,7 @@ struct die_args {
 };
 
 /* Note - you should never unregister because that can race with NMIs.
-   If you really want to do it first unregister - then synchronize_kernel - then free.
+   If you really want to do it first unregister - then synchronize_sched - then free.
   */
 int register_die_notifier(struct notifier_block *nb);
 extern struct notifier_block *i386die_chain;
diff --git a/include/asm-ppc64/kdebug.h b/include/asm-ppc64/kdebug.h
index 488634258a72..d383d161cf8d 100644
--- a/include/asm-ppc64/kdebug.h
+++ b/include/asm-ppc64/kdebug.h
@@ -17,7 +17,7 @@ struct die_args {
 
 /*
    Note - you should never unregister because that can race with NMIs.
-   If you really want to do it first unregister - then synchronize_kernel -
+   If you really want to do it first unregister - then synchronize_sched -
    then free.
  */
 int register_die_notifier(struct notifier_block *nb);
diff --git a/include/asm-sparc64/kdebug.h b/include/asm-sparc64/kdebug.h
index f70d3dad01f9..6321f5a0198d 100644
--- a/include/asm-sparc64/kdebug.h
+++ b/include/asm-sparc64/kdebug.h
@@ -16,7 +16,7 @@ struct die_args {
 };
 
 /* Note - you should never unregister because that can race with NMIs.
- * If you really want to do it first unregister - then synchronize_kernel
+ * If you really want to do it first unregister - then synchronize_sched
  * - then free.
  */
 int register_die_notifier(struct notifier_block *nb);
diff --git a/include/asm-x86_64/kdebug.h b/include/asm-x86_64/kdebug.h
index 6277f75cbb4b..b90341994d80 100644
--- a/include/asm-x86_64/kdebug.h
+++ b/include/asm-x86_64/kdebug.h
@@ -14,7 +14,7 @@ struct die_args {
 }; 
 
 /* Note - you should never unregister because that can race with NMIs.
-   If you really want to do it first unregister - then synchronize_kernel - then free. 
+   If you really want to do it first unregister - then synchronize_sched - then free.
   */
 int register_die_notifier(struct notifier_block *nb);
 extern struct notifier_block *die_chain;
diff --git a/include/linux/list.h b/include/linux/list.h
index 399b51d17218..aab2db21b013 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -185,7 +185,7 @@ static inline void list_del(struct list_head *entry)
  * list_for_each_entry_rcu().
  *
  * Note that the caller is not permitted to immediately free
- * the newly deleted entry.  Instead, either synchronize_kernel()
+ * the newly deleted entry.  Instead, either synchronize_rcu()
  * or call_rcu() must be used to defer freeing until an RCU
  * grace period has elapsed.
  */
diff --git a/security/keys/key.c b/security/keys/key.c
index 3304d37bb379..fb89f9844465 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -980,7 +980,7 @@ void unregister_key_type(struct key_type *ktype)
 	spin_unlock(&key_serial_lock);
 
 	/* make sure everyone revalidates their keys */
-	synchronize_kernel();
+	synchronize_rcu();
 
 	/* we should now be able to destroy the payloads of all the keys of
 	 * this type with impunity */
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 34db087bbcc7..9b0369c5a223 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -234,7 +234,7 @@ static int install_session_keyring(struct task_struct *tsk,
 	ret = 0;
 
 	/* we're using RCU on the pointer */
-	synchronize_kernel();
+	synchronize_rcu();
 	key_put(old);
  error:
 	return ret;
-- 
cgit v1.2.3-59-g8ed1b


From 7897986bad8f6cd50d6149345aca7f6480f49464 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:13 -0700
Subject: [PATCH] sched: balance timers

Do CPU load averaging over a number of different intervals.  Allow each
interval to be chosen by sending a parameter to source_load and target_load.
0 is instantaneous, idx > 0 returns a decaying average with the most recent
sample weighted at 1/2^(idx-1), up to a maximum idx of 3 (could be easily
increased).

So generally a higher number will result in more conservative balancing.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/topology.h   |   4 ++
 include/asm-x86_64/topology.h |   6 +-
 include/linux/sched.h         |   4 ++
 include/linux/topology.h      |   8 +++
 kernel/sched.c                | 138 ++++++++++++++++++++++--------------------
 5 files changed, 95 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index 6d0f67507b21..0055fbfeec7b 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -74,6 +74,10 @@ static inline int node_to_first_cpu(int node)
 	.imbalance_pct		= 125,			\
 	.cache_hot_time		= (10*1000000),		\
 	.cache_nice_tries	= 1,			\
+	.busy_idx		= 3,			\
+	.idle_idx		= 1,			\
+	.newidle_idx		= 2,			\
+	.wake_idx		= 1,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index 8f77e9f6bc23..fe8d80a15751 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -39,7 +39,11 @@ extern int __node_distance(int, int);
 	.busy_factor		= 32,			\
 	.imbalance_pct		= 125,			\
 	.cache_hot_time		= (10*1000000),		\
-	.cache_nice_tries	= 1,			\
+	.cache_nice_tries	= 2,			\
+	.busy_idx		= 3,			\
+	.idle_idx		= 2,			\
+	.newidle_idx		= 1, 			\
+	.wake_idx		= 1,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c69682b0444..664981ac1fb6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -488,6 +488,10 @@ struct sched_domain {
 	unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
 	unsigned int cache_nice_tries;	/* Leave cache hot tasks for # tries */
 	unsigned int per_cpu_gain;	/* CPU % gained by adding domain cpus */
+	unsigned int busy_idx;
+	unsigned int idle_idx;
+	unsigned int newidle_idx;
+	unsigned int wake_idx;
 	int flags;			/* See SD_* */
 
 	/* Runtime fields. */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index d70e8972c67f..ae9c2216dfa6 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -89,6 +89,10 @@
 	.cache_hot_time		= 0,			\
 	.cache_nice_tries	= 0,			\
 	.per_cpu_gain		= 25,			\
+	.busy_idx		= 0,			\
+	.idle_idx		= 0,			\
+	.newidle_idx		= 0,			\
+	.wake_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
@@ -115,6 +119,10 @@
 	.cache_hot_time		= (5*1000000/2),	\
 	.cache_nice_tries	= 1,			\
 	.per_cpu_gain		= 100,			\
+	.busy_idx		= 2,			\
+	.idle_idx		= 0,			\
+	.newidle_idx		= 1,			\
+	.wake_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
diff --git a/kernel/sched.c b/kernel/sched.c
index f665de34ed82..b597b07e7911 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -206,7 +206,7 @@ struct runqueue {
 	 */
 	unsigned long nr_running;
 #ifdef CONFIG_SMP
-	unsigned long cpu_load;
+	unsigned long cpu_load[3];
 #endif
 	unsigned long long nr_switches;
 
@@ -886,23 +886,27 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static inline unsigned long source_load(int cpu)
+static inline unsigned long source_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+	if (type == 0)
+		return load_now;
 
-	return min(rq->cpu_load, load_now);
+	return min(rq->cpu_load[type-1], load_now);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu
  */
-static inline unsigned long target_load(int cpu)
+static inline unsigned long target_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+	if (type == 0)
+		return load_now;
 
-	return max(rq->cpu_load, load_now);
+	return max(rq->cpu_load[type-1], load_now);
 }
 
 #endif
@@ -967,7 +971,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 	runqueue_t *rq;
 #ifdef CONFIG_SMP
 	unsigned long load, this_load;
-	struct sched_domain *sd;
+	struct sched_domain *sd, *this_sd = NULL;
 	int new_cpu;
 #endif
 
@@ -986,72 +990,64 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
 	if (unlikely(task_running(rq, p)))
 		goto out_activate;
 
-#ifdef CONFIG_SCHEDSTATS
+	new_cpu = cpu;
+
 	schedstat_inc(rq, ttwu_cnt);
 	if (cpu == this_cpu) {
 		schedstat_inc(rq, ttwu_local);
-	} else {
-		for_each_domain(this_cpu, sd) {
-			if (cpu_isset(cpu, sd->span)) {
-				schedstat_inc(sd, ttwu_wake_remote);
-				break;
-			}
+		goto out_set_cpu;
+	}
+
+	for_each_domain(this_cpu, sd) {
+		if (cpu_isset(cpu, sd->span)) {
+			schedstat_inc(sd, ttwu_wake_remote);
+			this_sd = sd;
+			break;
 		}
 	}
-#endif
 
-	new_cpu = cpu;
-	if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
 		goto out_set_cpu;
 
-	load = source_load(cpu);
-	this_load = target_load(this_cpu);
-
 	/*
-	 * If sync wakeup then subtract the (maximum possible) effect of
-	 * the currently running task from the load of the current CPU:
+	 * Check for affine wakeup and passive balancing possibilities.
 	 */
-	if (sync)
-		this_load -= SCHED_LOAD_SCALE;
-
-	/* Don't pull the task off an idle CPU to a busy one */
-	if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
-		goto out_set_cpu;
+	if (this_sd) {
+		int idx = this_sd->wake_idx;
+		unsigned int imbalance;
 
-	new_cpu = this_cpu; /* Wake to this CPU if we can */
+		load = source_load(cpu, idx);
+		this_load = target_load(this_cpu, idx);
 
-	/*
-	 * Scan domains for affine wakeup and passive balancing
-	 * possibilities.
-	 */
-	for_each_domain(this_cpu, sd) {
-		unsigned int imbalance;
 		/*
-		 * Start passive balancing when half the imbalance_pct
-		 * limit is reached.
+		 * If sync wakeup then subtract the (maximum possible) effect of
+		 * the currently running task from the load of the current CPU:
 		 */
-		imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
+		if (sync)
+			this_load -= SCHED_LOAD_SCALE;
+
+		 /* Don't pull the task off an idle CPU to a busy one */
+		if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
+			goto out_set_cpu;
 
-		if ((sd->flags & SD_WAKE_AFFINE) &&
-				!task_hot(p, rq->timestamp_last_tick, sd)) {
+		new_cpu = this_cpu; /* Wake to this CPU if we can */
+
+		if ((this_sd->flags & SD_WAKE_AFFINE) &&
+			!task_hot(p, rq->timestamp_last_tick, this_sd)) {
 			/*
 			 * This domain has SD_WAKE_AFFINE and p is cache cold
 			 * in this domain.
 			 */
-			if (cpu_isset(cpu, sd->span)) {
-				schedstat_inc(sd, ttwu_move_affine);
-				goto out_set_cpu;
-			}
-		} else if ((sd->flags & SD_WAKE_BALANCE) &&
+			schedstat_inc(this_sd, ttwu_move_affine);
+			goto out_set_cpu;
+		} else if ((this_sd->flags & SD_WAKE_BALANCE) &&
 				imbalance*this_load <= 100*load) {
 			/*
 			 * This domain has SD_WAKE_BALANCE and there is
 			 * an imbalance.
 			 */
-			if (cpu_isset(cpu, sd->span)) {
-				schedstat_inc(sd, ttwu_move_balance);
-				goto out_set_cpu;
-			}
+			schedstat_inc(this_sd, ttwu_move_balance);
+			goto out_set_cpu;
 		}
 	}
 
@@ -1509,7 +1505,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
 	cpus_and(mask, sd->span, p->cpus_allowed);
 
 	for_each_cpu_mask(i, mask) {
-		load = target_load(i);
+		load = target_load(i, sd->wake_idx);
 
 		if (load < min_load) {
 			min_cpu = i;
@@ -1522,7 +1518,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
 	}
 
 	/* add +1 to account for the new task */
-	this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
+	this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE;
 
 	/*
 	 * Would with the addition of the new task to the
@@ -1767,8 +1763,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 {
 	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
 	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+	int load_idx;
 
 	max_load = this_load = total_load = total_pwr = 0;
+	if (idle == NOT_IDLE)
+		load_idx = sd->busy_idx;
+	else if (idle == NEWLY_IDLE)
+		load_idx = sd->newidle_idx;
+	else
+		load_idx = sd->idle_idx;
 
 	do {
 		unsigned long load;
@@ -1783,9 +1786,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		for_each_cpu_mask(i, group->cpumask) {
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = target_load(i);
+				load = target_load(i, load_idx);
 			else
-				load = source_load(i);
+				load = source_load(i, load_idx);
 
 			avg_load += load;
 		}
@@ -1895,7 +1898,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
-		load = source_load(i);
+		load = source_load(i, 0);
 
 		if (load > max_load) {
 			max_load = load;
@@ -2150,18 +2153,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
 	unsigned long old_load, this_load;
 	unsigned long j = jiffies + CPU_OFFSET(this_cpu);
 	struct sched_domain *sd;
+	int i;
 
-	/* Update our load */
-	old_load = this_rq->cpu_load;
 	this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
-	/*
-	 * Round up the averaging division if load is increasing. This
-	 * prevents us from getting stuck on 9 if the load is 10, for
-	 * example.
-	 */
-	if (this_load > old_load)
-		old_load++;
-	this_rq->cpu_load = (old_load + this_load) / 2;
+	/* Update our load */
+	for (i = 0; i < 3; i++) {
+		unsigned long new_load = this_load;
+		int scale = 1 << i;
+		old_load = this_rq->cpu_load[i];
+		/*
+		 * Round up the averaging division if load is increasing. This
+		 * prevents us from getting stuck on 9 if the load is 10, for
+		 * example.
+		 */
+		if (new_load > old_load)
+			new_load += scale-1;
+		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
+	}
 
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
@@ -4921,13 +4929,15 @@ void __init sched_init(void)
 
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
+		rq->nr_running = 0;
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
 		rq->best_expired_prio = MAX_PRIO;
 
 #ifdef CONFIG_SMP
 		rq->sd = &sched_domain_dummy;
-		rq->cpu_load = 0;
+		for (j = 1; j < 3; j++)
+			rq->cpu_load[j] = 0;
 		rq->active_balance = 0;
 		rq->push_cpu = 0;
 		rq->migration_thread = NULL;
-- 
cgit v1.2.3-59-g8ed1b


From cafb20c1f9976a70d633bb1e1c8c24eab00e4e80 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:17 -0700
Subject: [PATCH] sched: no aggressive idle balancing

Remove the very aggressive idle stuff that has recently gone into 2.6 - it is
going against the direction we are trying to go.  Hopefully we can regain
performance through other methods.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/topology.h   |  1 -
 include/asm-x86_64/topology.h |  1 -
 include/linux/topology.h      |  1 -
 kernel/sched.c                | 21 ++-------------------
 4 files changed, 2 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index 0055fbfeec7b..5eb6f61dcefc 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -82,7 +82,6 @@ static inline int node_to_first_cpu(int node)
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_NEWIDLE	\
-				| SD_WAKE_IDLE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index fe8d80a15751..9cb7459ce722 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -48,7 +48,6 @@ extern int __node_distance(int, int);
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
-				| SD_WAKE_IDLE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff --git a/include/linux/topology.h b/include/linux/topology.h
index ae9c2216dfa6..b23ec64df7f1 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -127,7 +127,6 @@
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
-				| SD_WAKE_IDLE		\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff --git a/kernel/sched.c b/kernel/sched.c
index 5ae3568eed0b..396724a2519f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -414,22 +414,6 @@ static inline runqueue_t *this_rq_lock(void)
 	return rq;
 }
 
-#ifdef CONFIG_SCHED_SMT
-static int cpu_and_siblings_are_idle(int cpu)
-{
-	int sib;
-	for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
-		if (idle_cpu(sib))
-			continue;
-		return 0;
-	}
-
-	return 1;
-}
-#else
-#define cpu_and_siblings_are_idle(A) idle_cpu(A)
-#endif
-
 #ifdef CONFIG_SCHEDSTATS
 /*
  * Called when a process is dequeued from the active array and given
@@ -1652,12 +1636,11 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
 
 	/*
 	 * Aggressive migration if:
-	 * 1) the [whole] cpu is idle, or
+	 * 1) task is cache cold, or
 	 * 2) too many balance attempts have failed.
 	 */
 
-	if (cpu_and_siblings_are_idle(this_cpu) || \
-			sd->nr_balance_failed > sd->cache_nice_tries)
+	if (sd->nr_balance_failed > sd->cache_nice_tries)
 		return 1;
 
 	if (task_hot(p, rq->timestamp_last_tick, sd))
-- 
cgit v1.2.3-59-g8ed1b


From 147cbb4bbe991452698f0772d8292f22825710ba Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:19 -0700
Subject: [PATCH] sched: balance on fork

Reimplement the balance on exec balancing to be sched-domains aware.  Use this
to also do balance on fork balancing.  Make x86_64 do balance on fork over the
NUMA domain.

The problem with the non-sched-domains-aware balancing became apparent on dual
core, multi socket Opterons.  What we want is for the new tasks to be sent to
a different socket, but more often than not, we would first load up our
sibling core, or fill two cores of a single remote socket before selecting a
new one.

This gives large improvements to STREAM on such systems.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-x86_64/topology.h |   2 +
 include/linux/sched.h         |  10 +--
 include/linux/topology.h      |   2 +
 kernel/sched.c                | 164 ++++++++++++++++++++++++++++--------------
 4 files changed, 119 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index 9cb7459ce722..802d09b9c99f 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -44,9 +44,11 @@ extern int __node_distance(int, int);
 	.idle_idx		= 2,			\
 	.newidle_idx		= 1, 			\
 	.wake_idx		= 1,			\
+	.forkexec_idx		= 1,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
+				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 664981ac1fb6..613491d3a875 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -460,10 +460,11 @@ enum idle_type
 #define SD_LOAD_BALANCE		1	/* Do load balancing on this domain. */
 #define SD_BALANCE_NEWIDLE	2	/* Balance when about to become idle */
 #define SD_BALANCE_EXEC		4	/* Balance on exec */
-#define SD_WAKE_IDLE		8	/* Wake to idle CPU on task wakeup */
-#define SD_WAKE_AFFINE		16	/* Wake task to waking CPU */
-#define SD_WAKE_BALANCE		32	/* Perform balancing at task wakeup */
-#define SD_SHARE_CPUPOWER	64	/* Domain members share cpu power */
+#define SD_BALANCE_FORK		8	/* Balance on fork, clone */
+#define SD_WAKE_IDLE		16	/* Wake to idle CPU on task wakeup */
+#define SD_WAKE_AFFINE		32	/* Wake task to waking CPU */
+#define SD_WAKE_BALANCE		64	/* Perform balancing at task wakeup */
+#define SD_SHARE_CPUPOWER	128	/* Domain members share cpu power */
 
 struct sched_group {
 	struct sched_group *next;	/* Must be a circular list */
@@ -492,6 +493,7 @@ struct sched_domain {
 	unsigned int idle_idx;
 	unsigned int newidle_idx;
 	unsigned int wake_idx;
+	unsigned int forkexec_idx;
 	int flags;			/* See SD_* */
 
 	/* Runtime fields. */
diff --git a/include/linux/topology.h b/include/linux/topology.h
index b23ec64df7f1..665597207def 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -93,6 +93,7 @@
 	.idle_idx		= 0,			\
 	.newidle_idx		= 0,			\
 	.wake_idx		= 0,			\
+	.forkexec_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
@@ -123,6 +124,7 @@
 	.idle_idx		= 0,			\
 	.newidle_idx		= 1,			\
 	.wake_idx		= 1,			\
+	.forkexec_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
diff --git a/kernel/sched.c b/kernel/sched.c
index 396724a2519f..7ecc237e2aab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -893,6 +893,79 @@ static inline unsigned long target_load(int cpu, int type)
 	return max(rq->cpu_load[type-1], load_now);
 }
 
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+{
+	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+	unsigned long min_load = ULONG_MAX, this_load = 0;
+	int load_idx = sd->forkexec_idx;
+	int imbalance = 100 + (sd->imbalance_pct-100)/2;
+
+	do {
+		unsigned long load, avg_load;
+		int local_group;
+		int i;
+
+		local_group = cpu_isset(this_cpu, group->cpumask);
+		/* XXX: put a cpus allowed check */
+
+		/* Tally up the load of all CPUs in the group */
+		avg_load = 0;
+
+		for_each_cpu_mask(i, group->cpumask) {
+			/* Bias balancing toward cpus of our domain */
+			if (local_group)
+				load = source_load(i, load_idx);
+			else
+				load = target_load(i, load_idx);
+
+			avg_load += load;
+		}
+
+		/* Adjust by relative CPU power of the group */
+		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+		if (local_group) {
+			this_load = avg_load;
+			this = group;
+		} else if (avg_load < min_load) {
+			min_load = avg_load;
+			idlest = group;
+		}
+		group = group->next;
+	} while (group != sd->groups);
+
+	if (!idlest || 100*this_load < imbalance*min_load)
+		return NULL;
+	return idlest;
+}
+
+/*
+ * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ */
+static int find_idlest_cpu(struct sched_group *group, int this_cpu)
+{
+	unsigned long load, min_load = ULONG_MAX;
+	int idlest = -1;
+	int i;
+
+	for_each_cpu_mask(i, group->cpumask) {
+		load = source_load(i, 0);
+
+		if (load < min_load || (load == min_load && i == this_cpu)) {
+			min_load = load;
+			idlest = i;
+		}
+	}
+
+	return idlest;
+}
+
+
 #endif
 
 /*
@@ -1107,11 +1180,6 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
 	return try_to_wake_up(p, state, 0);
 }
 
-#ifdef CONFIG_SMP
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
-			   struct sched_domain *sd);
-#endif
-
 /*
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
@@ -1181,12 +1249,38 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
 	unsigned long flags;
 	int this_cpu, cpu;
 	runqueue_t *rq, *this_rq;
+#ifdef CONFIG_SMP
+	struct sched_domain *tmp, *sd = NULL;
+#endif
 
 	rq = task_rq_lock(p, &flags);
-	cpu = task_cpu(p);
+	BUG_ON(p->state != TASK_RUNNING);
 	this_cpu = smp_processor_id();
+	cpu = task_cpu(p);
 
-	BUG_ON(p->state != TASK_RUNNING);
+#ifdef CONFIG_SMP
+	for_each_domain(cpu, tmp)
+		if (tmp->flags & SD_BALANCE_FORK)
+			sd = tmp;
+
+	if (sd) {
+		struct sched_group *group;
+
+		cpu = task_cpu(p);
+		group = find_idlest_group(sd, p, cpu);
+		if (group) {
+			int new_cpu;
+			new_cpu = find_idlest_cpu(group, cpu);
+			if (new_cpu != -1 && new_cpu != cpu &&
+					cpu_isset(new_cpu, p->cpus_allowed)) {
+				set_task_cpu(p, new_cpu);
+				task_rq_unlock(rq, &flags);
+				rq = task_rq_lock(p, &flags);
+				cpu = task_cpu(p);
+			}
+		}
+	}
+#endif
 
 	/*
 	 * We decrease the sleep average of forking parents
@@ -1480,51 +1574,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
 	}
 }
 
-/*
- * find_idlest_cpu - find the least busy runqueue.
- */
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
-			   struct sched_domain *sd)
-{
-	unsigned long load, min_load, this_load;
-	int i, min_cpu;
-	cpumask_t mask;
-
-	min_cpu = UINT_MAX;
-	min_load = ULONG_MAX;
-
-	cpus_and(mask, sd->span, p->cpus_allowed);
-
-	for_each_cpu_mask(i, mask) {
-		load = target_load(i, sd->wake_idx);
-
-		if (load < min_load) {
-			min_cpu = i;
-			min_load = load;
-
-			/* break out early on an idle CPU: */
-			if (!min_load)
-				break;
-		}
-	}
-
-	/* add +1 to account for the new task */
-	this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE;
-
-	/*
-	 * Would with the addition of the new task to the
-	 * current CPU there be an imbalance between this
-	 * CPU and the idlest CPU?
-	 *
-	 * Use half of the balancing threshold - new-context is
-	 * a good opportunity to balance.
-	 */
-	if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
-		return min_cpu;
-
-	return this_cpu;
-}
-
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
@@ -1578,8 +1627,15 @@ void sched_exec(void)
 			sd = tmp;
 
 	if (sd) {
+		struct sched_group *group;
 		schedstat_inc(sd, sbe_attempts);
-		new_cpu = find_idlest_cpu(current, this_cpu, sd);
+		group = find_idlest_group(sd, current, this_cpu);
+		if (!group)
+			goto out;
+		new_cpu = find_idlest_cpu(group, this_cpu);
+		if (new_cpu == -1)
+			goto out;
+
 		if (new_cpu != this_cpu) {
 			schedstat_inc(sd, sbe_pushed);
 			put_cpu();
@@ -1792,12 +1848,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		if (local_group) {
 			this_load = avg_load;
 			this = group;
-			goto nextgroup;
 		} else if (avg_load > max_load) {
 			max_load = avg_load;
 			busiest = group;
 		}
-nextgroup:
 		group = group->next;
 	} while (group != sd->groups);
 
-- 
cgit v1.2.3-59-g8ed1b


From 68767a0ae428801649d510d9a65bb71feed44dd1 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:20 -0700
Subject: [PATCH] sched: schedstats update for balance on fork

Add SCHEDSTAT statistics for sched-balance-fork.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 10 ++++++--
 kernel/sched.c        | 63 +++++++++++++++++++++++++++++----------------------
 2 files changed, 44 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 613491d3a875..36a10781c3f3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -517,10 +517,16 @@ struct sched_domain {
 	unsigned long alb_failed;
 	unsigned long alb_pushed;
 
-	/* sched_balance_exec() stats */
-	unsigned long sbe_attempts;
+	/* SD_BALANCE_EXEC stats */
+	unsigned long sbe_cnt;
+	unsigned long sbe_balanced;
 	unsigned long sbe_pushed;
 
+	/* SD_BALANCE_FORK stats */
+	unsigned long sbf_cnt;
+	unsigned long sbf_balanced;
+	unsigned long sbf_pushed;
+
 	/* try_to_wake_up() stats */
 	unsigned long ttwu_wake_remote;
 	unsigned long ttwu_move_affine;
diff --git a/kernel/sched.c b/kernel/sched.c
index 7ecc237e2aab..2711130cd973 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -309,7 +309,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
  * bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
  */
-#define SCHEDSTAT_VERSION 11
+#define SCHEDSTAT_VERSION 12
 
 static int show_schedstat(struct seq_file *seq, void *v)
 {
@@ -356,9 +356,10 @@ static int show_schedstat(struct seq_file *seq, void *v)
 				    sd->lb_nobusyq[itype],
 				    sd->lb_nobusyg[itype]);
 			}
-			seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
+			seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
 			    sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
-			    sd->sbe_pushed, sd->sbe_attempts,
+			    sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
+			    sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
 			    sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
 		}
 #endif
@@ -1264,24 +1265,34 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
 			sd = tmp;
 
 	if (sd) {
+		int new_cpu;
 		struct sched_group *group;
 
+		schedstat_inc(sd, sbf_cnt);
 		cpu = task_cpu(p);
 		group = find_idlest_group(sd, p, cpu);
-		if (group) {
-			int new_cpu;
-			new_cpu = find_idlest_cpu(group, cpu);
-			if (new_cpu != -1 && new_cpu != cpu &&
-					cpu_isset(new_cpu, p->cpus_allowed)) {
-				set_task_cpu(p, new_cpu);
-				task_rq_unlock(rq, &flags);
-				rq = task_rq_lock(p, &flags);
-				cpu = task_cpu(p);
-			}
+		if (!group) {
+			schedstat_inc(sd, sbf_balanced);
+			goto no_forkbalance;
+		}
+
+		new_cpu = find_idlest_cpu(group, cpu);
+		if (new_cpu == -1 || new_cpu == cpu) {
+			schedstat_inc(sd, sbf_balanced);
+			goto no_forkbalance;
+		}
+
+		if (cpu_isset(new_cpu, p->cpus_allowed)) {
+			schedstat_inc(sd, sbf_pushed);
+			set_task_cpu(p, new_cpu);
+			task_rq_unlock(rq, &flags);
+			rq = task_rq_lock(p, &flags);
+			cpu = task_cpu(p);
 		}
 	}
-#endif
 
+no_forkbalance:
+#endif
 	/*
 	 * We decrease the sleep average of forking parents
 	 * and children as well, to keep max-interactive tasks
@@ -1618,30 +1629,28 @@ void sched_exec(void)
 	struct sched_domain *tmp, *sd = NULL;
 	int new_cpu, this_cpu = get_cpu();
 
-	/* Prefer the current CPU if there's only this task running */
-	if (this_rq()->nr_running <= 1)
-		goto out;
-
 	for_each_domain(this_cpu, tmp)
 		if (tmp->flags & SD_BALANCE_EXEC)
 			sd = tmp;
 
 	if (sd) {
 		struct sched_group *group;
-		schedstat_inc(sd, sbe_attempts);
+		schedstat_inc(sd, sbe_cnt);
 		group = find_idlest_group(sd, current, this_cpu);
-		if (!group)
+		if (!group) {
+			schedstat_inc(sd, sbe_balanced);
 			goto out;
+		}
 		new_cpu = find_idlest_cpu(group, this_cpu);
-		if (new_cpu == -1)
+		if (new_cpu == -1 || new_cpu == this_cpu) {
+			schedstat_inc(sd, sbe_balanced);
 			goto out;
-
-		if (new_cpu != this_cpu) {
-			schedstat_inc(sd, sbe_pushed);
-			put_cpu();
-			sched_migrate_task(current, new_cpu);
-			return;
 		}
+
+		schedstat_inc(sd, sbe_pushed);
+		put_cpu();
+		sched_migrate_task(current, new_cpu);
+		return;
 	}
 out:
 	put_cpu();
-- 
cgit v1.2.3-59-g8ed1b


From 687f1661d302bc70ce906594a6d3f615ef075a50 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:21 -0700
Subject: [PATCH] sched: sched tuning

Do some basic initial tuning.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/kernel/domain.c     |  2 +-
 include/asm-i386/topology.h   |  2 +-
 include/asm-ia64/topology.h   | 61 +++++++++++++++++++++++++++++++++----------
 include/asm-x86_64/topology.h |  3 +--
 include/linux/topology.h      | 11 ++++----
 5 files changed, 55 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c
index fe532c970438..afbde79c3b3d 100644
--- a/arch/ia64/kernel/domain.c
+++ b/arch/ia64/kernel/domain.c
@@ -14,7 +14,7 @@
 #include <linux/topology.h>
 #include <linux/nodemask.h>
 
-#define SD_NODES_PER_DOMAIN 6
+#define SD_NODES_PER_DOMAIN 16
 
 #ifdef CONFIG_NUMA
 /**
diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index 5eb6f61dcefc..2461b731781e 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -81,7 +81,7 @@ static inline int node_to_first_cpu(int node)
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
-				| SD_BALANCE_NEWIDLE	\
+				| SD_BALANCE_FORK	\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
diff --git a/include/asm-ia64/topology.h b/include/asm-ia64/topology.h
index 21cf351fd05c..4e64c2a6b369 100644
--- a/include/asm-ia64/topology.h
+++ b/include/asm-ia64/topology.h
@@ -42,25 +42,54 @@
 
 void build_cpu_to_node_map(void);
 
+#define SD_CPU_INIT (struct sched_domain) {		\
+	.span			= CPU_MASK_NONE,	\
+	.parent			= NULL,			\
+	.groups			= NULL,			\
+	.min_interval		= 1,			\
+	.max_interval		= 4,			\
+	.busy_factor		= 64,			\
+	.imbalance_pct		= 125,			\
+	.cache_hot_time		= (10*1000000),		\
+	.per_cpu_gain		= 100,			\
+	.cache_nice_tries	= 2,			\
+	.busy_idx		= 2,			\
+	.idle_idx		= 1,			\
+	.newidle_idx		= 2,			\
+	.wake_idx		= 1,			\
+	.forkexec_idx		= 1,			\
+	.flags			= SD_LOAD_BALANCE	\
+				| SD_BALANCE_NEWIDLE	\
+				| SD_BALANCE_EXEC	\
+				| SD_WAKE_AFFINE,	\
+	.last_balance		= jiffies,		\
+	.balance_interval	= 1,			\
+	.nr_balance_failed	= 0,			\
+}
+
 /* sched_domains SD_NODE_INIT for IA64 NUMA machines */
 #define SD_NODE_INIT (struct sched_domain) {		\
 	.span			= CPU_MASK_NONE,	\
 	.parent			= NULL,			\
 	.groups			= NULL,			\
-	.min_interval		= 80,			\
-	.max_interval		= 320,			\
-	.busy_factor		= 320,			\
+	.min_interval		= 8,			\
+	.max_interval		= 8*(min(num_online_cpus(), 32)), \
+	.busy_factor		= 64,			\
 	.imbalance_pct		= 125,			\
 	.cache_hot_time		= (10*1000000),		\
-	.cache_nice_tries	= 1,			\
+	.cache_nice_tries	= 2,			\
+	.busy_idx		= 3,			\
+	.idle_idx		= 2,			\
+	.newidle_idx		= 0, /* unused */	\
+	.wake_idx		= 1,			\
+	.forkexec_idx		= 1,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_EXEC	\
-				| SD_BALANCE_NEWIDLE	\
-				| SD_WAKE_IDLE		\
+				| SD_BALANCE_FORK	\
 				| SD_WAKE_BALANCE,	\
 	.last_balance		= jiffies,		\
-	.balance_interval	= 1,			\
+	.balance_interval	= 64,			\
 	.nr_balance_failed	= 0,			\
 }
 
@@ -69,17 +98,21 @@ void build_cpu_to_node_map(void);
 	.span			= CPU_MASK_NONE,	\
 	.parent			= NULL,			\
 	.groups			= NULL,			\
-	.min_interval		= 80,			\
-	.max_interval		= 320,			\
-	.busy_factor		= 320,			\
-	.imbalance_pct		= 125,			\
+	.min_interval		= 64,			\
+	.max_interval		= 64*num_online_cpus(),	\
+	.busy_factor		= 128,			\
+	.imbalance_pct		= 133,			\
 	.cache_hot_time		= (10*1000000),		\
 	.cache_nice_tries	= 1,			\
+	.busy_idx		= 3,			\
+	.idle_idx		= 3,			\
+	.newidle_idx		= 0, /* unused */	\
+	.wake_idx		= 0, /* unused */	\
+	.forkexec_idx		= 0, /* unused */	\
 	.per_cpu_gain		= 100,			\
-	.flags			= SD_LOAD_BALANCE	\
-				| SD_BALANCE_EXEC,	\
+	.flags			= SD_LOAD_BALANCE,	\
 	.last_balance		= jiffies,		\
-	.balance_interval	= 100*(63+num_online_cpus())/64,   \
+	.balance_interval	= 64,			\
 	.nr_balance_failed	= 0,			\
 }
 
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index 802d09b9c99f..c1bc3fad482e 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -42,12 +42,11 @@ extern int __node_distance(int, int);
 	.cache_nice_tries	= 2,			\
 	.busy_idx		= 3,			\
 	.idle_idx		= 2,			\
-	.newidle_idx		= 1, 			\
+	.newidle_idx		= 0, 			\
 	.wake_idx		= 1,			\
 	.forkexec_idx		= 1,			\
 	.per_cpu_gain		= 100,			\
 	.flags			= SD_LOAD_BALANCE	\
-				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_BALANCE,	\
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 665597207def..0320225e96da 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -91,7 +91,7 @@
 	.per_cpu_gain		= 25,			\
 	.busy_idx		= 0,			\
 	.idle_idx		= 0,			\
-	.newidle_idx		= 0,			\
+	.newidle_idx		= 1,			\
 	.wake_idx		= 0,			\
 	.forkexec_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
@@ -121,15 +121,14 @@
 	.cache_nice_tries	= 1,			\
 	.per_cpu_gain		= 100,			\
 	.busy_idx		= 2,			\
-	.idle_idx		= 0,			\
-	.newidle_idx		= 1,			\
+	.idle_idx		= 1,			\
+	.newidle_idx		= 2,			\
 	.wake_idx		= 1,			\
-	.forkexec_idx		= 0,			\
+	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_EXEC	\
-				| SD_WAKE_AFFINE	\
-				| SD_WAKE_BALANCE,	\
+				| SD_WAKE_AFFINE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
 	.nr_balance_failed	= 0,			\
-- 
cgit v1.2.3-59-g8ed1b


From 4866cde064afbb6c2a488c265e696879de616daa Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:23 -0700
Subject: [PATCH] sched: cleanup context switch locking

Instead of requiring architecture code to interact with the scheduler's
locking implementation, provide a couple of defines that can be used by the
architecture to request runqueue unlocked context switches, and ask for
interrupts to be enabled over the context switch.

Also replaces the "switch_lock" used by these architectures with an oncpu
flag (note, not a potentially slow bitflag).  This eliminates one bus
locked memory operation when context switching, and simplifies the
task_running function.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-arm/system.h     |  30 ++--------
 include/asm-ia64/system.h    |  10 +---
 include/asm-mips/system.h    |  10 +---
 include/asm-s390/system.h    |  17 +-----
 include/asm-sparc/system.h   |   4 +-
 include/asm-sparc64/system.h |  14 ++---
 include/linux/init_task.h    |   1 -
 include/linux/sched.h        |  10 +++-
 kernel/sched.c               | 132 +++++++++++++++++++++++++++++++++++--------
 9 files changed, 131 insertions(+), 97 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-arm/system.h b/include/asm-arm/system.h
index 39dd7008013c..3d0d2860b6db 100644
--- a/include/asm-arm/system.h
+++ b/include/asm-arm/system.h
@@ -145,34 +145,12 @@ extern unsigned int user_debug;
 #define set_wmb(var, value) do { var = value; wmb(); } while (0)
 #define nop() __asm__ __volatile__("mov\tr0,r0\t@ nop\n\t");
 
-#ifdef CONFIG_SMP
 /*
- * Define our own context switch locking.  This allows us to enable
- * interrupts over the context switch, otherwise we end up with high
- * interrupt latency.  The real problem area is switch_mm() which may
- * do a full cache flush.
+ * switch_mm() may do a full cache flush over the context switch,
+ * so enable interrupts over the context switch to avoid high
+ * latency.
  */
-#define prepare_arch_switch(rq,next)					\
-do {									\
-	spin_lock(&(next)->switch_lock);				\
-	spin_unlock_irq(&(rq)->lock);					\
-} while (0)
-
-#define finish_arch_switch(rq,prev)					\
-	spin_unlock(&(prev)->switch_lock)
-
-#define task_running(rq,p)						\
-	((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock))
-#else
-/*
- * Our UP-case is more simple, but we assume knowledge of how
- * spin_unlock_irq() and friends are implemented.  This avoids
- * us needlessly decrementing and incrementing the preempt count.
- */
-#define prepare_arch_switch(rq,next)	local_irq_enable()
-#define finish_arch_switch(rq,prev)	spin_unlock(&(rq)->lock)
-#define task_running(rq,p)		((rq)->curr == (p))
-#endif
+#define __ARCH_WANT_INTERRUPTS_ON_CTXSW
 
 /*
  * switch_to(prev, next) should switch from task `prev' to `next'
diff --git a/include/asm-ia64/system.h b/include/asm-ia64/system.h
index 6f516e76d1f0..cd2cf76b2db1 100644
--- a/include/asm-ia64/system.h
+++ b/include/asm-ia64/system.h
@@ -183,8 +183,6 @@ do {								\
 
 #ifdef __KERNEL__
 
-#define prepare_to_switch()    do { } while(0)
-
 #ifdef CONFIG_IA32_SUPPORT
 # define IS_IA32_PROCESS(regs)	(ia64_psr(regs)->is != 0)
 #else
@@ -274,13 +272,7 @@ extern void ia64_load_extra (struct task_struct *task);
  * of that CPU which will not be released, because there we wait for the
  * tasklist_lock to become available.
  */
-#define prepare_arch_switch(rq, next)		\
-do {						\
-	spin_lock(&(next)->switch_lock);	\
-	spin_unlock(&(rq)->lock);		\
-} while (0)
-#define finish_arch_switch(rq, prev)	spin_unlock_irq(&(prev)->switch_lock)
-#define task_running(rq, p) 		((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock))
+#define __ARCH_WANT_UNLOCKED_CTXSW
 
 #define ia64_platform_is(x) (strcmp(x, platform_name) == 0)
 
diff --git a/include/asm-mips/system.h b/include/asm-mips/system.h
index 888fd8908467..169f3d4265b1 100644
--- a/include/asm-mips/system.h
+++ b/include/asm-mips/system.h
@@ -422,16 +422,10 @@ extern void __die_if_kernel(const char *, struct pt_regs *, const char *file,
 extern int stop_a_enabled;
 
 /*
- * Taken from include/asm-ia64/system.h; prevents deadlock on SMP
+ * See include/asm-ia64/system.h; prevents deadlock on SMP
  * systems.
  */
-#define prepare_arch_switch(rq, next)		\
-do {						\
-	spin_lock(&(next)->switch_lock);	\
-	spin_unlock(&(rq)->lock);		\
-} while (0)
-#define finish_arch_switch(rq, prev)	spin_unlock_irq(&(prev)->switch_lock)
-#define task_running(rq, p) 		((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock))
+#define __ARCH_WANT_UNLOCKED_CTXSW
 
 #define arch_align_stack(x) (x)
 
diff --git a/include/asm-s390/system.h b/include/asm-s390/system.h
index e3cb3ce1d24a..b4a9f05a93d6 100644
--- a/include/asm-s390/system.h
+++ b/include/asm-s390/system.h
@@ -104,29 +104,18 @@ static inline void restore_access_regs(unsigned int *acrs)
 	prev = __switch_to(prev,next);					     \
 } while (0)
 
-#define prepare_arch_switch(rq, next)	do { } while(0)
-#define task_running(rq, p)		((rq)->curr == (p))
-
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 extern void account_user_vtime(struct task_struct *);
 extern void account_system_vtime(struct task_struct *);
-
-#define finish_arch_switch(rq, prev) do {				     \
-	set_fs(current->thread.mm_segment);				     \
-	spin_unlock(&(rq)->lock);					     \
-	account_system_vtime(prev);					     \
-	local_irq_enable();						     \
-} while (0)
-
 #else
+#define account_system_vtime(prev) do { } while (0)
+#endif
 
 #define finish_arch_switch(rq, prev) do {				     \
 	set_fs(current->thread.mm_segment);				     \
-	spin_unlock_irq(&(rq)->lock);					     \
+	account_system_vtime(prev);					     \
 } while (0)
 
-#endif
-
 #define nop() __asm__ __volatile__ ("nop")
 
 #define xchg(ptr,x) \
diff --git a/include/asm-sparc/system.h b/include/asm-sparc/system.h
index 80cf20cfaee1..898562ebe94c 100644
--- a/include/asm-sparc/system.h
+++ b/include/asm-sparc/system.h
@@ -101,7 +101,7 @@ extern void fpsave(unsigned long *fpregs, unsigned long *fsr,
  * SWITCH_ENTER and SWITH_DO_LAZY_FPU do not work yet (e.g. SMP does not work)
  * XXX WTF is the above comment? Found in late teen 2.4.x.
  */
-#define prepare_arch_switch(rq, next) do { \
+#define prepare_arch_switch(next) do { \
 	__asm__ __volatile__( \
 	".globl\tflush_patch_switch\nflush_patch_switch:\n\t" \
 	"save %sp, -0x40, %sp; save %sp, -0x40, %sp; save %sp, -0x40, %sp\n\t" \
@@ -109,8 +109,6 @@ extern void fpsave(unsigned long *fpregs, unsigned long *fsr,
 	"save %sp, -0x40, %sp\n\t" \
 	"restore; restore; restore; restore; restore; restore; restore"); \
 } while(0)
-#define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
-#define task_running(rq, p)		((rq)->curr == (p))
 
 	/* Much care has gone into this code, do not touch it.
 	 *
diff --git a/include/asm-sparc64/system.h b/include/asm-sparc64/system.h
index fd12ca386f48..f9be2c5b4dc9 100644
--- a/include/asm-sparc64/system.h
+++ b/include/asm-sparc64/system.h
@@ -139,19 +139,13 @@ extern void __flushw_user(void);
 #define flush_user_windows flushw_user
 #define flush_register_windows flushw_all
 
-#define prepare_arch_switch(rq, next)		\
-do {	spin_lock(&(next)->switch_lock);	\
-	spin_unlock(&(rq)->lock);		\
+/* Don't hold the runqueue lock over context switch */
+#define __ARCH_WANT_UNLOCKED_CTXSW
+#define prepare_arch_switch(next)		\
+do {						\
 	flushw_all();				\
 } while (0)
 
-#define finish_arch_switch(rq, prev)		\
-do {	spin_unlock_irq(&(prev)->switch_lock);	\
-} while (0)
-
-#define task_running(rq, p) \
-	((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock))
-
 	/* See what happens when you design the chip correctly?
 	 *
 	 * We tell gcc we clobber all non-fixed-usage registers except
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index a6a8c1a38d5e..03206a425d7a 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -108,7 +108,6 @@ extern struct group_info init_groups;
 	.blocked	= {{0}},					\
 	.alloc_lock	= SPIN_LOCK_UNLOCKED,				\
 	.proc_lock	= SPIN_LOCK_UNLOCKED,				\
-	.switch_lock	= SPIN_LOCK_UNLOCKED,				\
 	.journal_info	= NULL,						\
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 36a10781c3f3..d27be9337425 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -368,6 +368,11 @@ struct signal_struct {
 #endif
 };
 
+/* Context switch must be unlocked if interrupts are to be enabled */
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+# define __ARCH_WANT_UNLOCKED_CTXSW
+#endif
+
 /*
  * Bits in flags field of signal_struct.
  */
@@ -594,6 +599,9 @@ struct task_struct {
 
 	int lock_depth;		/* BKL lock depth */
 
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+	int oncpu;
+#endif
 	int prio, static_prio;
 	struct list_head run_list;
 	prio_array_t *array;
@@ -716,8 +724,6 @@ struct task_struct {
 	spinlock_t alloc_lock;
 /* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */
 	spinlock_t proc_lock;
-/* context-switch lock */
-	spinlock_t switch_lock;
 
 /* journalling filesystem info */
 	void *journal_info;
diff --git a/kernel/sched.c b/kernel/sched.c
index 98bf1c091da5..b1410577f9a8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -268,14 +268,71 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
-/*
- * Default context-switch locking:
- */
 #ifndef prepare_arch_switch
-# define prepare_arch_switch(rq, next)	do { } while (0)
-# define finish_arch_switch(rq, next)	spin_unlock_irq(&(rq)->lock)
-# define task_running(rq, p)		((rq)->curr == (p))
+# define prepare_arch_switch(next)	do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev)	do { } while (0)
+#endif
+
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+	return rq->curr == p;
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+	spin_unlock_irq(&rq->lock);
+}
+
+#else /* __ARCH_WANT_UNLOCKED_CTXSW */
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+#ifdef CONFIG_SMP
+	return p->oncpu;
+#else
+	return rq->curr == p;
+#endif
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * We can optimise this out completely for !SMP, because the
+	 * SMP rebalancing from interrupt is the only thing that cares
+	 * here.
+	 */
+	next->oncpu = 1;
+#endif
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	spin_unlock_irq(&rq->lock);
+#else
+	spin_unlock(&rq->lock);
 #endif
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * After ->oncpu is cleared, the task can be moved to a different CPU.
+	 * We must ensure this doesn't happen until the switch is completely
+	 * finished.
+	 */
+	smp_wmb();
+	prev->oncpu = 0;
+#endif
+#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	local_irq_enable();
+#endif
+}
+#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
@@ -1196,17 +1253,14 @@ void fastcall sched_fork(task_t *p)
 	p->state = TASK_RUNNING;
 	INIT_LIST_HEAD(&p->run_list);
 	p->array = NULL;
-	spin_lock_init(&p->switch_lock);
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+	p->oncpu = 0;
+#endif
 #ifdef CONFIG_PREEMPT
-	/*
-	 * During context-switch we hold precisely one spinlock, which
-	 * schedule_tail drops. (in the common case it's this_rq()->lock,
-	 * but it also can be p->switch_lock.) So we compensate with a count
-	 * of 1. Also, we want to start with kernel preemption disabled.
-	 */
+	/* Want to start with kernel preemption disabled. */
 	p->thread_info->preempt_count = 1;
 #endif
 	/*
@@ -1387,23 +1441,41 @@ void fastcall sched_exit(task_t * p)
 	task_rq_unlock(rq, &flags);
 }
 
+/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
+{
+	prepare_lock_switch(rq, next);
+	prepare_arch_switch(next);
+}
+
 /**
  * finish_task_switch - clean up after a task-switch
  * @prev: the thread we just switched away from.
  *
- * We enter this with the runqueue still locked, and finish_arch_switch()
- * will unlock it along with doing any other architecture-specific cleanup
- * actions.
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
  *
  * Note that we may have delayed dropping an mm in context_switch(). If
  * so, we finish that here outside of the runqueue lock.  (Doing it
  * with the lock held can cause deadlocks; see schedule() for
  * details.)
  */
-static inline void finish_task_switch(task_t *prev)
+static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
 	__releases(rq->lock)
 {
-	runqueue_t *rq = this_rq();
 	struct mm_struct *mm = rq->prev_mm;
 	unsigned long prev_task_flags;
 
@@ -1421,7 +1493,8 @@ static inline void finish_task_switch(task_t *prev)
 	 *		Manfred Spraul <manfred@colorfullife.com>
 	 */
 	prev_task_flags = prev->flags;
-	finish_arch_switch(rq, prev);
+	finish_arch_switch(prev);
+	finish_lock_switch(rq, prev);
 	if (mm)
 		mmdrop(mm);
 	if (unlikely(prev_task_flags & PF_DEAD))
@@ -1435,8 +1508,12 @@ static inline void finish_task_switch(task_t *prev)
 asmlinkage void schedule_tail(task_t *prev)
 	__releases(rq->lock)
 {
-	finish_task_switch(prev);
-
+	runqueue_t *rq = this_rq();
+	finish_task_switch(rq, prev);
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+	/* In this case, finish_task_switch does not reenable preemption */
+	preempt_enable();
+#endif
 	if (current->set_child_tid)
 		put_user(current->pid, current->set_child_tid);
 }
@@ -2816,11 +2893,15 @@ switch_tasks:
 		rq->curr = next;
 		++*switch_count;
 
-		prepare_arch_switch(rq, next);
+		prepare_task_switch(rq, next);
 		prev = context_switch(rq, prev, next);
 		barrier();
-
-		finish_task_switch(prev);
+		/*
+		 * this_rq must be evaluated again because prev may have moved
+		 * CPUs since it called schedule(), thus the 'rq' on its stack
+		 * frame will be invalid.
+		 */
+		finish_task_switch(this_rq(), prev);
 	} else
 		spin_unlock_irq(&rq->lock);
 
@@ -4085,6 +4166,9 @@ void __devinit init_idle(task_t *idle, int cpu)
 
 	spin_lock_irqsave(&rq->lock, flags);
 	rq->curr = rq->idle = idle;
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+	idle->oncpu = 1;
+#endif
 	set_tsk_need_resched(idle);
 	spin_unlock_irqrestore(&rq->lock, flags);
 
-- 
cgit v1.2.3-59-g8ed1b


From 476d139c218e44e045e4bc6d4cc02b010b343939 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 25 Jun 2005 14:57:29 -0700
Subject: [PATCH] sched: consolidate sbe sbf

Consolidate balance-on-exec with balance-on-fork.  This is made easy by the
sched-domains RCU patches.

As well as the general goodness of code reduction, this allows the runqueues
to be unlocked during balance-on-fork.

schedstats is a problem.  Maybe just have balance-on-event instead of
distinguishing fork and exec?

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |   2 +-
 kernel/fork.c         |  21 +++---
 kernel/sched.c        | 174 ++++++++++++++++++++------------------------------
 3 files changed, 81 insertions(+), 116 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d27be9337425..edb2c69a8873 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -930,7 +930,7 @@ extern void FASTCALL(wake_up_new_task(struct task_struct * tsk,
 #else
  static inline void kick_process(struct task_struct *tsk) { }
 #endif
-extern void FASTCALL(sched_fork(task_t * p));
+extern void FASTCALL(sched_fork(task_t * p, int clone_flags));
 extern void FASTCALL(sched_exit(task_t * p));
 
 extern int in_group_p(gid_t);
diff --git a/kernel/fork.c b/kernel/fork.c
index a28d11e10877..2c7806873bfd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1003,9 +1003,6 @@ static task_t *copy_process(unsigned long clone_flags,
 	p->pdeath_signal = 0;
 	p->exit_state = 0;
 
-	/* Perform scheduler related setup */
-	sched_fork(p);
-
 	/*
 	 * Ok, make it visible to the rest of the system.
 	 * We dont wake it up yet.
@@ -1014,18 +1011,24 @@ static task_t *copy_process(unsigned long clone_flags,
 	INIT_LIST_HEAD(&p->ptrace_children);
 	INIT_LIST_HEAD(&p->ptrace_list);
 
+	/* Perform scheduler related setup. Assign this task to a CPU. */
+	sched_fork(p, clone_flags);
+
 	/* Need tasklist lock for parent etc handling! */
 	write_lock_irq(&tasklist_lock);
 
 	/*
-	 * The task hasn't been attached yet, so cpus_allowed mask cannot
-	 * have changed. The cpus_allowed mask of the parent may have
-	 * changed after it was copied first time, and it may then move to
-	 * another CPU - so we re-copy it here and set the child's CPU to
-	 * the parent's CPU. This avoids alot of nasty races.
+	 * The task hasn't been attached yet, so its cpus_allowed mask will
+	 * not be changed, nor will its assigned CPU.
+	 *
+	 * The cpus_allowed mask of the parent may have changed after it was
+	 * copied first time - so re-copy it here, then check the child's CPU
+	 * to ensure it is on a valid CPU (and if not, just force it back to
+	 * parent's CPU). This avoids a lot of nasty races.
 	 */
 	p->cpus_allowed = current->cpus_allowed;
-	set_task_cpu(p, smp_processor_id());
+	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed)))
+		set_task_cpu(p, smp_processor_id());
 
 	/*
 	 * Check for pending SIGKILL! The new thread should not be allowed
diff --git a/kernel/sched.c b/kernel/sched.c
index 54ce787b6207..579da278e72f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1021,8 +1021,59 @@ static int find_idlest_cpu(struct sched_group *group, int this_cpu)
 	return idlest;
 }
 
+/*
+ * sched_balance_self: balance the current task (running on cpu) in domains
+ * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, ie. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+static int sched_balance_self(int cpu, int flag)
+{
+	struct task_struct *t = current;
+	struct sched_domain *tmp, *sd = NULL;
 
-#endif
+	for_each_domain(cpu, tmp)
+		if (tmp->flags & flag)
+			sd = tmp;
+
+	while (sd) {
+		cpumask_t span;
+		struct sched_group *group;
+		int new_cpu;
+		int weight;
+
+		span = sd->span;
+		group = find_idlest_group(sd, t, cpu);
+		if (!group)
+			goto nextlevel;
+
+		new_cpu = find_idlest_cpu(group, cpu);
+		if (new_cpu == -1 || new_cpu == cpu)
+			goto nextlevel;
+
+		/* Now try balancing at a lower domain level */
+		cpu = new_cpu;
+nextlevel:
+		sd = NULL;
+		weight = cpus_weight(span);
+		for_each_domain(cpu, tmp) {
+			if (weight <= cpus_weight(tmp->span))
+				break;
+			if (tmp->flags & flag)
+				sd = tmp;
+		}
+		/* while loop will break here if sd == NULL */
+	}
+
+	return cpu;
+}
+
+#endif /* CONFIG_SMP */
 
 /*
  * wake_idle() will wake a task on an idle cpu if task->cpu is
@@ -1240,8 +1291,15 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
  */
-void fastcall sched_fork(task_t *p)
+void fastcall sched_fork(task_t *p, int clone_flags)
 {
+	int cpu = get_cpu();
+
+#ifdef CONFIG_SMP
+	cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+#endif
+	set_task_cpu(p, cpu);
+
 	/*
 	 * We mark the process as running here, but have not actually
 	 * inserted it onto the runqueue yet. This guarantees that
@@ -1282,12 +1340,10 @@ void fastcall sched_fork(task_t *p)
 		 * runqueue lock is not a problem.
 		 */
 		current->time_slice = 1;
-		preempt_disable();
 		scheduler_tick();
-		local_irq_enable();
-		preempt_enable();
-	} else
-		local_irq_enable();
+	}
+	local_irq_enable();
+	put_cpu();
 }
 
 /*
@@ -1302,64 +1358,12 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
 	unsigned long flags;
 	int this_cpu, cpu;
 	runqueue_t *rq, *this_rq;
-#ifdef CONFIG_SMP
-	struct sched_domain *tmp, *sd = NULL;
-#endif
 
 	rq = task_rq_lock(p, &flags);
 	BUG_ON(p->state != TASK_RUNNING);
 	this_cpu = smp_processor_id();
 	cpu = task_cpu(p);
 
-#ifdef CONFIG_SMP
-	for_each_domain(cpu, tmp)
-		if (tmp->flags & SD_BALANCE_FORK)
-			sd = tmp;
-
-	if (sd) {
-		cpumask_t span;
-		int new_cpu;
-		struct sched_group *group;
-
-again:
-		schedstat_inc(sd, sbf_cnt);
-		span = sd->span;
-		cpu = task_cpu(p);
-		group = find_idlest_group(sd, p, cpu);
-		if (!group) {
-			schedstat_inc(sd, sbf_balanced);
-			goto nextlevel;
-		}
-
-		new_cpu = find_idlest_cpu(group, cpu);
-		if (new_cpu == -1 || new_cpu == cpu) {
-			schedstat_inc(sd, sbf_balanced);
-			goto nextlevel;
-		}
-
-		if (cpu_isset(new_cpu, p->cpus_allowed)) {
-			schedstat_inc(sd, sbf_pushed);
-			set_task_cpu(p, new_cpu);
-			task_rq_unlock(rq, &flags);
-			rq = task_rq_lock(p, &flags);
-			cpu = task_cpu(p);
-		}
-
-		/* Now try balancing at a lower domain level */
-nextlevel:
-		sd = NULL;
-		for_each_domain(cpu, tmp) {
-			if (cpus_subset(span, tmp->span))
-				break;
-			if (tmp->flags & SD_BALANCE_FORK)
-				sd = tmp;
-		}
-
-		if (sd)
-			goto again;
-	}
-
-#endif
 	/*
 	 * We decrease the sleep average of forking parents
 	 * and children as well, to keep max-interactive tasks
@@ -1708,58 +1712,16 @@ out:
 }
 
 /*
- * sched_exec(): find the highest-level, exec-balance-capable
- * domain and try to migrate the task to the least loaded CPU.
- *
- * execve() is a valuable balancing opportunity, because at this point
- * the task has the smallest effective memory and cache footprint.
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
  */
 void sched_exec(void)
 {
-	struct sched_domain *tmp, *sd = NULL;
 	int new_cpu, this_cpu = get_cpu();
-
-	for_each_domain(this_cpu, tmp)
-		if (tmp->flags & SD_BALANCE_EXEC)
-			sd = tmp;
-
-	if (sd) {
-		cpumask_t span;
-		struct sched_group *group;
-again:
-		schedstat_inc(sd, sbe_cnt);
-		span = sd->span;
-		group = find_idlest_group(sd, current, this_cpu);
-		if (!group) {
-			schedstat_inc(sd, sbe_balanced);
-			goto nextlevel;
-		}
-		new_cpu = find_idlest_cpu(group, this_cpu);
-		if (new_cpu == -1 || new_cpu == this_cpu) {
-			schedstat_inc(sd, sbe_balanced);
-			goto nextlevel;
-		}
-
-		schedstat_inc(sd, sbe_pushed);
-		put_cpu();
-		sched_migrate_task(current, new_cpu);
-
-		/* Now try balancing at a lower domain level */
-		this_cpu = get_cpu();
-nextlevel:
-		sd = NULL;
-		for_each_domain(this_cpu, tmp) {
-			if (cpus_subset(span, tmp->span))
-				break;
-			if (tmp->flags & SD_BALANCE_EXEC)
-				sd = tmp;
-		}
-
-		if (sd)
-			goto again;
-	}
-
+	new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
 	put_cpu();
+	if (new_cpu != this_cpu)
+		sched_migrate_task(current, new_cpu);
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 1a20ff27ef75d866730ee796acd811a925af762f Mon Sep 17 00:00:00 2001
From: Dinakar Guniguntala <dino@in.ibm.com>
Date: Sat, 25 Jun 2005 14:57:33 -0700
Subject: [PATCH] Dynamic sched domains: sched changes

The following patches add dynamic sched domains functionality that was
extensively discussed on lkml and lse-tech.  I would like to see this added to
-mm

o The main advantage with this feature is that it ensures that the scheduler
  load balancing code only balances against the cpus that are in the sched
  domain as defined by an exclusive cpuset and not all of the cpus in the
  system. This removes any overhead due to load balancing code trying to
  pull tasks outside of the cpu exclusive cpuset only to be prevented by
  the tasks' cpus_allowed mask.
o cpu exclusive cpusets are useful for servers running orthogonal
  workloads such as RT applications requiring low latency and HPC
  applications that are throughput sensitive

o It provides a new API partition_sched_domains in sched.c
  that makes dynamic sched domains possible.
o cpu_exclusive cpusets are now associated with a sched domain.
  Which means that the users can dynamically modify the sched domains
  through the cpuset file system interface
o ia64 sched domain code has been updated to support this feature as well
o Currently, this does not support hotplug. (However some of my tests
  indicate hotplug+preempt is currently broken)
o I have tested it extensively on x86.
o This should have very minimal impact on performance as none of
  the fast paths are affected

Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
Acked-by: Paul Jackson <pj@sgi.com>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Acked-by: Matthew Dobson <colpatch@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |   2 +
 kernel/sched.c        | 132 ++++++++++++++++++++++++++++++++------------------
 2 files changed, 88 insertions(+), 46 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index edb2c69a8873..98c109e4f43d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -539,6 +539,8 @@ struct sched_domain {
 #endif
 };
 
+extern void partition_sched_domains(cpumask_t *partition1,
+				    cpumask_t *partition2);
 #ifdef ARCH_HAS_SCHED_DOMAIN
 /* Useful helpers that arch setup code may use. Defined in kernel/sched.c */
 extern cpumask_t cpu_isolated_map;
diff --git a/kernel/sched.c b/kernel/sched.c
index d3d81b82e378..dee96b22635e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -262,7 +262,7 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
 
 /*
  * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
- * See update_sched_domains: synchronize_kernel for details.
+ * See detach_destroy_domains: synchronize_sched for details.
  *
  * The domain tree of any CPU may only be accessed from within
  * preempt-disabled sections.
@@ -4624,7 +4624,7 @@ int __init migration_init(void)
 #endif
 
 #ifdef CONFIG_SMP
-#define SCHED_DOMAIN_DEBUG
+#undef SCHED_DOMAIN_DEBUG
 #ifdef SCHED_DOMAIN_DEBUG
 static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
@@ -4717,7 +4717,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 #define sched_domain_debug(sd, cpu) {}
 #endif
 
-static int __devinit sd_degenerate(struct sched_domain *sd)
+static int sd_degenerate(struct sched_domain *sd)
 {
 	if (cpus_weight(sd->span) == 1)
 		return 1;
@@ -4740,7 +4740,7 @@ static int __devinit sd_degenerate(struct sched_domain *sd)
 	return 1;
 }
 
-static int __devinit sd_parent_degenerate(struct sched_domain *sd,
+static int sd_parent_degenerate(struct sched_domain *sd,
 						struct sched_domain *parent)
 {
 	unsigned long cflags = sd->flags, pflags = parent->flags;
@@ -4772,7 +4772,7 @@ static int __devinit sd_parent_degenerate(struct sched_domain *sd,
  * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
  * hold the hotplug lock.
  */
-void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
+void cpu_attach_domain(struct sched_domain *sd, int cpu)
 {
 	runqueue_t *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
@@ -4823,7 +4823,7 @@ __setup ("isolcpus=", isolated_cpu_setup);
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
  */
-void __devinit init_sched_build_groups(struct sched_group groups[],
+void init_sched_build_groups(struct sched_group groups[],
 			cpumask_t span, int (*group_fn)(int cpu))
 {
 	struct sched_group *first = NULL, *last = NULL;
@@ -4859,13 +4859,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[],
 
 
 #ifdef ARCH_HAS_SCHED_DOMAIN
-extern void __devinit arch_init_sched_domains(void);
-extern void __devinit arch_destroy_sched_domains(void);
+extern void build_sched_domains(const cpumask_t *cpu_map);
+extern void arch_init_sched_domains(const cpumask_t *cpu_map);
+extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
 #else
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static struct sched_group sched_group_cpus[NR_CPUS];
-static int __devinit cpu_to_cpu_group(int cpu)
+static int cpu_to_cpu_group(int cpu)
 {
 	return cpu;
 }
@@ -4873,7 +4874,7 @@ static int __devinit cpu_to_cpu_group(int cpu)
 
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
 static struct sched_group sched_group_phys[NR_CPUS];
-static int __devinit cpu_to_phys_group(int cpu)
+static int cpu_to_phys_group(int cpu)
 {
 #ifdef CONFIG_SCHED_SMT
 	return first_cpu(cpu_sibling_map[cpu]);
@@ -4886,7 +4887,7 @@ static int __devinit cpu_to_phys_group(int cpu)
 
 static DEFINE_PER_CPU(struct sched_domain, node_domains);
 static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int __devinit cpu_to_node_group(int cpu)
+static int cpu_to_node_group(int cpu)
 {
 	return cpu_to_node(cpu);
 }
@@ -4917,39 +4918,28 @@ static void check_sibling_maps(void)
 #endif
 
 /*
- * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
  */
-static void __devinit arch_init_sched_domains(void)
+static void build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
-	cpumask_t cpu_default_map;
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-	check_sibling_maps();
-#endif
-	/*
-	 * Setup mask for cpus without special case scheduling requirements.
-	 * For now this just excludes isolated cpus, but could be used to
-	 * exclude other special cases in the future.
-	 */
-	cpus_complement(cpu_default_map, cpu_isolated_map);
-	cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
 
 	/*
-	 * Set up domains. Isolated domains just stay on the NULL domain.
+	 * Set up domains for cpus specified by the cpu_map.
 	 */
-	for_each_cpu_mask(i, cpu_default_map) {
+	for_each_cpu_mask(i, *cpu_map) {
 		int group;
 		struct sched_domain *sd = NULL, *p;
 		cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
 
-		cpus_and(nodemask, nodemask, cpu_default_map);
+		cpus_and(nodemask, nodemask, *cpu_map);
 
 #ifdef CONFIG_NUMA
 		sd = &per_cpu(node_domains, i);
 		group = cpu_to_node_group(i);
 		*sd = SD_NODE_INIT;
-		sd->span = cpu_default_map;
+		sd->span = *cpu_map;
 		sd->groups = &sched_group_nodes[group];
 #endif
 
@@ -4967,7 +4957,7 @@ static void __devinit arch_init_sched_domains(void)
 		group = cpu_to_cpu_group(i);
 		*sd = SD_SIBLING_INIT;
 		sd->span = cpu_sibling_map[i];
-		cpus_and(sd->span, sd->span, cpu_default_map);
+		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		sd->groups = &sched_group_cpus[group];
 #endif
@@ -4977,7 +4967,7 @@ static void __devinit arch_init_sched_domains(void)
 	/* Set up CPU (sibling) groups */
 	for_each_online_cpu(i) {
 		cpumask_t this_sibling_map = cpu_sibling_map[i];
-		cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
+		cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
 		if (i != first_cpu(this_sibling_map))
 			continue;
 
@@ -4990,7 +4980,7 @@ static void __devinit arch_init_sched_domains(void)
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		cpumask_t nodemask = node_to_cpumask(i);
 
-		cpus_and(nodemask, nodemask, cpu_default_map);
+		cpus_and(nodemask, nodemask, *cpu_map);
 		if (cpus_empty(nodemask))
 			continue;
 
@@ -5000,12 +4990,12 @@ static void __devinit arch_init_sched_domains(void)
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
-	init_sched_build_groups(sched_group_nodes, cpu_default_map,
+	init_sched_build_groups(sched_group_nodes, *cpu_map,
 					&cpu_to_node_group);
 #endif
 
 	/* Calculate CPU power for physical packages and nodes */
-	for_each_cpu_mask(i, cpu_default_map) {
+	for_each_cpu_mask(i, *cpu_map) {
 		int power;
 		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
@@ -5029,7 +5019,7 @@ static void __devinit arch_init_sched_domains(void)
 	}
 
 	/* Attach the domains */
-	for_each_online_cpu(i) {
+	for_each_cpu_mask(i, *cpu_map) {
 		struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
 		sd = &per_cpu(cpu_domains, i);
@@ -5039,16 +5029,71 @@ static void __devinit arch_init_sched_domains(void)
 		cpu_attach_domain(sd, i);
 	}
 }
+/*
+ * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
+ */
+static void arch_init_sched_domains(cpumask_t *cpu_map)
+{
+	cpumask_t cpu_default_map;
 
-#ifdef CONFIG_HOTPLUG_CPU
-static void __devinit arch_destroy_sched_domains(void)
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
+	check_sibling_maps();
+#endif
+	/*
+	 * Setup mask for cpus without special case scheduling requirements.
+	 * For now this just excludes isolated cpus, but could be used to
+	 * exclude other special cases in the future.
+	 */
+	cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
+
+	build_sched_domains(&cpu_default_map);
+}
+
+static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
 {
 	/* Do nothing: everything is statically allocated. */
 }
-#endif
 
 #endif /* ARCH_HAS_SCHED_DOMAIN */
 
+/*
+ * Detach sched domains from a group of cpus specified in cpu_map
+ * These cpus will now be attached to the NULL domain
+ */
+static inline void detach_destroy_domains(const cpumask_t *cpu_map)
+{
+	int i;
+
+	for_each_cpu_mask(i, *cpu_map)
+		cpu_attach_domain(NULL, i);
+	synchronize_sched();
+	arch_destroy_sched_domains(cpu_map);
+}
+
+/*
+ * Partition sched domains as specified by the cpumasks below.
+ * This attaches all cpus from the cpumasks to the NULL domain,
+ * waits for a RCU quiescent period, recalculates sched
+ * domain information and then attaches them back to the
+ * correct sched domains
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+{
+	cpumask_t change_map;
+
+	cpus_and(*partition1, *partition1, cpu_online_map);
+	cpus_and(*partition2, *partition2, cpu_online_map);
+	cpus_or(change_map, *partition1, *partition2);
+
+	/* Detach sched domains from all of the affected cpus */
+	detach_destroy_domains(&change_map);
+	if (!cpus_empty(*partition1))
+		build_sched_domains(partition1);
+	if (!cpus_empty(*partition2))
+		build_sched_domains(partition2);
+}
+
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Force a reinitialization of the sched domains hierarchy.  The domains
@@ -5059,15 +5104,10 @@ static void __devinit arch_destroy_sched_domains(void)
 static int update_sched_domains(struct notifier_block *nfb,
 				unsigned long action, void *hcpu)
 {
-	int i;
-
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_DOWN_PREPARE:
-		for_each_online_cpu(i)
-			cpu_attach_domain(NULL, i);
-		synchronize_kernel();
-		arch_destroy_sched_domains();
+		detach_destroy_domains(&cpu_online_map);
 		return NOTIFY_OK;
 
 	case CPU_UP_CANCELED:
@@ -5083,7 +5123,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 	}
 
 	/* The hotplug lock is already held by cpu_up/cpu_down */
-	arch_init_sched_domains();
+	arch_init_sched_domains(&cpu_online_map);
 
 	return NOTIFY_OK;
 }
@@ -5092,7 +5132,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 void __init sched_init_smp(void)
 {
 	lock_cpu_hotplug();
-	arch_init_sched_domains();
+	arch_init_sched_domains(&cpu_online_map);
 	unlock_cpu_hotplug();
 	/* XXX: Theoretical race here - CPU may be hotplugged now */
 	hotcpu_notifier(update_sched_domains, 0);
-- 
cgit v1.2.3-59-g8ed1b


From f8cbd99bd3a023db8d6356d19a5f6f539d367327 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 25 Jun 2005 14:57:39 -0700
Subject: [PATCH] sched: voluntary kernel preemption

This patch adds a new preemption model: 'Voluntary Kernel Preemption'.  The
3 models can be selected from a new menu:

            (X) No Forced Preemption (Server)
            ( ) Voluntary Kernel Preemption (Desktop)
            ( ) Preemptible Kernel (Low-Latency Desktop)

we still default to the stock (Server) preemption model.

Voluntary preemption works by adding a cond_resched()
(reschedule-if-needed) call to every might_sleep() check.  It is lighter
than CONFIG_PREEMPT - at the cost of not having as tight latencies.  It
represents a different latency/complexity/overhead tradeoff.

It has no runtime impact at all if disabled.  Here are size stats that show
how the various preemption models impact the kernel's size:

    text    data     bss     dec     hex filename
 3618774  547184  179896 4345854  424ffe vmlinux.stock
 3626406  547184  179896 4353486  426dce vmlinux.voluntary   +0.2%
 3748414  548640  179896 4476950  445016 vmlinux.preempt     +3.5%

voluntary-preempt is +0.2% of .text, preempt is +3.5%.

This feature has been tested for many months by lots of people (and it's
also included in the RHEL4 distribution and earlier variants were in Fedora
as well), and it's intended for users and distributions who don't want to
use full-blown CONFIG_PREEMPT for one reason or another.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kernel.h | 18 +++++++++++-----
 kernel/Kconfig.preempt | 57 +++++++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 62 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e25b97062ce1..687ba8c9973d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -58,15 +58,23 @@ struct completion;
  * be biten later when the calling function happens to sleep when it is not
  * supposed to.
  */
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+extern int cond_resched(void);
+# define might_resched() cond_resched()
+#else
+# define might_resched() do { } while (0)
+#endif
+
 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-#define might_sleep() __might_sleep(__FILE__, __LINE__)
-#define might_sleep_if(cond) do { if (unlikely(cond)) might_sleep(); } while (0)
-void __might_sleep(char *file, int line);
+  void __might_sleep(char *file, int line);
+# define might_sleep() \
+	do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0)
 #else
-#define might_sleep() do {} while(0)
-#define might_sleep_if(cond) do {} while (0)
+# define might_sleep() do { might_resched(); } while (0)
 #endif
 
+#define might_sleep_if(cond) do { if (unlikely(cond)) might_sleep(); } while (0)
+
 #define abs(x) ({				\
 		int __x = (x);			\
 		(__x < 0) ? -__x : __x;		\
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 34c631221aa3..0b46a5dff4c0 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,15 +1,56 @@
 
-config PREEMPT
-	bool "Preemptible Kernel"
+choice
+	prompt "Preemption Model"
+	default PREEMPT_NONE
+
+config PREEMPT_NONE
+	bool "No Forced Preemption (Server)"
+	help
+	  This is the traditional Linux preemption model, geared towards
+	  throughput. It will still provide good latencies most of the
+	  time, but there are no guarantees and occasional longer delays
+	  are possible.
+
+	  Select this option if you are building a kernel for a server or
+	  scientific/computation system, or if you want to maximize the
+	  raw processing power of the kernel, irrespective of scheduling
+	  latencies.
+
+config PREEMPT_VOLUNTARY
+	bool "Voluntary Kernel Preemption (Desktop)"
 	help
-	  This option reduces the latency of the kernel when reacting to
-	  real-time or interactive events by allowing a low priority process to
-	  be preempted even if it is in kernel mode executing a system call.
-	  This allows applications to run more reliably even when the system is
+	  This option reduces the latency of the kernel by adding more
+	  "explicit preemption points" to the kernel code. These new
+	  preemption points have been selected to reduce the maximum
+	  latency of rescheduling, providing faster application reactions,
+	  at the cost of slightly lower throughput.
+
+	  This allows reaction to interactive events by allowing a
+	  low priority process to voluntarily preempt itself even if it
+	  is in kernel mode executing a system call. This allows
+	  applications to run more 'smoothly' even when the system is
 	  under load.
 
-	  Say Y here if you are building a kernel for a desktop, embedded
-	  or real-time system.  Say N if you are unsure.
+	  Select this if you are building a kernel for a desktop system.
+
+config PREEMPT
+	bool "Preemptible Kernel (Low-Latency Desktop)"
+	help
+	  This option reduces the latency of the kernel by making
+	  all kernel code (that is not executing in a critical section)
+	  preemptible.  This allows reaction to interactive events by
+	  permitting a low priority process to be preempted involuntarily
+	  even if it is in kernel mode executing a system call and would
+	  otherwise not be about to reach a natural preemption point.
+	  This allows applications to run more 'smoothly' even when the
+	  system is under load, at the cost of slightly lower throughput
+	  and a slight runtime overhead to kernel code.
+
+	  Select this if you are building a kernel for a desktop or
+	  embedded system with latency requirements in the milliseconds
+	  range.
+
+endchoice
 
 config PREEMPT_BKL
 	bool "Preempt The Big Kernel Lock"
-- 
cgit v1.2.3-59-g8ed1b


From dc009d92435f99498cbc579ce76bf28e837e2c14 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Sat, 25 Jun 2005 14:57:52 -0700
Subject: [PATCH] kexec: add kexec syscalls

This patch introduces the architecture independent implementation of the
sys_kexec_load and compat_sys_kexec_load system calls.

Kexec on panic support has been integrated into the core patch and is
relatively clean.

In addition the hopefully architecture independent option
crashkernel=size@location has been documented.  Its purpose is to reserve
space for the panic kernel to live, in a region that no DMA transfer will
ever be set up to access.

Signed-off-by: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Alexander Nyberg <alexn@telia.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/kernel-parameters.txt |    4 +
 MAINTAINERS                         |   10 +
 include/linux/kexec.h               |  127 +++++
 include/linux/reboot.h              |    3 +
 include/linux/syscalls.h            |    5 +-
 kernel/Makefile                     |    1 +
 kernel/kexec.c                      | 1036 +++++++++++++++++++++++++++++++++++
 kernel/panic.c                      |   23 +-
 kernel/sys.c                        |   20 +
 kernel/sys_ni.c                     |    2 +
 10 files changed, 1227 insertions(+), 4 deletions(-)
 create mode 100644 include/linux/kexec.h
 create mode 100644 kernel/kexec.c

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 86db43fd6b0f..560ff5ae3fd9 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -358,6 +358,10 @@ running once the system is up.
 	cpia_pp=	[HW,PPT]
 			Format: { parport<nr> | auto | none }
 
+	crashkernel=nn[KMG]@ss[KMG]
+			[KNL] Reserve a chunk of physical memory to
+			hold a kernel to switch to with kexec on panic.
+
 	cs4232=		[HW,OSS]
 			Format: <io>,<irq>,<dma>,<dma2>,<mpuio>,<mpuirq>
 
diff --git a/MAINTAINERS b/MAINTAINERS
index dbdd8494b2e6..81728572799e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1330,6 +1330,16 @@ M:	rml@novell.com
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 
+KEXEC
+P:	Eric Biederman
+P:	Randy Dunlap
+M:	ebiederm@xmission.com
+M:	rddunlap@osdl.org
+W:	http://www.xmission.com/~ebiederm/files/kexec/
+L:	linux-kernel@vger.kernel.org
+L:	fastboot@osdl.org
+S:	Maintained
+
 LANMEDIA WAN CARD DRIVER
 P:	Andrew Stanley-Jones
 M:	asj@lanmedia.com
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
new file mode 100644
index 000000000000..e3fc35f4e35f
--- /dev/null
+++ b/include/linux/kexec.h
@@ -0,0 +1,127 @@
+#ifndef LINUX_KEXEC_H
+#define LINUX_KEXEC_H
+
+#ifdef CONFIG_KEXEC
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/linkage.h>
+#include <linux/compat.h>
+#include <asm/kexec.h>
+
+/* Verify architecture specific macros are defined */
+
+#ifndef KEXEC_SOURCE_MEMORY_LIMIT
+#error KEXEC_SOURCE_MEMORY_LIMIT not defined
+#endif
+
+#ifndef KEXEC_DESTINATION_MEMORY_LIMIT
+#error KEXEC_DESTINATION_MEMORY_LIMIT not defined
+#endif
+
+#ifndef KEXEC_CONTROL_MEMORY_LIMIT
+#error KEXEC_CONTROL_MEMORY_LIMIT not defined
+#endif
+
+#ifndef KEXEC_CONTROL_CODE_SIZE
+#error KEXEC_CONTROL_CODE_SIZE not defined
+#endif
+
+#ifndef KEXEC_ARCH
+#error KEXEC_ARCH not defined
+#endif
+
+/*
+ * This structure is used to hold the arguments that are used when loading
+ * kernel binaries.
+ */
+
+typedef unsigned long kimage_entry_t;
+#define IND_DESTINATION  0x1
+#define IND_INDIRECTION  0x2
+#define IND_DONE         0x4
+#define IND_SOURCE       0x8
+
+#define KEXEC_SEGMENT_MAX 8
+struct kexec_segment {
+	void __user *buf;
+	size_t bufsz;
+	unsigned long mem;	/* User space sees this as a (void *) ... */
+	size_t memsz;
+};
+
+#ifdef CONFIG_COMPAT
+struct compat_kexec_segment {
+	compat_uptr_t buf;
+	compat_size_t bufsz;
+	compat_ulong_t mem;	/* User space sees this as a (void *) ... */
+	compat_size_t memsz;
+};
+#endif
+
+struct kimage {
+	kimage_entry_t head;
+	kimage_entry_t *entry;
+	kimage_entry_t *last_entry;
+
+	unsigned long destination;
+
+	unsigned long start;
+	struct page *control_code_page;
+
+	unsigned long nr_segments;
+	struct kexec_segment segment[KEXEC_SEGMENT_MAX];
+
+	struct list_head control_pages;
+	struct list_head dest_pages;
+	struct list_head unuseable_pages;
+
+	/* Address of next control page to allocate for crash kernels. */
+	unsigned long control_page;
+
+	/* Flags to indicate special processing */
+	unsigned int type : 1;
+#define KEXEC_TYPE_DEFAULT 0
+#define KEXEC_TYPE_CRASH   1
+};
+
+
+
+/* kexec interface functions */
+extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
+extern int machine_kexec_prepare(struct kimage *image);
+extern void machine_kexec_cleanup(struct kimage *image);
+extern asmlinkage long sys_kexec_load(unsigned long entry,
+	unsigned long nr_segments, struct kexec_segment __user *segments,
+	unsigned long flags);
+#ifdef CONFIG_COMPAT
+extern asmlinkage long compat_sys_kexec_load(unsigned long entry,
+	unsigned long nr_segments, struct compat_kexec_segment __user *segments,
+	unsigned long flags);
+#endif
+extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order);
+extern void crash_kexec(void);
+extern struct kimage *kexec_image;
+
+#define KEXEC_ON_CRASH  0x00000001
+#define KEXEC_ARCH_MASK 0xffff0000
+
+/* These values match the ELF architecture values.
+ * Unless there is a good reason that should continue to be the case.
+ */
+#define KEXEC_ARCH_DEFAULT ( 0 << 16)
+#define KEXEC_ARCH_386     ( 3 << 16)
+#define KEXEC_ARCH_X86_64  (62 << 16)
+#define KEXEC_ARCH_PPC     (20 << 16)
+#define KEXEC_ARCH_PPC64   (21 << 16)
+#define KEXEC_ARCH_IA_64   (50 << 16)
+
+#define KEXEC_FLAGS    (KEXEC_ON_CRASH)  /* List of defined/legal kexec flags */
+
+/* Location of a reserved region to hold the crash kernel.
+ */
+extern struct resource crashk_res;
+
+#else /* !CONFIG_KEXEC */
+static inline void crash_kexec(void) { }
+#endif /* CONFIG_KEXEC */
+#endif /* LINUX_KEXEC_H */
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index d60fafc8bdc5..c5a05e16edb2 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -51,6 +51,9 @@ extern void machine_restart(char *cmd);
 extern void machine_halt(void);
 extern void machine_power_off(void);
 
+extern void machine_shutdown(void);
+extern void machine_crash_shutdown(void);
+
 #endif
 
 #endif /* _LINUX_REBOOT_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c39f6f72cbbc..7ba8f8f747aa 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -159,8 +159,9 @@ asmlinkage long sys_shutdown(int, int);
 asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd,
 				void __user *arg);
 asmlinkage long sys_restart_syscall(void);
-asmlinkage long sys_kexec_load(void *entry, unsigned long nr_segments,
-			struct kexec_segment *segments, unsigned long flags);
+asmlinkage long sys_kexec_load(unsigned long entry,
+	unsigned long nr_segments, struct kexec_segment __user *segments,
+	unsigned long flags);
 
 asmlinkage long sys_exit(int error_code);
 asmlinkage void sys_exit_group(int error_code);
diff --git a/kernel/Makefile b/kernel/Makefile
index b01d26fe8db7..cfc8b0dea950 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_IKCONFIG) += configs.o
diff --git a/kernel/kexec.c b/kernel/kexec.c
new file mode 100644
index 000000000000..def9c73ec9a6
--- /dev/null
+++ b/kernel/kexec.c
@@ -0,0 +1,1036 @@
+/*
+ * kexec.c - kexec system call
+ * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/syscalls.h>
+#include <linux/ioport.h>
+#include <asm/page.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/system.h>
+#include <asm/semaphore.h>
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+	.name  = "Crash kernel",
+	.start = 0,
+	.end   = 0,
+	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses.  On processors
+ * where you can disable the MMU this is trivial, and easy.  For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place.  This means I can only support memory whose
+ * physical address can fit in an unsigned long.  In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DESTINATION_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the
+ * new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages.  As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it).  The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ *  - allocating a page table with the control code buffer identity
+ *    mapped, to simplify machine_kexec and make kexec_on_panic more
+ *    reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address..., for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static int kimage_is_destination_range(
+	struct kimage *image, unsigned long start, unsigned long end);
+static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
+
+static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
+	unsigned long nr_segments, struct kexec_segment __user *segments)
+{
+	size_t segment_bytes;
+	struct kimage *image;
+	unsigned long i;
+	int result;
+
+	/* Allocate a controlling structure */
+	result = -ENOMEM;
+	image = kmalloc(sizeof(*image), GFP_KERNEL);
+	if (!image) {
+		goto out;
+	}
+	memset(image, 0, sizeof(*image));
+	image->head = 0;
+	image->entry = &image->head;
+	image->last_entry = &image->head;
+	image->control_page = ~0; /* By default this does not apply */
+	image->start = entry;
+	image->type = KEXEC_TYPE_DEFAULT;
+
+	/* Initialize the list of control pages */
+	INIT_LIST_HEAD(&image->control_pages);
+
+	/* Initialize the list of destination pages */
+	INIT_LIST_HEAD(&image->dest_pages);
+
+	/* Initialize the list of unuseable pages */
+	INIT_LIST_HEAD(&image->unuseable_pages);
+
+	/* Read in the segments */
+	image->nr_segments = nr_segments;
+	segment_bytes = nr_segments * sizeof(*segments);
+	result = copy_from_user(image->segment, segments, segment_bytes);
+	if (result)
+		goto out;
+
+	/*
+	 * Verify we have good destination addresses.  The caller is
+	 * responsible for making certain we don't attempt to load
+	 * the new image into invalid or reserved areas of RAM.  This
+	 * just verifies it is an address we can use.
+	 *
+	 * Since the kernel does everything in page size chunks ensure
+	 * the destination addresses are page aligned.  Too many
+	 * special cases crop up when we don't do this.  The most
+	 * insidious is getting overlapping destination addresses
+	 * simply because addresses are changed to page size
+	 * granularity.
+	 */
+	result = -EADDRNOTAVAIL;
+	for (i = 0; i < nr_segments; i++) {
+		unsigned long mstart, mend;
+		mstart = image->segment[i].mem;
+		mend   = mstart + image->segment[i].memsz;
+		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
+			goto out;
+		if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+			goto out;
+	}
+
+	/* Verify our destination addresses do not overlap.
+	 * If we allowed overlapping destination addresses
+	 * through, very weird things can happen with no
+	 * easy explanation as one segment stops on another.
+	 */
+	result = -EINVAL;
+	for(i = 0; i < nr_segments; i++) {
+		unsigned long mstart, mend;
+		unsigned long j;
+		mstart = image->segment[i].mem;
+		mend   = mstart + image->segment[i].memsz;
+		for(j = 0; j < i; j++) {
+			unsigned long pstart, pend;
+			pstart = image->segment[j].mem;
+			pend   = pstart + image->segment[j].memsz;
+			/* Do the segments overlap ? */
+			if ((mend > pstart) && (mstart < pend))
+				goto out;
+		}
+	}
+
+	/* Ensure our buffer sizes are no larger than
+	 * our memory sizes.  This should always be the case,
+	 * and it is easier to check up front than to be surprised
+	 * later on.
+	 */
+	result = -EINVAL;
+	for(i = 0; i < nr_segments; i++) {
+		if (image->segment[i].bufsz > image->segment[i].memsz)
+			goto out;
+	}
+
+
+	result = 0;
+ out:
+	if (result == 0) {
+		*rimage = image;
+	} else {
+		kfree(image);
+	}
+	return result;
+
+}
+
+static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
+	unsigned long nr_segments, struct kexec_segment __user *segments)
+{
+	int result;
+	struct kimage *image;
+
+	/* Allocate and initialize a controlling structure */
+	image = NULL;
+	result = do_kimage_alloc(&image, entry, nr_segments, segments);
+	if (result) {
+		goto out;
+	}
+	*rimage = image;
+
+	/*
+	 * Find a location for the control code buffer, and add it
+	 * to the vector of segments so that its pages will also be
+	 * counted as destination pages.
+	 */
+	result = -ENOMEM;
+	image->control_code_page = kimage_alloc_control_pages(image,
+		get_order(KEXEC_CONTROL_CODE_SIZE));
+	if (!image->control_code_page) {
+		printk(KERN_ERR "Could not allocate control_code_buffer\n");
+		goto out;
+	}
+
+	result = 0;
+ out:
+	if (result == 0) {
+		*rimage = image;
+	} else {
+		kfree(image);
+	}
+	return result;
+}
+
+static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
+	unsigned long nr_segments, struct kexec_segment *segments)
+{
+	int result;
+	struct kimage *image;
+	unsigned long i;
+
+	image = NULL;
+	/* Verify we have a valid entry point */
+	if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
+		result = -EADDRNOTAVAIL;
+		goto out;
+	}
+
+	/* Allocate and initialize a controlling structure */
+	result = do_kimage_alloc(&image, entry, nr_segments, segments);
+	if (result) {
+		goto out;
+	}
+
+	/* Enable the special crash kernel control page
+	 * allocation policy.
+	 */
+	image->control_page = crashk_res.start;
+	image->type = KEXEC_TYPE_CRASH;
+
+	/*
+	 * Verify we have good destination addresses.  Normally
+	 * the caller is responsible for making certain we don't
+	 * attempt to load the new image into invalid or reserved
+	 * areas of RAM.  But crash kernels are preloaded into a
+	 * reserved area of ram.  We must ensure the addresses
+	 * are in the reserved area otherwise preloading the
+	 * kernel could corrupt things.
+	 */
+	result = -EADDRNOTAVAIL;
+	for (i = 0; i < nr_segments; i++) {
+		unsigned long mstart, mend;
+		mstart = image->segment[i].mem;
+		mend = mstart + image->segment[i].memsz;
+		/* Ensure we are within the crash kernel limits */
+		if ((mstart < crashk_res.start) || (mend > crashk_res.end))
+			goto out;
+	}
+
+
+	/*
+	 * Find a location for the control code buffer, and add it
+	 * to the vector of segments so that its pages will also be
+	 * counted as destination pages.
+	 */
+	result = -ENOMEM;
+	image->control_code_page = kimage_alloc_control_pages(image,
+		get_order(KEXEC_CONTROL_CODE_SIZE));
+	if (!image->control_code_page) {
+		printk(KERN_ERR "Could not allocate control_code_buffer\n");
+		goto out;
+	}
+
+	result = 0;
+ out:
+	if (result == 0) {
+		*rimage = image;
+	} else {
+		kfree(image);
+	}
+	return result;
+}
+
+static int kimage_is_destination_range(
+	struct kimage *image, unsigned long start, unsigned long end)
+{
+	unsigned long i;
+
+	for (i = 0; i < image->nr_segments; i++) {
+		unsigned long mstart, mend;
+		mstart = image->segment[i].mem;
+		mend   = mstart + image->segment[i].memsz;
+		if ((end > mstart) && (start < mend)) {
+			return 1;
+		}
+	}
+	return 0;
+}
+
+static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order)
+{
+	struct page *pages;
+	pages = alloc_pages(gfp_mask, order);
+	if (pages) {
+		unsigned int count, i;
+		pages->mapping = NULL;
+		pages->private = order;
+		count = 1 << order;
+		for(i = 0; i < count; i++) {
+			SetPageReserved(pages + i);
+		}
+	}
+	return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+	unsigned int order, count, i;
+	order = page->private;
+	count = 1 << order;
+	for(i = 0; i < count; i++) {
+		ClearPageReserved(page + i);
+	}
+	__free_pages(page, order);
+}
+
+static void kimage_free_page_list(struct list_head *list)
+{
+	struct list_head *pos, *next;
+	list_for_each_safe(pos, next, list) {
+		struct page *page;
+
+		page = list_entry(pos, struct page, lru);
+		list_del(&page->lru);
+
+		kimage_free_pages(page);
+	}
+}
+
+static struct page *kimage_alloc_normal_control_pages(
+	struct kimage *image, unsigned int order)
+{
+	/* Control pages are special, they are the intermediaries
+	 * that are needed while we copy the rest of the pages
+	 * to their final resting place.  As such they must
+	 * not conflict with either the destination addresses
+	 * or memory the kernel is already using.
+	 *
+	 * The only case where we really need more than one of
+	 * these are for architectures where we cannot disable
+	 * the MMU and must instead generate an identity mapped
+	 * page table for all of the memory.
+	 *
+	 * At worst this runs in O(N) of the image size.
+	 */
+	struct list_head extra_pages;
+	struct page *pages;
+	unsigned int count;
+
+	count = 1 << order;
+	INIT_LIST_HEAD(&extra_pages);
+
+	/* Loop while I can allocate a page and the page allocated
+	 * is a destination page.
+	 */
+	do {
+		unsigned long pfn, epfn, addr, eaddr;
+		pages = kimage_alloc_pages(GFP_KERNEL, order);
+		if (!pages)
+			break;
+		pfn   = page_to_pfn(pages);
+		epfn  = pfn + count;
+		addr  = pfn << PAGE_SHIFT;
+		eaddr = epfn << PAGE_SHIFT;
+		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+			kimage_is_destination_range(image, addr, eaddr))
+		{
+			list_add(&pages->lru, &extra_pages);
+			pages = NULL;
+		}
+	} while(!pages);
+	if (pages) {
+		/* Remember the allocated page... */
+		list_add(&pages->lru, &image->control_pages);
+
+		/* Because the page is already in its destination
+		 * location we will never allocate another page at
+		 * that address.  Therefore kimage_alloc_pages
+		 * will not return it (again) and we don't need
+		 * to give it an entry in image->segment[].
+		 */
+	}
+	/* Deal with the destination pages I have inadvertently allocated.
+	 *
+	 * Ideally I would convert multi-page allocations into single
+	 * page allocations, and add everything to image->dest_pages.
+	 *
+	 * For now it is simpler to just free the pages.
+	 */
+	kimage_free_page_list(&extra_pages);
+	return pages;
+
+}
+
+static struct page *kimage_alloc_crash_control_pages(
+	struct kimage *image, unsigned int order)
+{
+	/* Control pages are special, they are the intermediaries
+	 * that are needed while we copy the rest of the pages
+	 * to their final resting place.  As such they must
+	 * not conflict with either the destination addresses
+	 * or memory the kernel is already using.
+	 *
+	 * Control pages are also the only pages we must allocate
+	 * when loading a crash kernel.  All of the other pages
+	 * are specified by the segments and we just memcpy
+	 * into them directly.
+	 *
+	 * The only case where we really need more than one of
+	 * these are for architectures where we cannot disable
+	 * the MMU and must instead generate an identity mapped
+	 * page table for all of the memory.
+	 *
+	 * Given the low demand this implements a very simple
+	 * allocator that finds the first hole of the appropriate
+	 * size in the reserved memory region, and allocates all
+	 * of the memory up to and including the hole.
+	 */
+	unsigned long hole_start, hole_end, size;
+	struct page *pages;
+	pages = NULL;
+	size = (1 << order) << PAGE_SHIFT;
+	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+	hole_end   = hole_start + size - 1;
+	while(hole_end <= crashk_res.end) {
+		unsigned long i;
+		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) {
+			break;
+		}
+		if (hole_end > crashk_res.end) {
+			break;
+		}
+		/* See if I overlap any of the segments */
+		for(i = 0; i < image->nr_segments; i++) {
+			unsigned long mstart, mend;
+			mstart = image->segment[i].mem;
+			mend   = mstart + image->segment[i].memsz - 1;
+			if ((hole_end >= mstart) && (hole_start <= mend)) {
+				/* Advance the hole to the end of the segment */
+				hole_start = (mend + (size - 1)) & ~(size - 1);
+				hole_end   = hole_start + size - 1;
+				break;
+			}
+		}
+		/* If I don't overlap any segments I have found my hole! */
+		if (i == image->nr_segments) {
+			pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+			break;
+		}
+	}
+	if (pages) {
+		image->control_page = hole_end;
+	}
+	return pages;
+}
+
+
+struct page *kimage_alloc_control_pages(
+	struct kimage *image, unsigned int order)
+{
+	struct page *pages = NULL;
+	switch(image->type) {
+	case KEXEC_TYPE_DEFAULT:
+		pages = kimage_alloc_normal_control_pages(image, order);
+		break;
+	case KEXEC_TYPE_CRASH:
+		pages = kimage_alloc_crash_control_pages(image, order);
+		break;
+	}
+	return pages;
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+	if (*image->entry != 0) {
+		image->entry++;
+	}
+	if (image->entry == image->last_entry) {
+		kimage_entry_t *ind_page;
+		struct page *page;
+		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+		if (!page) {
+			return -ENOMEM;
+		}
+		ind_page = page_address(page);
+		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+		image->entry = ind_page;
+		image->last_entry =
+			ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+	}
+	*image->entry = entry;
+	image->entry++;
+	*image->entry = 0;
+	return 0;
+}
+
+static int kimage_set_destination(
+	struct kimage *image, unsigned long destination)
+{
+	int result;
+
+	destination &= PAGE_MASK;
+	result = kimage_add_entry(image, destination | IND_DESTINATION);
+	if (result == 0) {
+		image->destination = destination;
+	}
+	return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+	int result;
+
+	page &= PAGE_MASK;
+	result = kimage_add_entry(image, page | IND_SOURCE);
+	if (result == 0) {
+		image->destination += PAGE_SIZE;
+	}
+	return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+	/* Walk through and free any extra destination pages I may have */
+	kimage_free_page_list(&image->dest_pages);
+
+	/* Walk through and free any unuseable pages I have cached */
+	kimage_free_page_list(&image->unuseable_pages);
+
+}
+static int kimage_terminate(struct kimage *image)
+{
+	if (*image->entry != 0) {
+		image->entry++;
+	}
+	*image->entry = IND_DONE;
+	return 0;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+	for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+		ptr = (entry & IND_INDIRECTION)? \
+			phys_to_virt((entry & PAGE_MASK)): ptr +1)
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+	struct page *page;
+
+	page = pfn_to_page(entry >> PAGE_SHIFT);
+	kimage_free_pages(page);
+}
+
+static void kimage_free(struct kimage *image)
+{
+	kimage_entry_t *ptr, entry;
+	kimage_entry_t ind = 0;
+
+	if (!image)
+		return;
+	kimage_free_extra_pages(image);
+	for_each_kimage_entry(image, ptr, entry) {
+		if (entry & IND_INDIRECTION) {
+			/* Free the previous indirection page */
+			if (ind & IND_INDIRECTION) {
+				kimage_free_entry(ind);
+			}
+			/* Save this indirection page until we are
+			 * done with it.
+			 */
+			ind = entry;
+		}
+		else if (entry & IND_SOURCE) {
+			kimage_free_entry(entry);
+		}
+	}
+	/* Free the final indirection page */
+	if (ind & IND_INDIRECTION) {
+		kimage_free_entry(ind);
+	}
+
+	/* Handle any machine specific cleanup */
+	machine_kexec_cleanup(image);
+
+	/* Free the kexec control pages... */
+	kimage_free_page_list(&image->control_pages);
+	kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
+{
+	kimage_entry_t *ptr, entry;
+	unsigned long destination = 0;
+
+	for_each_kimage_entry(image, ptr, entry) {
+		if (entry & IND_DESTINATION) {
+			destination = entry & PAGE_MASK;
+		}
+		else if (entry & IND_SOURCE) {
+			if (page == destination) {
+				return ptr;
+			}
+			destination += PAGE_SIZE;
+		}
+	}
+	return 0;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
+{
+	/*
+	 * Here we implement safeguards to ensure that a source page
+	 * is not copied to its destination page before the data on
+	 * the destination page is no longer useful.
+	 *
+	 * To do this we maintain the invariant that a source page is
+	 * either its own destination page, or it is not a
+	 * destination page at all.
+	 *
+	 * That is slightly stronger than required, but the proof
+	 * that no problems will occur is trivial, and the
+	 * implementation is simple to verify.
+	 *
+	 * When allocating all pages normally this algorithm will run
+	 * in O(N) time, but in the worst case it will run in O(N^2)
+	 * time.   If the runtime is a problem the data structures can
+	 * be fixed.
+	 */
+	struct page *page;
+	unsigned long addr;
+
+	/*
+	 * Walk through the list of destination pages, and see if I
+	 * have a match.
+	 */
+	list_for_each_entry(page, &image->dest_pages, lru) {
+		addr = page_to_pfn(page) << PAGE_SHIFT;
+		if (addr == destination) {
+			list_del(&page->lru);
+			return page;
+		}
+	}
+	page = NULL;
+	while (1) {
+		kimage_entry_t *old;
+
+		/* Allocate a page, if we run out of memory give up */
+		page = kimage_alloc_pages(gfp_mask, 0);
+		if (!page) {
+			return 0;
+		}
+		/* If the page cannot be used file it away */
+		if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+			list_add(&page->lru, &image->unuseable_pages);
+			continue;
+		}
+		addr = page_to_pfn(page) << PAGE_SHIFT;
+
+		/* If it is the destination page we want use it */
+		if (addr == destination)
+			break;
+
+		/* If the page is not a destination page use it */
+		if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
+			break;
+
+		/*
+		 * I know that the page is someone's destination page.
+		 * See if there is already a source page for this
+		 * destination page.  And if so swap the source pages.
+		 */
+		old = kimage_dst_used(image, addr);
+		if (old) {
+			/* If so move it */
+			unsigned long old_addr;
+			struct page *old_page;
+
+			old_addr = *old & PAGE_MASK;
+			old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+			copy_highpage(page, old_page);
+			*old = addr | (*old & ~PAGE_MASK);
+
+			/* The old page I have found cannot be a
+			 * destination page, so return it.
+			 */
+			addr = old_addr;
+			page = old_page;
+			break;
+		}
+		else {
+			/* Place the page on the destination list; I
+			 * will use it later.
+			 */
+			list_add(&page->lru, &image->dest_pages);
+		}
+	}
+	return page;
+}
+
+/*
+ * Load one segment of a KEXEC_TYPE_DEFAULT image.  The destination address
+ * is recorded first; the data is then copied from user space into pages
+ * obtained from kimage_alloc_page(), one page per pass of the loop.
+ * Returns 0 on success or a negative errno.
+ */
+static int kimage_load_normal_segment(struct kimage *image,
+	struct kexec_segment *segment)
+{
+	unsigned char *buf = segment->buf;
+	unsigned long ubytes = segment->bufsz;
+	unsigned long mbytes = segment->memsz;
+	unsigned long maddr = segment->mem;
+	int result;
+
+	result = kimage_set_destination(image, maddr);
+	if (result < 0)
+		goto out;
+
+	while (mbytes) {
+		struct page *page;
+		char *ptr;
+		size_t uchunk, mchunk;
+		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
+		if (!page) {
+			result = -ENOMEM;
+			goto out;
+		}
+		result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
+		if (result < 0)
+			goto out;
+		ptr = kmap(page);
+		/* Start with a clear page so any tail beyond bufsz reads as zero */
+		memset(ptr, 0, PAGE_SIZE);
+		ptr += maddr & ~PAGE_MASK;
+		/* mchunk: bytes of memsz in this page; uchunk: bytes backed by buf */
+		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
+		if (mchunk > mbytes)
+			mchunk = mbytes;
+		uchunk = mchunk;
+		if (uchunk > ubytes)
+			uchunk = ubytes;
+		/* copy_from_user() returns the number of bytes NOT copied */
+		result = copy_from_user(ptr, buf, uchunk);
+		kunmap(page);
+		if (result) {
+			result = -EFAULT;
+			goto out;
+		}
+		ubytes -= uchunk;
+		maddr  += mchunk;
+		buf    += mchunk;
+		mbytes -= mchunk;
+	}
+ out:
+	return result;
+}
+
+/*
+ * Load one segment of a KEXEC_TYPE_CRASH image.  For crash dump kernels
+ * we simply copy the data from user space to its destination, a page at
+ * a time for the sake of kmap.  Returns 0 on success or a negative errno.
+ */
+static int kimage_load_crash_segment(struct kimage *image,
+	struct kexec_segment *segment)
+{
+	unsigned char *buf = segment->buf;
+	unsigned long ubytes = segment->bufsz;
+	unsigned long mbytes = segment->memsz;
+	unsigned long maddr = segment->mem;
+	int result = 0;
+
+	while (mbytes) {
+		struct page *page;
+		char *ptr;
+		size_t uchunk, mchunk;
+
+		page = pfn_to_page(maddr >> PAGE_SHIFT);
+		if (!page) {
+			result = -ENOMEM;
+			goto out;
+		}
+		ptr = kmap(page);
+		ptr += maddr & ~PAGE_MASK;
+		/* mchunk: bytes of memsz in this page; uchunk: bytes backed by buf */
+		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
+		if (mchunk > mbytes)
+			mchunk = mbytes;
+		uchunk = mchunk;
+		if (uchunk > ubytes) {
+			uchunk = ubytes;
+			/* Zero the trailing part of the page */
+			memset(ptr + uchunk, 0, mchunk - uchunk);
+		}
+
+		/* copy_from_user() returns the number of bytes NOT copied */
+		result = copy_from_user(ptr, buf, uchunk);
+		kunmap(page);
+		if (result) {
+			result = -EFAULT;
+			goto out;
+		}
+		ubytes -= uchunk;
+		maddr  += mchunk;
+		buf    += mchunk;
+		mbytes -= mchunk;
+	}
+ out:
+	return result;
+}
+
+/* Dispatch a segment load on the image type; unknown types get -ENOMEM. */
+static int kimage_load_segment(struct kimage *image,
+	struct kexec_segment *segment)
+{
+	switch (image->type) {
+	case KEXEC_TYPE_DEFAULT:
+		return kimage_load_normal_segment(image, segment);
+	case KEXEC_TYPE_CRASH:
+		return kimage_load_crash_segment(image, segment);
+	default:
+		break;
+	}
+	return -ENOMEM;
+}
+
+/*
+ * Exec Kernel system call: for obvious reasons only root may call it.
+ *
+ * This call breaks up into three pieces.
+ * - A generic part which loads the new kernel from the current
+ *   address space, and very carefully places the data in the
+ *   allocated pages.
+ *
+ * - A generic part that interacts with the kernel and tells all of
+ *   the devices to shut down.  Preventing on-going dmas, and placing
+ *   the devices in a consistent state so a later kernel can
+ *   reinitialize them.
+ *
+ * - A machine specific part that includes the syscall number
+ *   and then copies the image to its final destination, and
+ *   jumps into the image at entry.
+ *
+ * kexec does not sync, or unmount filesystems so if you need
+ * that to happen you need to do that yourself.
+ */
+struct kimage *kexec_image = NULL;
+static struct kimage *kexec_crash_image = NULL;
+/*
+ * A home grown binary mutex.
+ * Nothing can wait so this mutex is safe to use
+ * in interrupt context :)
+ */
+static int kexec_lock = 0;
+
+asmlinkage long sys_kexec_load(unsigned long entry,
+	unsigned long nr_segments, struct kexec_segment __user *segments,
+	unsigned long flags)
+{
+	struct kimage **slot, *image = NULL;
+	unsigned long arch = flags & KEXEC_ARCH_MASK;
+	int on_crash = (flags & KEXEC_ON_CRASH) != 0;
+	int result = 0;
+
+	/* Only a caller that is allowed to reboot may stage a new kernel. */
+	if (!capable(CAP_SYS_BOOT))
+		return -EPERM;
+
+	/*
+	 * Outside the architecture field, only bits listed in KEXEC_FLAGS
+	 * may be set.  This leaves us room for future extensions.
+	 */
+	if ((flags & ~KEXEC_ARCH_MASK) != (flags & KEXEC_FLAGS))
+		return -EINVAL;
+
+	/* The image must be for this architecture or the default one. */
+	if (arch != KEXEC_ARCH && arch != KEXEC_ARCH_DEFAULT)
+		return -EINVAL;
+
+	/* Put an artificial cap on the number
+	 * of segments passed to kexec_load.
+	 */
+	if (nr_segments > KEXEC_SEGMENT_MAX)
+		return -EINVAL;
+
+	/*
+	 * Because we write directly to the reserved memory region when
+	 * loading crash kernels we need a mutex here to prevent multiple
+	 * crash kernels from attempting to load simultaneously, and to
+	 * prevent a crash kernel from loading over the top of an in-use
+	 * crash kernel.
+	 *
+	 * KISS: always take the mutex.  It cannot be waited on, so a
+	 * caller that loses the race gets -EBUSY.
+	 */
+	if (xchg(&kexec_lock, 1))
+		return -EBUSY;
+
+	slot = on_crash ? &kexec_crash_image : &kexec_image;
+	if (nr_segments > 0) {
+		unsigned long i;
+
+		if (on_crash) {
+			/*
+			 * Loading another kernel to switch to if this one
+			 * crashes.  Free any current crash dump kernel
+			 * before we corrupt it.
+			 */
+			kimage_free(xchg(&kexec_crash_image, NULL));
+			result = kimage_crash_alloc(&image, entry,
+						    nr_segments, segments);
+		} else {
+			/* Loading another kernel to reboot into */
+			result = kimage_normal_alloc(&image, entry,
+						     nr_segments, segments);
+		}
+		if (result)
+			goto out;
+
+		result = machine_kexec_prepare(image);
+		if (result)
+			goto out;
+
+		for (i = 0; i < nr_segments; i++) {
+			result = kimage_load_segment(image, &image->segment[i]);
+			if (result)
+				goto out;
+		}
+
+		result = kimage_terminate(image);
+		if (result)
+			goto out;
+	}
+
+	/*
+	 * Install the new kernel and uninstall the old one.  With
+	 * nr_segments == 0 this just unloads whatever was installed.
+	 */
+	image = xchg(slot, image);
+
+ out:
+	xchg(&kexec_lock, 0); /* Release the mutex */
+	/* Frees the displaced image, or the half-built one on error. */
+	kimage_free(image);
+	return result;
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat entry point: widen each compat segment descriptor into a native
+ * struct kexec_segment in user-space scratch memory, then call the native
+ * sys_kexec_load() on the converted array.
+ */
+asmlinkage long compat_sys_kexec_load(unsigned long entry,
+	unsigned long nr_segments, struct compat_kexec_segment __user *segments,
+	unsigned long flags)
+{
+	struct compat_kexec_segment seg32;
+	struct kexec_segment seg, __user *native;
+	unsigned long idx;
+
+	/* Don't allow clients that don't understand the native
+	 * architecture to do anything.
+	 */
+	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
+		return -EINVAL;
+
+	/* Bound the count before it is used to size the scratch area. */
+	if (nr_segments > KEXEC_SEGMENT_MAX)
+		return -EINVAL;
+
+	native = compat_alloc_user_space(nr_segments * sizeof(seg));
+	for (idx = 0; idx < nr_segments; idx++) {
+		if (copy_from_user(&seg32, &segments[idx], sizeof(seg32)))
+			return -EFAULT;
+
+		seg.buf   = compat_ptr(seg32.buf);
+		seg.bufsz = seg32.bufsz;
+		seg.mem   = seg32.mem;
+		seg.memsz = seg32.memsz;
+
+		if (copy_to_user(&native[idx], &seg, sizeof(seg)))
+			return -EFAULT;
+	}
+
+	return sys_kexec_load(entry, nr_segments, native, flags);
+}
+#endif
+
+/* Switch to the loaded crash kernel; does nothing if none is loaded. */
+void crash_kexec(void)
+{
+	struct kimage *crash_image;
+
+	/*
+	 * Take the kexec_lock here to prevent sys_kexec_load running on
+	 * one cpu from replacing the crash kernel we are using after a
+	 * panic on a different cpu.  If the lock is already held we just
+	 * return rather than wait.
+	 *
+	 * If the crash kernel was not located in a fixed area of memory
+	 * the xchg(&kexec_crash_image) would be sufficient.  But since
+	 * the memory is reused...
+	 */
+	if (xchg(&kexec_lock, 1))
+		return;
+	crash_image = xchg(&kexec_crash_image, NULL);
+	if (crash_image) {
+		machine_crash_shutdown();
+		machine_kexec(crash_image);
+	}
+	xchg(&kexec_lock, 0);
+}
diff --git a/kernel/panic.c b/kernel/panic.c
index 081f7465fc8d..66f43d33cd80 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -18,6 +18,7 @@
 #include <linux/sysrq.h>
 #include <linux/interrupt.h>
 #include <linux/nmi.h>
+#include <linux/kexec.h>
 
 int panic_timeout;
 int panic_on_oops;
@@ -63,6 +64,13 @@ NORET_TYPE void panic(const char * fmt, ...)
         unsigned long caller = (unsigned long) __builtin_return_address(0);
 #endif
 
+	/*
+	 * It's possible to come here directly from a panic-assertion and not
+	 * have preempt disabled. Some functions called from here want
+	 * preempt to be disabled. No point enabling it later though...
+	 */
+	preempt_disable();
+
 	bust_spinlocks(1);
 	va_start(args, fmt);
 	vsnprintf(buf, sizeof(buf), fmt, args);
@@ -70,7 +78,19 @@ NORET_TYPE void panic(const char * fmt, ...)
 	printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
 	bust_spinlocks(0);
 
+	/*
+	 * If we have crashed and we have a crash kernel loaded let it handle
+	 * everything else.
+	 * Do we want to call this before we try to display a message?
+	 */
+	crash_kexec();
+
 #ifdef CONFIG_SMP
+	/*
+	 * Note smp_send_stop is the usual smp shutdown function, which
+	 * unfortunately means it may not be hardened to work in a panic
+	 * situation.
+	 */
 	smp_send_stop();
 #endif
 
@@ -79,8 +99,7 @@ NORET_TYPE void panic(const char * fmt, ...)
 	if (!panic_blink)
 		panic_blink = no_blink;
 
-	if (panic_timeout > 0)
-	{
+	if (panic_timeout > 0) {
 		/*
 	 	 * Delay timeout seconds before rebooting the machine. 
 		 * We can't use the "normal" timers since we just panicked..
diff --git a/kernel/sys.c b/kernel/sys.c
index dac10161ca23..9a24374c23bc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -16,6 +16,8 @@
 #include <linux/init.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
 #include <linux/workqueue.h>
 #include <linux/device.h>
 #include <linux/key.h>
@@ -439,6 +441,24 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 		machine_restart(buffer);
 		break;
 
+#ifdef CONFIG_KEXEC
+	case LINUX_REBOOT_CMD_KEXEC:
+	{
+		struct kimage *image;
+		image = xchg(&kexec_image, 0);
+		if (!image) {
+			unlock_kernel();
+			return -EINVAL;
+		}
+		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
+		system_state = SYSTEM_RESTART;
+		device_shutdown();
+		printk(KERN_EMERG "Starting new kernel\n");
+		machine_shutdown();
+		machine_kexec(image);
+		break;
+	}
+#endif
 #ifdef CONFIG_SOFTWARE_SUSPEND
 	case LINUX_REBOOT_CMD_SW_SUSPEND:
 		{
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 6f15bea7d1a8..29196ce9b40f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -18,6 +18,8 @@ cond_syscall(sys_acct);
 cond_syscall(sys_lookup_dcookie);
 cond_syscall(sys_swapon);
 cond_syscall(sys_swapoff);
+cond_syscall(sys_kexec_load);
+cond_syscall(compat_sys_kexec_load);
 cond_syscall(sys_init_module);
 cond_syscall(sys_delete_module);
 cond_syscall(sys_socketpair);
-- 
cgit v1.2.3-59-g8ed1b


From cf13f0eaffa31bf6a145c53c589654b11c72ddc7 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sat, 25 Jun 2005 14:58:11 -0700
Subject: [PATCH] kexec: s390 support

Add kexec support for s390 architecture.

From: Milton Miller <miltonm@bga.com>

- Fix passing of first argument to relocate_kernel assembly.
- Fix Kconfig description.
- Remove wrong comment and comments that describe obvious things.
- Allow only KEXEC_TYPE_DEFAULT as image type -> dump not supported.

Acked-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/Kconfig                    |  8 +++
 arch/s390/kernel/Makefile            | 10 ++++
 arch/s390/kernel/compat_wrapper.S    |  8 +++
 arch/s390/kernel/crash.c             | 17 +++++++
 arch/s390/kernel/machine_kexec.c     | 98 ++++++++++++++++++++++++++++++++++++
 arch/s390/kernel/relocate_kernel.S   | 81 +++++++++++++++++++++++++++++
 arch/s390/kernel/relocate_kernel64.S | 82 ++++++++++++++++++++++++++++++
 arch/s390/kernel/syscalls.S          |  2 +-
 include/asm-s390/kexec.h             | 42 ++++++++++++++++
 include/asm-s390/unistd.h            |  2 +-
 include/linux/kexec.h                |  1 +
 11 files changed, 349 insertions(+), 2 deletions(-)
 create mode 100644 arch/s390/kernel/crash.c
 create mode 100644 arch/s390/kernel/machine_kexec.c
 create mode 100644 arch/s390/kernel/relocate_kernel.S
 create mode 100644 arch/s390/kernel/relocate_kernel64.S
 create mode 100644 include/asm-s390/kexec.h

(limited to 'include/linux')

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 32696c1d9280..6600ee87f896 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -455,6 +455,14 @@ config NO_IDLE_HZ_INIT
 	  The HZ timer is switched off in idle by default. That means the
 	  HZ timer is already disabled at boot time.
 
+config KEXEC
+	bool "kexec system call (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  kexec is a system call that implements the ability to shutdown your
+	  current kernel, and to start another kernel.  It is like a reboot
+	  but is independent of hardware/microcode support.
+
 endmenu
 
 config PCMCIA
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index b41e0e199a7c..ab1e49d2e518 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -25,6 +25,16 @@ obj-$(CONFIG_ARCH_S390X)	+= entry64.o reipl64.o
 
 obj-$(CONFIG_VIRT_TIMER)	+= vtime.o
 
+# Kexec part
+S390_KEXEC_OBJS := machine_kexec.o crash.o
+ifeq ($(CONFIG_ARCH_S390X),y)
+S390_KEXEC_OBJS += relocate_kernel64.o
+else
+S390_KEXEC_OBJS += relocate_kernel.o
+endif
+obj-$(CONFIG_KEXEC) += $(S390_KEXEC_OBJS)
+
+
 #
 # This is just to get the dependencies...
 #
diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S
index 7a607b1d0380..bf529739c8ab 100644
--- a/arch/s390/kernel/compat_wrapper.S
+++ b/arch/s390/kernel/compat_wrapper.S
@@ -1441,3 +1441,11 @@ compat_sys_waitid_wrapper:
 	lgfr	%r5,%r5			# int
 	llgtr	%r6,%r6			# struct rusage_emu31 *
 	jg	compat_sys_waitid
+
+	.globl	compat_sys_kexec_load_wrapper
+compat_sys_kexec_load_wrapper:
+	llgfr	%r2,%r2			# unsigned long
+	llgfr	%r3,%r3			# unsigned long
+	llgtr	%r4,%r4			# struct kexec_segment *
+	llgfr	%r5,%r5			# unsigned long
+	jg	compat_sys_kexec_load
diff --git a/arch/s390/kernel/crash.c b/arch/s390/kernel/crash.c
new file mode 100644
index 000000000000..db38283c1f27
--- /dev/null
+++ b/arch/s390/kernel/crash.c
@@ -0,0 +1,17 @@
+/*
+ * arch/s390/kernel/crash.c
+ *
+ * (C) Copyright IBM Corp. 2005
+ *
+ * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
+ *
+ */
+
+#include <linux/threads.h>
+#include <linux/kexec.h>
+
+note_buf_t crash_notes[NR_CPUS];
+
+void machine_crash_shutdown(void)
+{
+}
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
new file mode 100644
index 000000000000..7a94db76df46
--- /dev/null
+++ b/arch/s390/kernel/machine_kexec.c
@@ -0,0 +1,98 @@
+/*
+ * arch/s390/kernel/machine_kexec.c
+ *
+ * (C) Copyright IBM Corp. 2005
+ *
+ * Author(s): Rolf Adelsberger <adelsberger@de.ibm.com>
+ *
+ */
+
+/*
+ * machine_kexec.c - handle the transition of Linux booting another kernel
+ * on the S390 architecture.
+ */
+
+#include <asm/cio.h>
+#include <asm/setup.h>
+#include <linux/device.h>
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/system.h>
+
+static void kexec_halt_all_cpus(void *);
+
+typedef void (*relocate_kernel_t) (kimage_entry_t *, unsigned long);
+
+const extern unsigned char relocate_kernel[];
+const extern unsigned long long relocate_kernel_len;
+
+/* Validate the image type and stage the relocation code in its control page. */
+int
+machine_kexec_prepare(struct kimage *image)
+{
+	void *control_page;
+
+	/* We don't support anything but the default image type for now. */
+	if (image->type != KEXEC_TYPE_DEFAULT)
+		return -EINVAL;
+
+	/* Address of the control page: its pfn scaled up to a byte address. */
+	control_page = (void *) (page_to_pfn(image->control_code_page) << PAGE_SHIFT);
+
+	/* Copy relocate_kernel there; kexec_halt_all_cpus() later calls it. */
+	memcpy(control_page, relocate_kernel, relocate_kernel_len);
+	return 0;
+}
+
+void
+machine_kexec_cleanup(struct kimage *image)
+{
+}
+
+void
+machine_shutdown(void)
+{
+	printk(KERN_INFO "kexec: machine_shutdown called\n");
+}
+
+NORET_TYPE void
+machine_kexec(struct kimage *image)
+{
+	clear_all_subchannels();
+
+	/* Disable lowcore protection */
+	ctl_clear_bit(0,28);
+
+	on_each_cpu(kexec_halt_all_cpus, image, 0, 0);
+	for(;;);
+}
+
+static void
+kexec_halt_all_cpus(void *kernel_image)
+{
+	static atomic_t cpuid = ATOMIC_INIT(-1);
+	int cpu;
+	struct kimage *image;
+	relocate_kernel_t data_mover;
+
+	if (atomic_compare_and_swap(-1, smp_processor_id(), &cpuid))
+		signal_processor(smp_processor_id(), sigp_stop);
+
+	/* Wait for all other cpus to enter stopped state */
+	for_each_online_cpu(cpu) {
+		if (cpu == smp_processor_id())
+			continue;
+		while(!smp_cpu_not_running(cpu))
+			cpu_relax();
+	}
+
+	image = (struct kimage *) kernel_image;
+	data_mover = (relocate_kernel_t)
+		(page_to_pfn(image->control_code_page) << PAGE_SHIFT);
+
+	/* Call the moving routine */
+	(*data_mover) (&image->head, image->start);
+}
diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S
new file mode 100644
index 000000000000..d5e4a62fbb79
--- /dev/null
+++ b/arch/s390/kernel/relocate_kernel.S
@@ -0,0 +1,81 @@
+/*
+ * arch/s390/kernel/relocate_kernel.S
+ *
+ * (C) Copyright IBM Corp. 2005
+ *
+ * Author(s): Rolf Adelsberger <adelsberger@de.ibm.com>
+ *
+ */
+
+/*
+ * moves the new kernel to its destination...
+ * %r2 = pointer to first kimage_entry_t
+ * %r3 = start address - where to jump to after the job is done...
+ *
+ * %r5 will be used as temp. storage
+ * %r6 holds the destination address
+ * %r7 = PAGE_SIZE
+ * %r8 holds the source address
+ * %r9 = PAGE_SIZE
+ * %r10 is a page mask
+ */
+
+	.text
+	.globl		relocate_kernel
+	relocate_kernel:
+		basr	%r13,0		#base address
+	.base:
+		spx	zero64-.base(%r13)	#absolute addressing mode
+		stnsm	sys_msk-.base(%r13),0xf8	#disable DAT and IRQ (external)
+		lhi	%r10,-1		#preparing the mask
+		sll	%r10,12		#shift it such that it becomes 0xf000
+	.top:
+		lhi	%r7,4096	#load PAGE_SIZE in r7
+		lhi	%r9,4096	#load PAGE_SIZE in r9
+		l	%r5,0(%r2)	#read another word for indirection page
+		ahi	%r2,4		#increment pointer
+		tml	%r5,0x1		#is it a destination page?
+		je	.indir_check	#NO, goto "indir_check"
+		lr	%r6,%r5		#r6 = r5
+		nr	%r6,%r10	#mask it out and...
+		j	.top		#...next iteration
+	.indir_check:
+		tml	%r5,0x2		#is it a indirection page?
+		je	.done_test	#NO, goto "done_test"
+		nr	%r5,%r10	#YES, mask out,
+		lr	%r2,%r5		#move it into the right register,
+		j	.top		#and read next...
+	.done_test:
+		tml	%r5,0x4		#is it the done indicator?
+		je	.source_test	#NO! Well, then it should be the source indicator...
+		j	.done		#ok, lets finish it here...
+	.source_test:
+		tml	%r5,0x8		#it should be a source indicator...
+		je	.top		#NO, ignore it...
+		lr	%r8,%r5		#r8 = r5
+		nr	%r8,%r10	#masking
+	0:	mvcle	%r6,%r8,0x0	#copy PAGE_SIZE bytes from r8 to r6 - pad with 0
+		jo	0b
+		j	.top
+	.done:
+		sr	%r0,%r0		#clear register r0
+		la	%r4,load_psw-.base(%r13)	#load psw-address into the register
+		o	%r3,4(%r4)	#or load address into psw
+		st	%r3,4(%r4)
+		mvc	0(8,%r0),0(%r4)	#copy psw to absolute address 0
+		sr	%r1,%r1		#clear %r1
+		sr	%r2,%r2		#clear %r2
+		sigp	%r1,%r2,0x12	#set cpuid to zero
+		lpsw	0		#hopefully start new kernel...
+
+		.align	8
+	zero64:
+		.quad	0
+	load_psw:
+		.long	0x00080000,0x80000000
+	sys_msk:
+		.quad	0
+	relocate_kernel_end:
+	.globl	relocate_kernel_len
+	relocate_kernel_len:
+		.quad	relocate_kernel_end - relocate_kernel
diff --git a/arch/s390/kernel/relocate_kernel64.S b/arch/s390/kernel/relocate_kernel64.S
new file mode 100644
index 000000000000..96290cc4eb3c
--- /dev/null
+++ b/arch/s390/kernel/relocate_kernel64.S
@@ -0,0 +1,82 @@
+/*
+ * arch/s390/kernel/relocate_kernel64.S
+ *
+ * (C) Copyright IBM Corp. 2005
+ *
+ * Author(s): Rolf Adelsberger <adelsberger@de.ibm.com>
+ *
+ */
+
+/*
+ * moves the new kernel to its destination...
+ * %r2 = pointer to first kimage_entry_t
+ * %r3 = start address - where to jump to after the job is done...
+ *
+ * %r5 will be used as temp. storage
+ * %r6 holds the destination address
+ * %r7 = PAGE_SIZE
+ * %r8 holds the source address
+ * %r9 = PAGE_SIZE
+ *
+ * 0xf000 is a page_mask
+ */
+
+	.text
+	.globl		relocate_kernel
+	relocate_kernel:
+		basr	%r13,0		#base address
+	.base:
+		spx	zero64-.base(%r13)	#absolute addressing mode
+		stnsm	sys_msk-.base(%r13),0xf8	#disable DAT and IRQ (external)
+	.top:
+		lghi	%r7,4096	#load PAGE_SIZE in r7
+		lghi	%r9,4096	#load PAGE_SIZE in r9
+		lg	%r5,0(%r2)	#read another word for indirection page
+		aghi	%r2,8		#increment pointer
+		tml	%r5,0x1		#is it a destination page?
+		je	.indir_check	#NO, goto "indir_check"
+		lgr	%r6,%r5		#r6 = r5
+		nill	%r6,0xf000	#mask it out and...
+		j	.top		#...next iteration
+	.indir_check:
+		tml     %r5,0x2		#is it a indirection page?
+		je      .done_test	#NO, goto "done_test"
+		nill    %r5,0xf000	#YES, mask out,
+		lgr     %r2,%r5		#move it into the right register,
+		j       .top		#and read next...
+	.done_test:
+		tml     %r5,0x4		#is it the done indicator?
+		je      .source_test	#NO! Well, then it should be the source indicator...
+		j       .done		#ok, lets finish it here...
+	.source_test:
+		tml     %r5,0x8		#it should be a source indicator...
+		je      .top		#NO, ignore it...
+		lgr     %r8,%r5		#r8 = r5
+		nill    %r8,0xf000	#masking
+	0:	mvcle   %r6,%r8,0x0	#copy PAGE_SIZE bytes from r8 to r6 - pad with 0
+		jo	0b
+		j       .top
+	.done:
+		sgr     %r0,%r0		#clear register r0
+		la      %r4,load_psw-.base(%r13)	#load psw-address into the register
+		o	%r3,4(%r4)	#or load address into psw
+		st	%r3,4(%r4)
+		mvc     0(8,%r0),0(%r4)	#copy psw to absolute address 0
+		sam31			#31 bit mode
+		sr      %r1,%r1		#erase register r1
+		sr      %r2,%r2		#erase register r2
+		sigp    %r1,%r2,0x12	#set cpuid to zero
+		lpsw    0		#hopefully start new kernel...
+
+	        .align	8
+	zero64:
+		.quad	0
+	load_psw:
+		.long	0x00080000,0x80000000
+	sys_msk:
+		.quad	0
+	relocate_kernel_end:
+	.globl	relocate_kernel_len
+	relocate_kernel_len:
+		.quad	relocate_kernel_end - relocate_kernel
+
diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S
index 515938628f82..a8668afb5f87 100644
--- a/arch/s390/kernel/syscalls.S
+++ b/arch/s390/kernel/syscalls.S
@@ -285,7 +285,7 @@ SYSCALL(sys_mq_timedsend,sys_mq_timedsend,compat_sys_mq_timedsend_wrapper)
 SYSCALL(sys_mq_timedreceive,sys_mq_timedreceive,compat_sys_mq_timedreceive_wrapper)
 SYSCALL(sys_mq_notify,sys_mq_notify,compat_sys_mq_notify_wrapper) /* 275 */
 SYSCALL(sys_mq_getsetattr,sys_mq_getsetattr,compat_sys_mq_getsetattr_wrapper)
-NI_SYSCALL							/* reserved for kexec */
+SYSCALL(sys_kexec_load,sys_kexec_load,compat_sys_kexec_load_wrapper)
 SYSCALL(sys_add_key,sys_add_key,compat_sys_add_key_wrapper)
 SYSCALL(sys_request_key,sys_request_key,compat_sys_request_key_wrapper)
 SYSCALL(sys_keyctl,sys_keyctl,compat_sys_keyctl)		/* 280 */
diff --git a/include/asm-s390/kexec.h b/include/asm-s390/kexec.h
new file mode 100644
index 000000000000..54cf7d9f251c
--- /dev/null
+++ b/include/asm-s390/kexec.h
@@ -0,0 +1,42 @@
+/*
+ * include/asm-s390/kexec.h
+ *
+ * (C) Copyright IBM Corp. 2005
+ *
+ * Author(s): Rolf Adelsberger <adelsberger@de.ibm.com>
+ *
+ */
+
+#ifndef _S390_KEXEC_H
+#define _S390_KEXEC_H
+
+#include <asm/page.h>
+#include <asm/processor.h>
+/*
+ * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return.
+ * I.e. Maximum page that is mapped directly into kernel memory,
+ * and kmap is not required.
+ */
+
+/* Maximum physical address we can use pages from */
+#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
+
+/* Maximum address we can reach in physical address mode */
+#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
+
+/* Maximum address we can use for the control pages */
+/* Not more than 2GB; 1UL keeps the shift out of the sign bit of an int */
+#define KEXEC_CONTROL_MEMORY_LIMIT (1UL<<31)
+
+/* Allocate one page for the pdp and the second for the code */
+#define KEXEC_CONTROL_CODE_SIZE 4096
+
+/* The native architecture */
+#define KEXEC_ARCH KEXEC_ARCH_S390
+
+#define MAX_NOTE_BYTES 1024
+typedef u32 note_buf_t[MAX_NOTE_BYTES/4];
+
+extern note_buf_t crash_notes[];
+
+#endif /*_S390_KEXEC_H */
diff --git a/include/asm-s390/unistd.h b/include/asm-s390/unistd.h
index f1a204f7c0f0..363db45f8d07 100644
--- a/include/asm-s390/unistd.h
+++ b/include/asm-s390/unistd.h
@@ -269,7 +269,7 @@
 #define __NR_mq_timedreceive	274
 #define __NR_mq_notify		275
 #define __NR_mq_getsetattr	276
-/* Number 277 is reserved for new sys_kexec_load */
+#define __NR_kexec_load		277
 #define __NR_add_key		278
 #define __NR_request_key	279
 #define __NR_keyctl		280
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index e3fc35f4e35f..0653a27c3d72 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -114,6 +114,7 @@ extern struct kimage *kexec_image;
 #define KEXEC_ARCH_PPC     (20 << 16)
 #define KEXEC_ARCH_PPC64   (21 << 16)
 #define KEXEC_ARCH_IA_64   (50 << 16)
+#define KEXEC_ARCH_S390    (22 << 16)
 
 #define KEXEC_FLAGS    (KEXEC_ON_CRASH)  /* List of defined/legal kexec flags */
 
-- 
cgit v1.2.3-59-g8ed1b


From 92aa63a5a1bf2e7b0c79e6716d24b76dbbdcf951 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Sat, 25 Jun 2005 14:58:18 -0700
Subject: [PATCH] kdump: Retrieve saved max pfn

This patch retrieves the max_pfn being used by previous kernel and stores it
in a safe location (saved_max_pfn) before it is overwritten due to user
defined memory map.  This pfn is used to make sure that user does not try to
read the physical memory beyond saved_max_pfn.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/setup.c | 12 ++++++++++++
 include/linux/bootmem.h  |  4 ++++
 mm/bootmem.c             |  8 ++++++++
 3 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index d88ebdfa6ccd..8d58a053e12e 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -59,6 +59,9 @@
 #include "setup_arch_pre.h"
 #include <bios_ebda.h>
 
+/* Forward Declaration. */
+void __init find_max_pfn(void);
+
 /* This value is set up by the early boot code to point to the value
    immediately after the boot time page tables.  It contains a *physical*
    address, and must not be in the .bss segment! */
@@ -736,6 +739,15 @@ static void __init parse_cmdline_early (char ** cmdline_p)
 			if (to != command_line)
 				to--;
 			if (!memcmp(from+7, "exactmap", 8)) {
+#ifdef CONFIG_CRASH_DUMP
+				/* If we are doing a crash dump, we
+				 * still need to know the real mem
+				 * size before original memory map is
+				 * reset.
+				 */
+				find_max_pfn();
+				saved_max_pfn = max_pfn;
+#endif
 				from += 8+7;
 				e820.nr_map = 0;
 				userdef = 1;
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 500f451ce0c0..82bd8842d11c 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -22,6 +22,10 @@ extern unsigned long min_low_pfn;
  */
 extern unsigned long max_pfn;
 
+#ifdef CONFIG_CRASH_DUMP
+extern unsigned long saved_max_pfn;
+#endif
+
 /*
  * node_bootmem_map is a map pointer - the bits represent all physical 
  * memory pages (including holes) on the node.
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f82f7aebbee3..45275f1f8947 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -33,6 +33,14 @@ EXPORT_SYMBOL(max_pfn);		/* This is exported so
 				 * dma_get_required_mask(), which uses
 				 * it, can be an inline function */
 
+#ifdef CONFIG_CRASH_DUMP
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
+#endif
+
 /* return the number of _pages_ that will be allocated for the boot bitmap */
 unsigned long __init bootmem_bootmap_pages (unsigned long pages)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 60e64d46a58236e3c718074372cab6a5b56a3b15 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Sat, 25 Jun 2005 14:58:19 -0700
Subject: [PATCH] kdump: Routines for copying dump pages

This patch provides the interfaces necessary to read the dump contents,
treating it as a high memory device.

Signed off by Hariprasad Nellitheertha <hari@in.ibm.com>
Signed-off-by: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/highmem.c     | 18 +++++++++++++++++
 include/asm-i386/highmem.h |  1 +
 include/linux/crash_dump.h | 13 ++++++++++++
 include/linux/highmem.h    |  1 +
 kernel/Makefile            |  1 +
 kernel/crash_dump.c        | 49 ++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 83 insertions(+)
 create mode 100644 include/linux/crash_dump.h
 create mode 100644 kernel/crash_dump.c

(limited to 'include/linux')

diff --git a/arch/i386/mm/highmem.c b/arch/i386/mm/highmem.c
index 4b7aaf99d7ea..b6eb4dcb8777 100644
--- a/arch/i386/mm/highmem.c
+++ b/arch/i386/mm/highmem.c
@@ -75,6 +75,24 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
 	preempt_check_resched();
 }
 
+/* Same as kmap_atomic(), but takes a pfn so it can map memory that has no
+ * struct page.  Raises the preempt count and returns the per-cpu slot address.
+ */
+void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
+{
+	enum fixed_addresses idx;
+	unsigned long vaddr;
+
+	inc_preempt_count();
+
+	idx = type + KM_TYPE_NR*smp_processor_id();
+	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+	set_pte(kmap_pte-idx, pfn_pte(pfn, kmap_prot));
+	__flush_tlb_one(vaddr);
+
+	return (void*) vaddr;
+}
+
 struct page *kmap_atomic_to_page(void *ptr)
 {
 	unsigned long idx, vaddr = (unsigned long)ptr;
diff --git a/include/asm-i386/highmem.h b/include/asm-i386/highmem.h
index 1df42bf347df..0fd331306b60 100644
--- a/include/asm-i386/highmem.h
+++ b/include/asm-i386/highmem.h
@@ -70,6 +70,7 @@ void *kmap(struct page *page);
 void kunmap(struct page *page);
 void *kmap_atomic(struct page *page, enum km_type type);
 void kunmap_atomic(void *kvaddr, enum km_type type);
+void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
 struct page *kmap_atomic_to_page(void *ptr);
 
 #define flush_cache_kmaps()	do { } while (0)
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
new file mode 100644
index 000000000000..7d983b817429
--- /dev/null
+++ b/include/linux/crash_dump.h
@@ -0,0 +1,13 @@
+#ifndef LINUX_CRASH_DUMP_H
+#define LINUX_CRASH_DUMP_H
+
+#ifdef CONFIG_CRASH_DUMP
+#include <linux/kexec.h>
+#include <linux/smp_lock.h>
+#include <linux/device.h>
+#include <linux/proc_fs.h>
+
+extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
+						unsigned long, int);
+#endif /* CONFIG_CRASH_DUMP */
+#endif /* LINUX_CRASHDUMP_H */
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 2a7e6c65c882..6bece9280eb7 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -28,6 +28,7 @@ static inline void *kmap(struct page *page)
 
 #define kmap_atomic(page, idx)		page_address(page)
 #define kunmap_atomic(addr, idx)	do { } while (0)
+#define kmap_atomic_pfn(pfn, idx)	page_address(pfn_to_page(pfn))
 #define kmap_atomic_to_page(ptr)	virt_to_page(ptr)
 
 #endif /* CONFIG_HIGHMEM */
diff --git a/kernel/Makefile b/kernel/Makefile
index cfc8b0dea950..cb05cd05d237 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..5a1e6d5d203e
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,49 @@
+/*
+ *	kernel/crash_dump.c - Memory preserving reboot related code.
+ *
+ *	Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *	Copyright (C) IBM Corporation, 2004. All rights reserved
+ */
+
+#include <linux/smp_lock.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/crash_dump.h>
+
+#include <asm/io.h>
+#include <asm/uaccess.h>
+
+/*
+ * Copy a page from "oldmem". For this page, there is no pte mapped
+ * in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+				size_t csize, unsigned long offset, int userbuf)
+{
+	void *page, *vaddr;
+
+	if (!csize)
+		return 0;
+
+	page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
+	copy_page(page, vaddr);
+	kunmap_atomic(vaddr, KM_PTE0);
+
+	if (userbuf) {
+		if (copy_to_user(buf, (page + offset), csize)) {
+			kfree(page);
+			return -EFAULT;
+		}
+	} else {
+		memcpy(buf, (page + offset), csize);
+	}
+
+	kfree(page);
+	return csize;
+}
-- 
cgit v1.2.3-59-g8ed1b


From 2030eae52b416a9a9f0ffda74c982b7f1e19496d Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Sat, 25 Jun 2005 14:58:20 -0700
Subject: [PATCH] Retrieve elfcorehdr address from command line

This patch adds support for retrieving the address of the ELF core header
when one is passed on the kernel command line (elfcorehdr=).

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/kernel-parameters.txt | 4 ++++
 arch/i386/kernel/setup.c            | 8 ++++++++
 include/linux/crash_dump.h          | 1 +
 kernel/crash_dump.c                 | 3 +++
 4 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 560ff5ae3fd9..f44bb5567c5b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -451,6 +451,10 @@ running once the system is up.
 			Format: {"as"|"cfq"|"deadline"|"noop"}
 			See Documentation/block/as-iosched.txt
 			and Documentation/block/deadline-iosched.txt for details.
+	elfcorehdr=	[IA-32]
+			Specifies the physical address of the start of the
+			kernel core image ELF header.
+			See Documentation/kdump.txt for details.
 
 	enforcing	[SELINUX] Set initial enforcing status.
 			Format: {"0" | "1"}
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 8d58a053e12e..7306353c520e 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -44,6 +44,7 @@
 #include <linux/edd.h>
 #include <linux/nodemask.h>
 #include <linux/kexec.h>
+#include <linux/crash_dump.h>
 
 #include <video/edid.h>
 
@@ -881,6 +882,13 @@ static void __init parse_cmdline_early (char ** cmdline_p)
 			}
 		}
 #endif
+#ifdef CONFIG_CRASH_DUMP
+		/* elfcorehdr= specifies the location of elf core header
+		 * stored by the crashed kernel.
+		 */
+		else if (!memcmp(from, "elfcorehdr=", 11))
+			elfcorehdr_addr = memparse(from+11, &from);
+#endif
 
 		/*
 		 * highmem=size forces highmem to be exactly 'size' bytes.
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 7d983b817429..3f25fd1eaa4b 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -7,6 +7,7 @@
 #include <linux/device.h>
 #include <linux/proc_fs.h>
 
+extern unsigned long long elfcorehdr_addr;
 extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
 						unsigned long, int);
 #endif /* CONFIG_CRASH_DUMP */
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index 5a1e6d5d203e..10b966c3744c 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -15,6 +15,9 @@
 #include <asm/io.h>
 #include <asm/uaccess.h>
 
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr;
+
 /*
  * Copy a page from "oldmem". For this page, there is no pte mapped
  * in the current kernel. We stitch up a pte, similar to kmap_atomic.
-- 
cgit v1.2.3-59-g8ed1b


From 666bfddbe8b8fd4fd44617d6c55193d5ac7edb29 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Sat, 25 Jun 2005 14:58:21 -0700
Subject: [PATCH] kdump: Access dump file in elf format (/proc/vmcore)

From: "Vivek Goyal" <vgoyal@in.ibm.com>

o Support for /proc/vmcore interface. This interface exports elf core image
  either in ELF32 or ELF64 format, depending on the format in which elf headers
  have been stored by crashed kernel.
o Added support for CONFIG_VMCORE config option.
o Removed the dependency on /proc/kcore.

From: "Eric W. Biederman" <ebiederm@xmission.com>

This patch has been refactored to more closely match the prevailing style in
the affected files.  And to clearly indicate the dependency between
/proc/kcore and proc/vmcore.c

From: Hariprasad Nellitheertha <hari@in.ibm.com>

This patch contains the code that provides an ELF format interface to the
previous kernel's memory post kexec reboot.

Signed-off-by: Hariprasad Nellitheertha <hari@in.ibm.com>
Signed-off-by: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/Kconfig                 |   6 +
 fs/proc/Makefile           |   1 +
 fs/proc/proc_misc.c        |   6 +
 fs/proc/vmcore.c           | 451 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/crash_dump.h |   4 +
 include/linux/proc_fs.h    |   7 +
 kernel/crash_dump.c        |   2 +-
 7 files changed, 476 insertions(+), 1 deletion(-)
 create mode 100644 fs/proc/vmcore.c

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index 8157f2e2d515..062177956239 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -734,6 +734,12 @@ config PROC_KCORE
 	bool "/proc/kcore support" if !ARM
 	depends on PROC_FS && MMU
 
+config PROC_VMCORE
+        bool "/proc/vmcore support (EXPERIMENTAL)"
+        depends on PROC_FS && EMBEDDED && EXPERIMENTAL && CRASH_DUMP
+        help
+        Exports the dump image of crashed kernel in ELF format.
+
 config SYSFS
 	bool "sysfs file system support" if EMBEDDED
 	default y
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 738b9b602932..7431d7ba2d09 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -11,4 +11,5 @@ proc-y       += inode.o root.o base.o generic.o array.o \
 		kmsg.o proc_tty.o proc_misc.o
 
 proc-$(CONFIG_PROC_KCORE)	+= kcore.o
+proc-$(CONFIG_PROC_VMCORE)	+= vmcore.o
 proc-$(CONFIG_PROC_DEVICETREE)	+= proc_devtree.o
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 94b570ad037d..a3453555a94e 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -44,6 +44,7 @@
 #include <linux/jiffies.h>
 #include <linux/sysrq.h>
 #include <linux/vmalloc.h>
+#include <linux/crash_dump.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/io.h>
@@ -618,6 +619,11 @@ void __init proc_misc_init(void)
 				(size_t)high_memory - PAGE_OFFSET + PAGE_SIZE;
 	}
 #endif
+#ifdef CONFIG_PROC_VMCORE
+	proc_vmcore = create_proc_entry("vmcore", S_IRUSR, NULL);
+	if (proc_vmcore)
+		proc_vmcore->proc_fops = &proc_vmcore_operations;
+#endif
 #ifdef CONFIG_MAGIC_SYSRQ
 	entry = create_proc_entry("sysrq-trigger", S_IWUSR, NULL);
 	if (entry)
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
new file mode 100644
index 000000000000..8ad467855845
--- /dev/null
+++ b/fs/proc/vmcore.c
@@ -0,0 +1,451 @@
+/*
+ *	fs/proc/vmcore.c Interface for accessing the crash
+ * 				 dump from the system's previous life.
+ * 	Heavily borrowed from fs/proc/kcore.c
+ *	Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *	Copyright (C) IBM Corporation, 2004. All rights reserved
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/user.h>
+#include <linux/a.out.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/proc_fs.h>
+#include <linux/highmem.h>
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/crash_dump.h>
+#include <linux/list.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+/* List representing chunks of contiguous memory areas and their offsets in
+ * vmcore file.
+ */
+static LIST_HEAD(vmcore_list);
+
+/* Stores the pointer to the buffer containing kernel elf core headers. */
+static char *elfcorebuf;
+static size_t elfcorebuf_sz;
+
+/* Total size of vmcore file. */
+static u64 vmcore_size;
+
+struct proc_dir_entry *proc_vmcore = NULL;
+
+/* Reads a page from the oldmem device from given offset. */
+static ssize_t read_from_oldmem(char *buf, size_t count,
+			     loff_t *ppos, int userbuf)
+{
+	unsigned long pfn, offset;
+	size_t nr_bytes;
+	ssize_t read = 0, tmp;
+
+	if (!count)
+		return 0;
+
+	offset = (unsigned long)(*ppos % PAGE_SIZE);
+	pfn = (unsigned long)(*ppos / PAGE_SIZE);
+	if (pfn > saved_max_pfn)
+		return -EINVAL;
+
+	do {
+		if (count > (PAGE_SIZE - offset))
+			nr_bytes = PAGE_SIZE - offset;
+		else
+			nr_bytes = count;
+
+		tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf);
+		if (tmp < 0)
+			return tmp;
+		*ppos += nr_bytes;
+		count -= nr_bytes;
+		buf += nr_bytes;
+		read += nr_bytes;
+		++pfn;
+		offset = 0;
+	} while (count);
+
+	return read;
+}
+
+/* Maps vmcore file offset to respective physical address in memory. */
+static u64 map_offset_to_paddr(loff_t offset, struct list_head *vc_list,
+					struct vmcore **m_ptr)
+{
+	struct vmcore *m;
+	u64 paddr;
+
+	list_for_each_entry(m, vc_list, list) {
+		u64 start, end;
+		start = m->offset;
+		end = m->offset + m->size - 1;
+		if (offset >= start && offset <= end) {
+			paddr = m->paddr + offset - start;
+			*m_ptr = m;
+			return paddr;
+		}
+	}
+	*m_ptr = NULL;
+	return 0;
+}
+
+/* Read from the ELF header and then the crash dump. On error, a negative value
+ * is returned; otherwise the number of bytes read is returned.
+ */
+static ssize_t read_vmcore(struct file *file, char __user *buffer,
+				size_t buflen, loff_t *fpos)
+{
+	ssize_t acc = 0, tmp;
+	size_t tsz, nr_bytes;
+	u64 start;
+	struct vmcore *curr_m = NULL;
+
+	if (buflen == 0 || *fpos >= vmcore_size)
+		return 0;
+
+	/* trim buflen to not go beyond EOF */
+	if (buflen > vmcore_size - *fpos)
+		buflen = vmcore_size - *fpos;
+
+	/* Read ELF core header */
+	if (*fpos < elfcorebuf_sz) {
+		tsz = elfcorebuf_sz - *fpos;
+		if (buflen < tsz)
+			tsz = buflen;
+		if (copy_to_user(buffer, elfcorebuf + *fpos, tsz))
+			return -EFAULT;
+		buflen -= tsz;
+		*fpos += tsz;
+		buffer += tsz;
+		acc += tsz;
+
+		/* leave now if filled buffer already */
+		if (buflen == 0)
+			return acc;
+	}
+
+	start = map_offset_to_paddr(*fpos, &vmcore_list, &curr_m);
+	if (!curr_m)
+        	return -EINVAL;
+	if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
+		tsz = buflen;
+
+	/* Calculate left bytes in current memory segment. */
+	nr_bytes = (curr_m->size - (start - curr_m->paddr));
+	if (tsz > nr_bytes)
+		tsz = nr_bytes;
+
+	while (buflen) {
+		tmp = read_from_oldmem(buffer, tsz, &start, 1);
+		if (tmp < 0)
+			return tmp;
+		buflen -= tsz;
+		*fpos += tsz;
+		buffer += tsz;
+		acc += tsz;
+		if (start >= (curr_m->paddr + curr_m->size)) {
+			if (curr_m->list.next == &vmcore_list)
+				return acc;	/*EOF*/
+			curr_m = list_entry(curr_m->list.next,
+						struct vmcore, list);
+			start = curr_m->paddr;
+		}
+		if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen)
+			tsz = buflen;
+		/* Calculate left bytes in current memory segment. */
+		nr_bytes = (curr_m->size - (start - curr_m->paddr));
+		if (tsz > nr_bytes)
+			tsz = nr_bytes;
+	}
+	return acc;
+}
+
+static int open_vmcore(struct inode *inode, struct file *filp)
+{
+	return 0;
+}
+
+struct file_operations proc_vmcore_operations = {
+	.read		= read_vmcore,
+	.open		= open_vmcore,
+};
+
+static struct vmcore* __init get_new_element(void)
+{
+	struct vmcore *p;
+
+	p = kmalloc(sizeof(*p), GFP_KERNEL);
+	if (p)
+		memset(p, 0, sizeof(*p));
+	return p;
+}
+
+static u64 __init get_vmcore_size_elf64(char *elfptr)
+{
+	int i;
+	u64 size;
+	Elf64_Ehdr *ehdr_ptr;
+	Elf64_Phdr *phdr_ptr;
+
+	ehdr_ptr = (Elf64_Ehdr *)elfptr;
+	phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
+	size = sizeof(Elf64_Ehdr) + ((ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr));
+	for (i = 0; i < ehdr_ptr->e_phnum; i++) {
+		size += phdr_ptr->p_memsz;
+		phdr_ptr++;
+	}
+	return size;
+}
+
+/* Merges all the PT_NOTE headers into one. */
+static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
+						struct list_head *vc_list)
+{
+	int i, nr_ptnote=0, rc=0;
+	char *tmp;
+	Elf64_Ehdr *ehdr_ptr;
+	Elf64_Phdr phdr, *phdr_ptr;
+	Elf64_Nhdr *nhdr_ptr;
+	u64 phdr_sz = 0, note_off;
+
+	ehdr_ptr = (Elf64_Ehdr *)elfptr;
+	phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr));
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		int j;
+		void *notes_section;
+		struct vmcore *new;
+		u64 offset, max_sz, sz, real_sz = 0;
+		if (phdr_ptr->p_type != PT_NOTE)
+			continue;
+		nr_ptnote++;
+		max_sz = phdr_ptr->p_memsz;
+		offset = phdr_ptr->p_offset;
+		notes_section = kmalloc(max_sz, GFP_KERNEL);
+		if (!notes_section)
+			return -ENOMEM;
+		rc = read_from_oldmem(notes_section, max_sz, &offset, 0);
+		if (rc < 0) {
+			kfree(notes_section);
+			return rc;
+		}
+		nhdr_ptr = notes_section;
+		for (j = 0; j < max_sz; j += sz) {
+			if (nhdr_ptr->n_namesz == 0)
+				break;
+			sz = sizeof(Elf64_Nhdr) +
+				((nhdr_ptr->n_namesz + 3) & ~3) +
+				((nhdr_ptr->n_descsz + 3) & ~3);
+			real_sz += sz;
+			nhdr_ptr = (Elf64_Nhdr*)((char*)nhdr_ptr + sz);
+		}
+
+		/* Add this contiguous chunk of notes section to vmcore list.*/
+		new = get_new_element();
+		if (!new) {
+			kfree(notes_section);
+			return -ENOMEM;
+		}
+		new->paddr = phdr_ptr->p_offset;
+		new->size = real_sz;
+		list_add_tail(&new->list, vc_list);
+		phdr_sz += real_sz;
+		kfree(notes_section);
+	}
+
+	/* Prepare merged PT_NOTE program header. */
+	phdr.p_type    = PT_NOTE;
+	phdr.p_flags   = 0;
+	note_off = sizeof(Elf64_Ehdr) +
+			(ehdr_ptr->e_phnum - nr_ptnote +1) * sizeof(Elf64_Phdr);
+	phdr.p_offset  = note_off;
+	phdr.p_vaddr   = phdr.p_paddr = 0;
+	phdr.p_filesz  = phdr.p_memsz = phdr_sz;
+	phdr.p_align   = 0;
+
+	/* Add merged PT_NOTE program header*/
+	tmp = elfptr + sizeof(Elf64_Ehdr);
+	memcpy(tmp, &phdr, sizeof(phdr));
+	tmp += sizeof(phdr);
+
+	/* Remove unwanted PT_NOTE program headers. */
+	i = (nr_ptnote - 1) * sizeof(Elf64_Phdr);
+	*elfsz = *elfsz - i;
+	memmove(tmp, tmp+i, ((*elfsz)-sizeof(Elf64_Ehdr)-sizeof(Elf64_Phdr)));
+
+	/* Modify e_phnum to reflect merged headers. */
+	ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
+
+	return 0;
+}
+
+/* Add memory chunks represented by program headers to vmcore list. Also update
+ * the new offset fields of exported program headers. */
+static int __init process_ptload_program_headers_elf64(char *elfptr,
+						size_t elfsz,
+						struct list_head *vc_list)
+{
+	int i;
+	Elf64_Ehdr *ehdr_ptr;
+	Elf64_Phdr *phdr_ptr;
+	loff_t vmcore_off;
+	struct vmcore *new;
+
+	ehdr_ptr = (Elf64_Ehdr *)elfptr;
+	phdr_ptr = (Elf64_Phdr*)(elfptr + sizeof(Elf64_Ehdr)); /* PT_NOTE hdr */
+
+	/* First program header is PT_NOTE header. */
+	vmcore_off = sizeof(Elf64_Ehdr) +
+			(ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr) +
+			phdr_ptr->p_memsz; /* Note sections */
+
+	for (i = 0; i < ehdr_ptr->e_phnum; i++, phdr_ptr++) {
+		if (phdr_ptr->p_type != PT_LOAD)
+			continue;
+
+		/* Add this contiguous chunk of memory to vmcore list.*/
+		new = get_new_element();
+		if (!new)
+			return -ENOMEM;
+		new->paddr = phdr_ptr->p_offset;
+		new->size = phdr_ptr->p_memsz;
+		list_add_tail(&new->list, vc_list);
+
+		/* Update the program header offset. */
+		phdr_ptr->p_offset = vmcore_off;
+		vmcore_off = vmcore_off + phdr_ptr->p_memsz;
+	}
+	return 0;
+}
+
+/* Sets offset fields of vmcore elements. */
+static void __init set_vmcore_list_offsets_elf64(char *elfptr,
+						struct list_head *vc_list)
+{
+	loff_t vmcore_off;
+	Elf64_Ehdr *ehdr_ptr;
+	struct vmcore *m;
+
+	ehdr_ptr = (Elf64_Ehdr *)elfptr;
+
+	/* Skip Elf header and program headers. */
+	vmcore_off = sizeof(Elf64_Ehdr) +
+			(ehdr_ptr->e_phnum) * sizeof(Elf64_Phdr);
+
+	list_for_each_entry(m, vc_list, list) {
+		m->offset = vmcore_off;
+		vmcore_off += m->size;
+	}
+}
+
+static int __init parse_crash_elf64_headers(void)
+{
+	int rc=0;
+	Elf64_Ehdr ehdr;
+	u64 addr;
+
+	addr = elfcorehdr_addr;
+
+	/* Read Elf header */
+	rc = read_from_oldmem((char*)&ehdr, sizeof(Elf64_Ehdr), &addr, 0);
+	if (rc < 0)
+		return rc;
+
+	/* Do some basic Verification. */
+	if (memcmp(ehdr.e_ident, ELFMAG, SELFMAG) != 0 ||
+		(ehdr.e_type != ET_CORE) ||
+		!elf_check_arch(&ehdr) ||
+		ehdr.e_ident[EI_CLASS] != ELFCLASS64 ||
+		ehdr.e_ident[EI_VERSION] != EV_CURRENT ||
+		ehdr.e_version != EV_CURRENT ||
+		ehdr.e_ehsize != sizeof(Elf64_Ehdr) ||
+		ehdr.e_phentsize != sizeof(Elf64_Phdr) ||
+		ehdr.e_phnum == 0) {
+		printk(KERN_WARNING "Warning: Core image elf header is not"
+					"sane\n");
+		return -EINVAL;
+	}
+
+	/* Read in all elf headers. */
+	elfcorebuf_sz = sizeof(Elf64_Ehdr) + ehdr.e_phnum * sizeof(Elf64_Phdr);
+	elfcorebuf = kmalloc(elfcorebuf_sz, GFP_KERNEL);
+	if (!elfcorebuf)
+		return -ENOMEM;
+	addr = elfcorehdr_addr;
+	rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz, &addr, 0);
+	if (rc < 0) {
+		kfree(elfcorebuf);
+		return rc;
+	}
+
+	/* Merge all PT_NOTE headers into one. */
+	rc = merge_note_headers_elf64(elfcorebuf, &elfcorebuf_sz, &vmcore_list);
+	if (rc) {
+		kfree(elfcorebuf);
+		return rc;
+	}
+	rc = process_ptload_program_headers_elf64(elfcorebuf, elfcorebuf_sz,
+							&vmcore_list);
+	if (rc) {
+		kfree(elfcorebuf);
+		return rc;
+	}
+	set_vmcore_list_offsets_elf64(elfcorebuf, &vmcore_list);
+	return 0;
+}
+
+static int __init parse_crash_elf_headers(void)
+{
+	unsigned char e_ident[EI_NIDENT];
+	u64 addr;
+	int rc=0;
+
+	addr = elfcorehdr_addr;
+	rc = read_from_oldmem(e_ident, EI_NIDENT, &addr, 0);
+	if (rc < 0)
+		return rc;
+	if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) {
+		printk(KERN_WARNING "Warning: Core image elf header"
+					" not found\n");
+		return -EINVAL;
+	}
+
+	if (e_ident[EI_CLASS] == ELFCLASS64) {
+		rc = parse_crash_elf64_headers();
+		if (rc)
+			return rc;
+
+		/* Determine vmcore size. */
+		vmcore_size = get_vmcore_size_elf64(elfcorebuf);
+	} else {
+		printk(KERN_WARNING "Warning: Core image elf header is not"
+					" sane\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/* Init function for vmcore module. */
+static int __init vmcore_init(void)
+{
+	int rc = 0;
+
+	/* If elfcorehdr= has been passed in cmdline, then capture the dump.*/
+	if (!(elfcorehdr_addr < ELFCORE_ADDR_MAX))
+		return rc;
+	rc = parse_crash_elf_headers();
+	if (rc) {
+		printk(KERN_WARNING "Kdump: vmcore not initialized\n");
+		return rc;
+	}
+
+	/* Initialize /proc/vmcore size if proc is already up. */
+	if (proc_vmcore)
+		proc_vmcore->size = vmcore_size;
+	return 0;
+}
+module_init(vmcore_init)
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index 3f25fd1eaa4b..534d750d922d 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -7,8 +7,12 @@
 #include <linux/device.h>
 #include <linux/proc_fs.h>
 
+#define ELFCORE_ADDR_MAX	(-1ULL)
 extern unsigned long long elfcorehdr_addr;
 extern ssize_t copy_oldmem_page(unsigned long, char *, size_t,
 						unsigned long, int);
+extern struct file_operations proc_vmcore_operations;
+extern struct proc_dir_entry *proc_vmcore;
+
 #endif /* CONFIG_CRASH_DUMP */
 #endif /* LINUX_CRASHDUMP_H */
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 59e505261fd6..0563581e3a02 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -74,6 +74,13 @@ struct kcore_list {
 	size_t size;
 };
 
+struct vmcore {
+	struct list_head list;
+	unsigned long long paddr;
+	unsigned long size;
+	loff_t offset;
+};
+
 #ifdef CONFIG_PROC_FS
 
 extern struct proc_dir_entry proc_root;
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index 10b966c3744c..459ba49e376a 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -16,7 +16,7 @@
 #include <asm/uaccess.h>
 
 /* Stores the physical address of elf header of crash image. */
-unsigned long long elfcorehdr_addr;
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
 
 /*
  * Copy a page from "oldmem". For this page, there is no pte mapped
-- 
cgit v1.2.3-59-g8ed1b


From 6e274d144302068a00794ec22e73520c0615cb6f Mon Sep 17 00:00:00 2001
From: Alexander Nyberg <alexn@telia.com>
Date: Sat, 25 Jun 2005 14:58:26 -0700
Subject: [PATCH] kdump: Use real pt_regs from exception

Makes crash_kexec() take a pt_regs * as an argument.  This allows the exact
register state at the point of the crash to be captured.  If we come from a
direct panic() call, NULL is passed and the current registers are saved
before the crash dump.

This hooks into two places:
die(): check the conditions under which we will panic when calling
do_exit and go there directly with the pt_regs that caused the fatal
fault.

die_nmi(): If we receive an NMI lockup while in the kernel use the
pt_regs and go directly to crash_kexec(). We're probably nested up badly
at this point so this might be the only chance to escape with proper
information.

Signed-off-by: Alexander Nyberg <alexn@telia.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/crash.c          | 36 ++++++++++++++++++++++++------------
 arch/i386/kernel/traps.c          | 17 +++++++++++++++++
 arch/ppc/kernel/machine_kexec.c   |  2 +-
 arch/ppc64/kernel/machine_kexec.c |  2 +-
 arch/s390/kernel/crash.c          |  2 +-
 arch/x86_64/kernel/crash.c        |  2 +-
 drivers/char/sysrq.c              |  2 +-
 include/linux/kexec.h             |  8 ++++++--
 include/linux/reboot.h            |  3 ++-
 kernel/kexec.c                    | 13 +++++++++++--
 kernel/panic.c                    |  2 +-
 11 files changed, 66 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index a021681d21f8..8bdb4b6af0ff 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -100,12 +100,31 @@ static void crash_get_current_regs(struct pt_regs *regs)
 	regs->eip = (unsigned long)current_text_addr();
 }
 
-static void crash_save_self(void)
+/* CPU does not save ss and esp on stack if execution is already
+ * running in kernel mode at the time of NMI occurrence. This code
+ * fixes it.
+ */
+static void crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs)
+{
+	memcpy(newregs, oldregs, sizeof(*newregs));
+	newregs->esp = (unsigned long)&(oldregs->esp);
+	__asm__ __volatile__("xorl %eax, %eax;");
+	__asm__ __volatile__ ("movw %%ss, %%ax;" :"=a"(newregs->xss));
+}
+
+/* We may have saved_regs from where the error came from
+ * or it is NULL if via a direct panic().
+ */
+static void crash_save_self(struct pt_regs *saved_regs)
 {
 	struct pt_regs regs;
 	int cpu;
 	cpu = smp_processor_id();
-	crash_get_current_regs(&regs);
+
+	if (saved_regs)
+		crash_setup_regs(&regs, saved_regs);
+	else
+		crash_get_current_regs(&regs);
 	crash_save_this_cpu(&regs, cpu);
 }
 
@@ -124,15 +143,8 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu)
 		return 1;
 	local_irq_disable();
 
-	/* CPU does not save ss and esp on stack if execution is already
-	 * running in kernel mode at the time of NMI occurrence. This code
-	 * fixes it.
-	 */
 	if (!user_mode(regs)) {
-		memcpy(&fixed_regs, regs, sizeof(*regs));
-		fixed_regs.esp = (unsigned long)&(regs->esp);
-		__asm__ __volatile__("xorl %eax, %eax;");
-		__asm__ __volatile__ ("movw %%ss, %%ax;" :"=a"(fixed_regs.xss));
+		crash_setup_regs(&fixed_regs, regs);
 		regs = &fixed_regs;
 	}
 	crash_save_this_cpu(regs, cpu);
@@ -184,7 +196,7 @@ static void nmi_shootdown_cpus(void)
 }
 #endif
 
-void machine_crash_shutdown(void)
+void machine_crash_shutdown(struct pt_regs *regs)
 {
 	/* This function is only called after the system
 	 * has paniced or is otherwise in a critical state.
@@ -204,5 +216,5 @@ void machine_crash_shutdown(void)
 #if defined(CONFIG_X86_IO_APIC)
 	disable_IO_APIC();
 #endif
-	crash_save_self();
+	crash_save_self(regs);
 }
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 207ea8ba7169..e458463ebc05 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -27,6 +27,7 @@
 #include <linux/ptrace.h>
 #include <linux/utsname.h>
 #include <linux/kprobes.h>
+#include <linux/kexec.h>
 
 #ifdef CONFIG_EISA
 #include <linux/ioport.h>
@@ -294,6 +295,9 @@ bug:
 	printk("Kernel BUG\n");
 }
 
+/* This path is taken when something in the kernel
+ * has done something bad and is about to be terminated.
+ */
 void die(const char * str, struct pt_regs * regs, long err)
 {
 	static struct {
@@ -341,6 +345,10 @@ void die(const char * str, struct pt_regs * regs, long err)
 	bust_spinlocks(0);
 	die.lock_owner = -1;
 	spin_unlock_irq(&die.lock);
+
+	if (kexec_should_crash(current))
+		crash_kexec(regs);
+
 	if (in_interrupt())
 		panic("Fatal exception in interrupt");
 
@@ -570,6 +578,15 @@ void die_nmi (struct pt_regs *regs, const char *msg)
 	console_silent();
 	spin_unlock(&nmi_print_lock);
 	bust_spinlocks(0);
+
+	/* If we are in the kernel we are probably nested up pretty badly
+	 * and might as well get out now while we still can.
+	 */
+	if (!user_mode(regs)) {
+		current->thread.trap_no = 2;
+		crash_kexec(regs);
+	}
+
 	do_exit(SIGSEGV);
 }
 
diff --git a/arch/ppc/kernel/machine_kexec.c b/arch/ppc/kernel/machine_kexec.c
index 435ad9ea0a83..b82535357d6d 100644
--- a/arch/ppc/kernel/machine_kexec.c
+++ b/arch/ppc/kernel/machine_kexec.c
@@ -34,7 +34,7 @@ void machine_shutdown(void)
 	}
 }
 
-void machine_crash_shutdown(void)
+void machine_crash_shutdown(struct pt_regs *regs)
 {
 	if (ppc_md.machine_crash_shutdown) {
 		ppc_md.machine_crash_shutdown();
diff --git a/arch/ppc64/kernel/machine_kexec.c b/arch/ppc64/kernel/machine_kexec.c
index 217965d60a45..06b25b59c8a8 100644
--- a/arch/ppc64/kernel/machine_kexec.c
+++ b/arch/ppc64/kernel/machine_kexec.c
@@ -34,7 +34,7 @@ note_buf_t crash_notes[NR_CPUS];
  * and if what it will achieve. Letting it be now to compile the code
  * in generic kexec environment
  */
-void machine_crash_shutdown(void)
+void machine_crash_shutdown(struct pt_regs *regs)
 {
 	/* do nothing right now */
 	/* smp_relase_cpus() if we want smp on panic kernel */
diff --git a/arch/s390/kernel/crash.c b/arch/s390/kernel/crash.c
index db38283c1f27..7bd169c58b0c 100644
--- a/arch/s390/kernel/crash.c
+++ b/arch/s390/kernel/crash.c
@@ -12,6 +12,6 @@
 
 note_buf_t crash_notes[NR_CPUS];
 
-void machine_crash_shutdown(void)
+void machine_crash_shutdown(struct pt_regs *regs)
 {
 }
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
index 6183bcb85257..d7fa4248501c 100644
--- a/arch/x86_64/kernel/crash.c
+++ b/arch/x86_64/kernel/crash.c
@@ -22,7 +22,7 @@
 
 note_buf_t crash_notes[NR_CPUS];
 
-void machine_crash_shutdown(void)
+void machine_crash_shutdown(struct pt_regs *regs)
 {
 	/* This function is only called after the system
 	 * has paniced or is otherwise in a critical state.
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 53b2c8fab00e..af79805b5576 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -100,7 +100,7 @@ static struct sysrq_key_op sysrq_unraw_op = {
 static void sysrq_handle_crashdump(int key, struct pt_regs *pt_regs,
 				struct tty_struct *tty)
 {
-	crash_kexec();
+	crash_kexec(pt_regs);
 }
 static struct sysrq_key_op sysrq_crashdump_op = {
 	.handler	= sysrq_handle_crashdump,
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 0653a27c3d72..7383173a3a9c 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -99,7 +99,8 @@ extern asmlinkage long compat_sys_kexec_load(unsigned long entry,
 	unsigned long flags);
 #endif
 extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order);
-extern void crash_kexec(void);
+extern void crash_kexec(struct pt_regs *);
+int kexec_should_crash(struct task_struct *);
 extern struct kimage *kexec_image;
 
 #define KEXEC_ON_CRASH  0x00000001
@@ -123,6 +124,9 @@ extern struct kimage *kexec_image;
 extern struct resource crashk_res;
 
 #else /* !CONFIG_KEXEC */
-static inline void crash_kexec(void) { }
+struct pt_regs;
+struct task_struct;
+static inline void crash_kexec(struct pt_regs *regs) { }
+static inline int kexec_should_crash(struct task_struct *p) { return 0; }
 #endif /* CONFIG_KEXEC */
 #endif /* LINUX_KEXEC_H */
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index c5a05e16edb2..2d4dd23168dd 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -52,7 +52,8 @@ extern void machine_halt(void);
 extern void machine_power_off(void);
 
 extern void machine_shutdown(void);
-extern void machine_crash_shutdown(void);
+struct pt_regs;
+extern void machine_crash_shutdown(struct pt_regs *);
 
 #endif
 
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a0411b3bd54a..277f22afe74b 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -18,6 +18,8 @@
 #include <linux/reboot.h>
 #include <linux/syscalls.h>
 #include <linux/ioport.h>
+#include <linux/hardirq.h>
+
 #include <asm/page.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -32,6 +34,13 @@ struct resource crashk_res = {
 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
 };
 
+int kexec_should_crash(struct task_struct *p)
+{
+	if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+		return 1;
+	return 0;
+}
+
 /*
  * When kexec transitions to the new kernel there is a one-to-one
  * mapping between physical and virtual addresses.  On processors
@@ -1010,7 +1019,7 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
 }
 #endif
 
-void crash_kexec(void)
+void crash_kexec(struct pt_regs *regs)
 {
 	struct kimage *image;
 	int locked;
@@ -1028,7 +1037,7 @@ void crash_kexec(void)
 	if (!locked) {
 		image = xchg(&kexec_crash_image, NULL);
 		if (image) {
-			machine_crash_shutdown();
+			machine_crash_shutdown(regs);
 			machine_kexec(image);
 		}
 		xchg(&kexec_lock, 0);
diff --git a/kernel/panic.c b/kernel/panic.c
index 66f43d33cd80..74ba5f3e46c7 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -83,7 +83,7 @@ NORET_TYPE void panic(const char * fmt, ...)
 	 * everything else.
 	 * Do we want to call this before we try to display a message?
 	 */
-	crash_kexec();
+	crash_kexec(NULL);
 
 #ifdef CONFIG_SMP
 	/*
-- 
cgit v1.2.3-59-g8ed1b


From 72414d3f1d22fc3e311b162fca95c430048d38ce Mon Sep 17 00:00:00 2001
From: Maneesh Soni <maneesh@in.ibm.com>
Date: Sat, 25 Jun 2005 14:58:28 -0700
Subject: [PATCH] kexec code cleanup

o The following patch makes purely cosmetic changes and corrects
  CodingStyle guideline issues such as those below in kexec-related files

  o braces for one line "if" statements, "for" loops,
  o more than 80 column wide lines,
  o No space after "while", "for" and "switch" key words

o Changes:
  o take-2: Removed the extra tab before "case" key words.
  o take-3: Put operator at the end of line and space before "*/"

Signed-off-by: Maneesh Soni <maneesh@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/crash.c           |  23 +--
 arch/i386/kernel/machine_kexec.c   |  16 +-
 arch/ppc/kernel/machine_kexec.c    |  30 ++--
 arch/ppc64/kernel/machine_kexec.c  |   9 +-
 arch/s390/kernel/machine_kexec.c   |   4 +-
 arch/x86_64/kernel/machine_kexec.c |  49 +++---
 drivers/char/mem.c                 |   2 +-
 include/linux/kexec.h              |  13 +-
 include/linux/syscalls.h           |   6 +-
 kernel/kexec.c                     | 302 ++++++++++++++++++++-----------------
 10 files changed, 243 insertions(+), 211 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index 8bdb4b6af0ff..e5fab12f7926 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -31,10 +31,11 @@ note_buf_t crash_notes[NR_CPUS];
 /* This keeps a track of which one is crashing cpu. */
 static int crashing_cpu;
 
-static u32 *append_elf_note(u32 *buf,
-	char *name, unsigned type, void *data, size_t data_len)
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+							       size_t data_len)
 {
 	struct elf_note note;
+
 	note.n_namesz = strlen(name) + 1;
 	note.n_descsz = data_len;
 	note.n_type   = type;
@@ -44,26 +45,28 @@ static u32 *append_elf_note(u32 *buf,
 	buf += (note.n_namesz + 3)/4;
 	memcpy(buf, data, note.n_descsz);
 	buf += (note.n_descsz + 3)/4;
+
 	return buf;
 }
 
 static void final_note(u32 *buf)
 {
 	struct elf_note note;
+
 	note.n_namesz = 0;
 	note.n_descsz = 0;
 	note.n_type   = 0;
 	memcpy(buf, &note, sizeof(note));
 }
 
-
 static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
 {
 	struct elf_prstatus prstatus;
 	u32 *buf;
-	if ((cpu < 0) || (cpu >= NR_CPUS)) {
+
+	if ((cpu < 0) || (cpu >= NR_CPUS))
 		return;
-	}
+
 	/* Using ELF notes here is opportunistic.
 	 * I need a well defined structure format
 	 * for the data I pass, and I need tags
@@ -75,9 +78,8 @@ static void crash_save_this_cpu(struct pt_regs *regs, int cpu)
 	memset(&prstatus, 0, sizeof(prstatus));
 	prstatus.pr_pid = current->pid;
 	elf_core_copy_regs(&prstatus.pr_reg, regs);
-	buf = append_elf_note(buf, "CORE", NT_PRSTATUS,
-		&prstatus, sizeof(prstatus));
-
+	buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
+				sizeof(prstatus));
 	final_note(buf);
 }
 
@@ -119,8 +121,8 @@ static void crash_save_self(struct pt_regs *saved_regs)
 {
 	struct pt_regs regs;
 	int cpu;
-	cpu = smp_processor_id();
 
+	cpu = smp_processor_id();
 	if (saved_regs)
 		crash_setup_regs(&regs, saved_regs);
 	else
@@ -153,6 +155,7 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu)
 	/* Assume hlt works */
 	__asm__("hlt");
 	for(;;);
+
 	return 1;
 }
 
@@ -169,8 +172,8 @@ static void smp_send_nmi_allbutself(void)
 static void nmi_shootdown_cpus(void)
 {
 	unsigned long msecs;
-	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
 
+	atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
 	/* Would it be better to replace the trap vector here? */
 	set_nmi_callback(crash_nmi_callback);
 	/* Ensure the new callback function is set before sending
diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c
index 671880415d1c..52ed18d8b511 100644
--- a/arch/i386/kernel/machine_kexec.c
+++ b/arch/i386/kernel/machine_kexec.c
@@ -80,7 +80,8 @@ static void identity_map_page(unsigned long address)
 	/* Identity map the page table entry */
 	pgtable_level1[level1_index] = address | L0_ATTR;
 	pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR;
-	set_64bit(&pgtable_level3[level3_index], __pa(pgtable_level2) | L2_ATTR);
+	set_64bit(&pgtable_level3[level3_index],
+					       __pa(pgtable_level2) | L2_ATTR);
 
 	/* Flush the tlb so the new mapping takes effect.
 	 * Global tlb entries are not flushed but that is not an issue.
@@ -139,8 +140,10 @@ static void load_segments(void)
 }
 
 typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)(
-	unsigned long indirection_page, unsigned long reboot_code_buffer,
-	unsigned long start_address, unsigned int has_pae) ATTRIB_NORET;
+					unsigned long indirection_page,
+					unsigned long reboot_code_buffer,
+					unsigned long start_address,
+					unsigned int has_pae) ATTRIB_NORET;
 
 const extern unsigned char relocate_new_kernel[];
 extern void relocate_new_kernel_end(void);
@@ -180,20 +183,23 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 {
 	unsigned long page_list;
 	unsigned long reboot_code_buffer;
+
 	relocate_new_kernel_t rnk;
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
 
 	/* Compute some offsets */
-	reboot_code_buffer = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+	reboot_code_buffer = page_to_pfn(image->control_code_page)
+								<< PAGE_SHIFT;
 	page_list = image->head;
 
 	/* Set up an identity mapping for the reboot_code_buffer */
 	identity_map_page(reboot_code_buffer);
 
 	/* copy it out */
-	memcpy((void *)reboot_code_buffer, relocate_new_kernel, relocate_new_kernel_size);
+	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
+						relocate_new_kernel_size);
 
 	/* The segment registers are funny things, they are
 	 * automatically loaded from a table, in memory wherever you
diff --git a/arch/ppc/kernel/machine_kexec.c b/arch/ppc/kernel/machine_kexec.c
index b82535357d6d..84d65a87191e 100644
--- a/arch/ppc/kernel/machine_kexec.c
+++ b/arch/ppc/kernel/machine_kexec.c
@@ -21,24 +21,23 @@
 #include <asm/machdep.h>
 
 typedef NORET_TYPE void (*relocate_new_kernel_t)(
-	unsigned long indirection_page, unsigned long reboot_code_buffer,
-	unsigned long start_address) ATTRIB_NORET;
+				unsigned long indirection_page,
+				unsigned long reboot_code_buffer,
+				unsigned long start_address) ATTRIB_NORET;
 
 const extern unsigned char relocate_new_kernel[];
 const extern unsigned int relocate_new_kernel_size;
 
 void machine_shutdown(void)
 {
-	if (ppc_md.machine_shutdown) {
+	if (ppc_md.machine_shutdown)
 		ppc_md.machine_shutdown();
-	}
 }
 
 void machine_crash_shutdown(struct pt_regs *regs)
 {
-	if (ppc_md.machine_crash_shutdown) {
+	if (ppc_md.machine_crash_shutdown)
 		ppc_md.machine_crash_shutdown();
-	}
 }
 
 /*
@@ -48,9 +47,8 @@ void machine_crash_shutdown(struct pt_regs *regs)
  */
 int machine_kexec_prepare(struct kimage *image)
 {
-	if (ppc_md.machine_kexec_prepare) {
+	if (ppc_md.machine_kexec_prepare)
 		return ppc_md.machine_kexec_prepare(image);
-	}
 	/*
 	 * Fail if platform doesn't provide its own machine_kexec_prepare
 	 * implementation.
@@ -60,9 +58,8 @@ int machine_kexec_prepare(struct kimage *image)
 
 void machine_kexec_cleanup(struct kimage *image)
 {
-	if (ppc_md.machine_kexec_cleanup) {
+	if (ppc_md.machine_kexec_cleanup)
 		ppc_md.machine_kexec_cleanup(image);
-	}
 }
 
 /*
@@ -71,9 +68,9 @@ void machine_kexec_cleanup(struct kimage *image)
  */
 NORET_TYPE void machine_kexec(struct kimage *image)
 {
-	if (ppc_md.machine_kexec) {
+	if (ppc_md.machine_kexec)
 		ppc_md.machine_kexec(image);
-	} else {
+	else {
 		/*
 		 * Fall back to normal restart if platform doesn't provide
 		 * its own kexec function, and user insist to kexec...
@@ -83,7 +80,6 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	for(;;);
 }
 
-
 /*
  * This is a generic machine_kexec function suitable at least for
  * non-OpenFirmware embedded platforms.
@@ -104,15 +100,15 @@ void machine_kexec_simple(struct kimage *image)
 
 	/* we need both effective and real address here */
 	reboot_code_buffer =
-		(unsigned long)page_address(image->control_code_page);
+			(unsigned long)page_address(image->control_code_page);
 	reboot_code_buffer_phys = virt_to_phys((void *)reboot_code_buffer);
 
 	/* copy our kernel relocation code to the control code page */
-	memcpy((void *)reboot_code_buffer,
-		relocate_new_kernel, relocate_new_kernel_size);
+	memcpy((void *)reboot_code_buffer, relocate_new_kernel,
+						relocate_new_kernel_size);
 
 	flush_icache_range(reboot_code_buffer,
-		reboot_code_buffer + KEXEC_CONTROL_CODE_SIZE);
+				reboot_code_buffer + KEXEC_CONTROL_CODE_SIZE);
 	printk(KERN_INFO "Bye!\n");
 
 	/* now call it */
diff --git a/arch/ppc64/kernel/machine_kexec.c b/arch/ppc64/kernel/machine_kexec.c
index 06b25b59c8a8..fdb2fc649d72 100644
--- a/arch/ppc64/kernel/machine_kexec.c
+++ b/arch/ppc64/kernel/machine_kexec.c
@@ -58,7 +58,7 @@ int machine_kexec_prepare(struct kimage *image)
 	 * handle the virtual mode, we must make sure no destination
 	 * overlaps kernel static data or bss.
 	 */
-	for(i = 0; i < image->nr_segments; i++)
+	for (i = 0; i < image->nr_segments; i++)
 		if (image->segment[i].mem < __pa(_end))
 			return -ETXTBSY;
 
@@ -76,7 +76,7 @@ int machine_kexec_prepare(struct kimage *image)
 		low = __pa(htab_address);
 		high = low + (htab_hash_mask + 1) * HASH_GROUP_SIZE;
 
-		for(i = 0; i < image->nr_segments; i++) {
+		for (i = 0; i < image->nr_segments; i++) {
 			begin = image->segment[i].mem;
 			end = begin + image->segment[i].memsz;
 
@@ -98,7 +98,7 @@ int machine_kexec_prepare(struct kimage *image)
 		low = *basep;
 		high = low + (*sizep);
 
-		for(i = 0; i < image->nr_segments; i++) {
+		for (i = 0; i < image->nr_segments; i++) {
 			begin = image->segment[i].mem;
 			end = begin + image->segment[i].memsz;
 
@@ -274,7 +274,8 @@ union thread_union kexec_stack
 
 /* Our assembly helper, in kexec_stub.S */
 extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start,
-	void *image, void *control, void (*clear_all)(void)) ATTRIB_NORET;
+					void *image, void *control,
+					void (*clear_all)(void)) ATTRIB_NORET;
 
 /* too late to fail here */
 void machine_kexec(struct kimage *image)
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index 7a94db76df46..2721c3a32b84 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -67,7 +67,7 @@ machine_kexec(struct kimage *image)
 	ctl_clear_bit(0,28);
 
 	on_each_cpu(kexec_halt_all_cpus, image, 0, 0);
-	for(;;);
+	for (;;);
 }
 
 static void
@@ -85,7 +85,7 @@ kexec_halt_all_cpus(void *kernel_image)
 	for_each_online_cpu(cpu) {
 		if (cpu == smp_processor_id())
 			continue;
-		while(!smp_cpu_not_running(cpu))
+		while (!smp_cpu_not_running(cpu))
 			cpu_relax();
 	}
 
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
index 200b5993f8d9..60d1eff41567 100644
--- a/arch/x86_64/kernel/machine_kexec.c
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -32,29 +32,31 @@
 #define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 
-static void init_level2_page(
-	u64 *level2p, unsigned long addr)
+static void init_level2_page(u64 *level2p, unsigned long addr)
 {
 	unsigned long end_addr;
+
 	addr &= PAGE_MASK;
 	end_addr = addr + LEVEL2_SIZE;
-	while(addr < end_addr) {
+	while (addr < end_addr) {
 		*(level2p++) = addr | L1_ATTR;
 		addr += LEVEL1_SIZE;
 	}
 }
 
-static int init_level3_page(struct kimage *image,
-	u64 *level3p, unsigned long addr, unsigned long last_addr)
+static int init_level3_page(struct kimage *image, u64 *level3p,
+				unsigned long addr, unsigned long last_addr)
 {
 	unsigned long end_addr;
 	int result;
+
 	result = 0;
 	addr &= PAGE_MASK;
 	end_addr = addr + LEVEL3_SIZE;
-	while((addr < last_addr) && (addr < end_addr)) {
+	while ((addr < last_addr) && (addr < end_addr)) {
 		struct page *page;
 		u64 *level2p;
+
 		page = kimage_alloc_control_pages(image, 0);
 		if (!page) {
 			result = -ENOMEM;
@@ -66,7 +68,7 @@ static int init_level3_page(struct kimage *image,
 		addr += LEVEL2_SIZE;
 	}
 	/* clear the unused entries */
-	while(addr < end_addr) {
+	while (addr < end_addr) {
 		*(level3p++) = 0;
 		addr += LEVEL2_SIZE;
 	}
@@ -75,17 +77,19 @@ out:
 }
 
 
-static int init_level4_page(struct kimage *image,
-	u64 *level4p, unsigned long addr, unsigned long last_addr)
+static int init_level4_page(struct kimage *image, u64 *level4p,
+				unsigned long addr, unsigned long last_addr)
 {
 	unsigned long end_addr;
 	int result;
+
 	result = 0;
 	addr &= PAGE_MASK;
 	end_addr = addr + LEVEL4_SIZE;
-	while((addr < last_addr) && (addr < end_addr)) {
+	while ((addr < last_addr) && (addr < end_addr)) {
 		struct page *page;
 		u64 *level3p;
+
 		page = kimage_alloc_control_pages(image, 0);
 		if (!page) {
 			result = -ENOMEM;
@@ -100,11 +104,11 @@ static int init_level4_page(struct kimage *image,
 		addr += LEVEL3_SIZE;
 	}
 	/* clear the unused entries */
-	while(addr < end_addr) {
+	while (addr < end_addr) {
 		*(level4p++) = 0;
 		addr += LEVEL3_SIZE;
 	}
- out:
+out:
 	return result;
 }
 
@@ -113,7 +117,7 @@ static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
 {
 	u64 *level4p;
 	level4p = (u64 *)__va(start_pgtable);
-	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+ 	return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
 }
 
 static void set_idt(void *newidt, u16 limit)
@@ -159,9 +163,10 @@ static void load_segments(void)
 #undef __STR
 }
 
-typedef NORET_TYPE void (*relocate_new_kernel_t)(
-	unsigned long indirection_page, unsigned long control_code_buffer,
-	unsigned long start_address, unsigned long pgtable) ATTRIB_NORET;
+typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
+					unsigned long control_code_buffer,
+					unsigned long start_address,
+					unsigned long pgtable) ATTRIB_NORET;
 
 const extern unsigned char relocate_new_kernel[];
 const extern unsigned long relocate_new_kernel_size;
@@ -172,17 +177,17 @@ int machine_kexec_prepare(struct kimage *image)
 	int result;
 
 	/* Calculate the offsets */
-	start_pgtable       = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
 	control_code_buffer = start_pgtable + 4096UL;
 
 	/* Setup the identity mapped 64bit page table */
 	result = init_pgtable(image, start_pgtable);
-	if (result) {
+	if (result)
 		return result;
-	}
 
 	/* Place the code in the reboot code buffer */
-	memcpy(__va(control_code_buffer), relocate_new_kernel, relocate_new_kernel_size);
+	memcpy(__va(control_code_buffer), relocate_new_kernel,
+						relocate_new_kernel_size);
 
 	return 0;
 }
@@ -207,8 +212,8 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	local_irq_disable();
 
 	/* Calculate the offsets */
-	page_list           = image->head;
-	start_pgtable       = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+	page_list = image->head;
+	start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
 	control_code_buffer = start_pgtable + 4096UL;
 
 	/* Set the low half of the page table to my identity mapped
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index b64108dd765b..42187381506b 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -287,7 +287,7 @@ static ssize_t read_oldmem(struct file *file, char __user *buf,
 	size_t read = 0, csize;
 	int rc = 0;
 
-	while(count) {
+	while (count) {
 		pfn = *ppos / PAGE_SIZE;
 		if (pfn > saved_max_pfn)
 			return read;
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 7383173a3a9c..c8468472aec0 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -91,14 +91,17 @@ extern NORET_TYPE void machine_kexec(struct kimage *image) ATTRIB_NORET;
 extern int machine_kexec_prepare(struct kimage *image);
 extern void machine_kexec_cleanup(struct kimage *image);
 extern asmlinkage long sys_kexec_load(unsigned long entry,
-	unsigned long nr_segments, struct kexec_segment __user *segments,
-	unsigned long flags);
+					unsigned long nr_segments,
+					struct kexec_segment __user *segments,
+					unsigned long flags);
 #ifdef CONFIG_COMPAT
 extern asmlinkage long compat_sys_kexec_load(unsigned long entry,
-	unsigned long nr_segments, struct compat_kexec_segment __user *segments,
-	unsigned long flags);
+				unsigned long nr_segments,
+				struct compat_kexec_segment __user *segments,
+				unsigned long flags);
 #endif
-extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order);
+extern struct page *kimage_alloc_control_pages(struct kimage *image,
+						unsigned int order);
 extern void crash_kexec(struct pt_regs *);
 int kexec_should_crash(struct task_struct *);
 extern struct kimage *kexec_image;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 7ba8f8f747aa..52830b6d94e5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -159,9 +159,9 @@ asmlinkage long sys_shutdown(int, int);
 asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd,
 				void __user *arg);
 asmlinkage long sys_restart_syscall(void);
-asmlinkage long sys_kexec_load(unsigned long entry,
-	unsigned long nr_segments, struct kexec_segment __user *segments,
-	unsigned long flags);
+asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
+				struct kexec_segment __user *segments,
+				unsigned long flags);
 
 asmlinkage long sys_exit(int error_code);
 asmlinkage void sys_exit_group(int error_code);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 277f22afe74b..7843548cf2d9 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -87,12 +87,15 @@ int kexec_should_crash(struct task_struct *p)
  */
 #define KIMAGE_NO_DEST (-1UL)
 
-static int kimage_is_destination_range(
-	struct kimage *image, unsigned long start, unsigned long end);
-static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
+static int kimage_is_destination_range(struct kimage *image,
+				       unsigned long start, unsigned long end);
+static struct page *kimage_alloc_page(struct kimage *image,
+				       unsigned int gfp_mask,
+				       unsigned long dest);
 
 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
-	unsigned long nr_segments, struct kexec_segment __user *segments)
+	                    unsigned long nr_segments,
+                            struct kexec_segment __user *segments)
 {
 	size_t segment_bytes;
 	struct kimage *image;
@@ -102,9 +105,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 	/* Allocate a controlling structure */
 	result = -ENOMEM;
 	image = kmalloc(sizeof(*image), GFP_KERNEL);
-	if (!image) {
+	if (!image)
 		goto out;
-	}
+
 	memset(image, 0, sizeof(*image));
 	image->head = 0;
 	image->entry = &image->head;
@@ -145,6 +148,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 	result = -EADDRNOTAVAIL;
 	for (i = 0; i < nr_segments; i++) {
 		unsigned long mstart, mend;
+
 		mstart = image->segment[i].mem;
 		mend   = mstart + image->segment[i].memsz;
 		if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
@@ -159,12 +163,13 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 	 * easy explanation as one segment stops on another.
 	 */
 	result = -EINVAL;
-	for(i = 0; i < nr_segments; i++) {
+	for (i = 0; i < nr_segments; i++) {
 		unsigned long mstart, mend;
 		unsigned long j;
+
 		mstart = image->segment[i].mem;
 		mend   = mstart + image->segment[i].memsz;
-		for(j = 0; j < i; j++) {
+		for (j = 0; j < i; j++) {
 			unsigned long pstart, pend;
 			pstart = image->segment[j].mem;
 			pend   = pstart + image->segment[j].memsz;
@@ -180,25 +185,25 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 	 * later on.
 	 */
 	result = -EINVAL;
-	for(i = 0; i < nr_segments; i++) {
+	for (i = 0; i < nr_segments; i++) {
 		if (image->segment[i].bufsz > image->segment[i].memsz)
 			goto out;
 	}
 
-
 	result = 0;
- out:
-	if (result == 0) {
+out:
+	if (result == 0)
 		*rimage = image;
-	} else {
+	else
 		kfree(image);
-	}
+
 	return result;
 
 }
 
 static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
-	unsigned long nr_segments, struct kexec_segment __user *segments)
+				unsigned long nr_segments,
+				struct kexec_segment __user *segments)
 {
 	int result;
 	struct kimage *image;
@@ -206,9 +211,9 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 	/* Allocate and initialize a controlling structure */
 	image = NULL;
 	result = do_kimage_alloc(&image, entry, nr_segments, segments);
-	if (result) {
+	if (result)
 		goto out;
-	}
+
 	*rimage = image;
 
 	/*
@@ -218,7 +223,7 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 	 */
 	result = -ENOMEM;
 	image->control_code_page = kimage_alloc_control_pages(image,
-		get_order(KEXEC_CONTROL_CODE_SIZE));
+					   get_order(KEXEC_CONTROL_CODE_SIZE));
 	if (!image->control_code_page) {
 		printk(KERN_ERR "Could not allocate control_code_buffer\n");
 		goto out;
@@ -226,16 +231,17 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 
 	result = 0;
  out:
-	if (result == 0) {
+	if (result == 0)
 		*rimage = image;
-	} else {
+	else
 		kfree(image);
-	}
+
 	return result;
 }
 
 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
-	unsigned long nr_segments, struct kexec_segment *segments)
+				unsigned long nr_segments,
+				struct kexec_segment *segments)
 {
 	int result;
 	struct kimage *image;
@@ -250,9 +256,8 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 
 	/* Allocate and initialize a controlling structure */
 	result = do_kimage_alloc(&image, entry, nr_segments, segments);
-	if (result) {
+	if (result)
 		goto out;
-	}
 
 	/* Enable the special crash kernel control page
 	 * allocation policy.
@@ -272,6 +277,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 	result = -EADDRNOTAVAIL;
 	for (i = 0; i < nr_segments; i++) {
 		unsigned long mstart, mend;
+
 		mstart = image->segment[i].mem;
 		mend = mstart + image->segment[i].memsz - 1;
 		/* Ensure we are within the crash kernel limits */
@@ -279,7 +285,6 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 			goto out;
 	}
 
-
 	/*
 	 * Find a location for the control code buffer, and add
 	 * the vector of segments so that it's pages will also be
@@ -287,80 +292,84 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 	 */
 	result = -ENOMEM;
 	image->control_code_page = kimage_alloc_control_pages(image,
-		get_order(KEXEC_CONTROL_CODE_SIZE));
+					   get_order(KEXEC_CONTROL_CODE_SIZE));
 	if (!image->control_code_page) {
 		printk(KERN_ERR "Could not allocate control_code_buffer\n");
 		goto out;
 	}
 
 	result = 0;
- out:
-	if (result == 0) {
+out:
+	if (result == 0)
 		*rimage = image;
-	} else {
+	else
 		kfree(image);
-	}
+
 	return result;
 }
 
-static int kimage_is_destination_range(
-	struct kimage *image, unsigned long start, unsigned long end)
+static int kimage_is_destination_range(struct kimage *image,
+					unsigned long start,
+					unsigned long end)
 {
 	unsigned long i;
 
 	for (i = 0; i < image->nr_segments; i++) {
 		unsigned long mstart, mend;
+
 		mstart = image->segment[i].mem;
-		mend   = mstart + image->segment[i].memsz;
-		if ((end > mstart) && (start < mend)) {
+		mend = mstart + image->segment[i].memsz;
+		if ((end > mstart) && (start < mend))
 			return 1;
-		}
 	}
+
 	return 0;
 }
 
-static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order)
+static struct page *kimage_alloc_pages(unsigned int gfp_mask,
+					unsigned int order)
 {
 	struct page *pages;
+
 	pages = alloc_pages(gfp_mask, order);
 	if (pages) {
 		unsigned int count, i;
 		pages->mapping = NULL;
 		pages->private = order;
 		count = 1 << order;
-		for(i = 0; i < count; i++) {
+		for (i = 0; i < count; i++)
 			SetPageReserved(pages + i);
-		}
 	}
+
 	return pages;
 }
 
 static void kimage_free_pages(struct page *page)
 {
 	unsigned int order, count, i;
+
 	order = page->private;
 	count = 1 << order;
-	for(i = 0; i < count; i++) {
+	for (i = 0; i < count; i++)
 		ClearPageReserved(page + i);
-	}
 	__free_pages(page, order);
 }
 
 static void kimage_free_page_list(struct list_head *list)
 {
 	struct list_head *pos, *next;
+
 	list_for_each_safe(pos, next, list) {
 		struct page *page;
 
 		page = list_entry(pos, struct page, lru);
 		list_del(&page->lru);
-
 		kimage_free_pages(page);
 	}
 }
 
-static struct page *kimage_alloc_normal_control_pages(
-	struct kimage *image, unsigned int order)
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+							unsigned int order)
 {
 	/* Control pages are special, they are the intermediaries
 	 * that are needed while we copy the rest of the pages
@@ -387,6 +396,7 @@ static struct page *kimage_alloc_normal_control_pages(
 	 */
 	do {
 		unsigned long pfn, epfn, addr, eaddr;
+
 		pages = kimage_alloc_pages(GFP_KERNEL, order);
 		if (!pages)
 			break;
@@ -395,12 +405,12 @@ static struct page *kimage_alloc_normal_control_pages(
 		addr  = pfn << PAGE_SHIFT;
 		eaddr = epfn << PAGE_SHIFT;
 		if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
-			kimage_is_destination_range(image, addr, eaddr))
-		{
+			      kimage_is_destination_range(image, addr, eaddr)) {
 			list_add(&pages->lru, &extra_pages);
 			pages = NULL;
 		}
-	} while(!pages);
+	} while (!pages);
+
 	if (pages) {
 		/* Remember the allocated page... */
 		list_add(&pages->lru, &image->control_pages);
@@ -420,12 +430,12 @@ static struct page *kimage_alloc_normal_control_pages(
 	 * For now it is simpler to just free the pages.
 	 */
 	kimage_free_page_list(&extra_pages);
-	return pages;
 
+	return pages;
 }
 
-static struct page *kimage_alloc_crash_control_pages(
-	struct kimage *image, unsigned int order)
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+						      unsigned int order)
 {
 	/* Control pages are special, they are the intermediaries
 	 * that are needed while we copy the rest of the pages
@@ -450,21 +460,22 @@ static struct page *kimage_alloc_crash_control_pages(
 	 */
 	unsigned long hole_start, hole_end, size;
 	struct page *pages;
+
 	pages = NULL;
 	size = (1 << order) << PAGE_SHIFT;
 	hole_start = (image->control_page + (size - 1)) & ~(size - 1);
 	hole_end   = hole_start + size - 1;
-	while(hole_end <= crashk_res.end) {
+	while (hole_end <= crashk_res.end) {
 		unsigned long i;
-		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) {
+
+		if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
 			break;
-		}
-		if (hole_end > crashk_res.end) {
+		if (hole_end > crashk_res.end)
 			break;
-		}
 		/* See if I overlap any of the segments */
-		for(i = 0; i < image->nr_segments; i++) {
+		for (i = 0; i < image->nr_segments; i++) {
 			unsigned long mstart, mend;
+
 			mstart = image->segment[i].mem;
 			mend   = mstart + image->segment[i].memsz - 1;
 			if ((hole_end >= mstart) && (hole_start <= mend)) {
@@ -480,18 +491,19 @@ static struct page *kimage_alloc_crash_control_pages(
 			break;
 		}
 	}
-	if (pages) {
+	if (pages)
 		image->control_page = hole_end;
-	}
+
 	return pages;
 }
 
 
-struct page *kimage_alloc_control_pages(
-	struct kimage *image, unsigned int order)
+struct page *kimage_alloc_control_pages(struct kimage *image,
+					 unsigned int order)
 {
 	struct page *pages = NULL;
-	switch(image->type) {
+
+	switch (image->type) {
 	case KEXEC_TYPE_DEFAULT:
 		pages = kimage_alloc_normal_control_pages(image, order);
 		break;
@@ -499,43 +511,46 @@ struct page *kimage_alloc_control_pages(
 		pages = kimage_alloc_crash_control_pages(image, order);
 		break;
 	}
+
 	return pages;
 }
 
 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 {
-	if (*image->entry != 0) {
+	if (*image->entry != 0)
 		image->entry++;
-	}
+
 	if (image->entry == image->last_entry) {
 		kimage_entry_t *ind_page;
 		struct page *page;
+
 		page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
-		if (!page) {
+		if (!page)
 			return -ENOMEM;
-		}
+
 		ind_page = page_address(page);
 		*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
 		image->entry = ind_page;
-		image->last_entry =
-			ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+		image->last_entry = ind_page +
+				      ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
 	}
 	*image->entry = entry;
 	image->entry++;
 	*image->entry = 0;
+
 	return 0;
 }
 
-static int kimage_set_destination(
-	struct kimage *image, unsigned long destination)
+static int kimage_set_destination(struct kimage *image,
+				   unsigned long destination)
 {
 	int result;
 
 	destination &= PAGE_MASK;
 	result = kimage_add_entry(image, destination | IND_DESTINATION);
-	if (result == 0) {
+	if (result == 0)
 		image->destination = destination;
-	}
+
 	return result;
 }
 
@@ -546,9 +561,9 @@ static int kimage_add_page(struct kimage *image, unsigned long page)
 
 	page &= PAGE_MASK;
 	result = kimage_add_entry(image, page | IND_SOURCE);
-	if (result == 0) {
+	if (result == 0)
 		image->destination += PAGE_SIZE;
-	}
+
 	return result;
 }
 
@@ -564,10 +579,11 @@ static void kimage_free_extra_pages(struct kimage *image)
 }
 static int kimage_terminate(struct kimage *image)
 {
-	if (*image->entry != 0) {
+	if (*image->entry != 0)
 		image->entry++;
-	}
+
 	*image->entry = IND_DONE;
+
 	return 0;
 }
 
@@ -591,26 +607,24 @@ static void kimage_free(struct kimage *image)
 
 	if (!image)
 		return;
+
 	kimage_free_extra_pages(image);
 	for_each_kimage_entry(image, ptr, entry) {
 		if (entry & IND_INDIRECTION) {
 			/* Free the previous indirection page */
-			if (ind & IND_INDIRECTION) {
+			if (ind & IND_INDIRECTION)
 				kimage_free_entry(ind);
-			}
 			/* Save this indirection page until we are
 			 * done with it.
 			 */
 			ind = entry;
 		}
-		else if (entry & IND_SOURCE) {
+		else if (entry & IND_SOURCE)
 			kimage_free_entry(entry);
-		}
 	}
 	/* Free the final indirection page */
-	if (ind & IND_INDIRECTION) {
+	if (ind & IND_INDIRECTION)
 		kimage_free_entry(ind);
-	}
 
 	/* Handle any machine specific cleanup */
 	machine_kexec_cleanup(image);
@@ -620,26 +634,28 @@ static void kimage_free(struct kimage *image)
 	kfree(image);
 }
 
-static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+					unsigned long page)
 {
 	kimage_entry_t *ptr, entry;
 	unsigned long destination = 0;
 
 	for_each_kimage_entry(image, ptr, entry) {
-		if (entry & IND_DESTINATION) {
+		if (entry & IND_DESTINATION)
 			destination = entry & PAGE_MASK;
-		}
 		else if (entry & IND_SOURCE) {
-			if (page == destination) {
+			if (page == destination)
 				return ptr;
-			}
 			destination += PAGE_SIZE;
 		}
 	}
+
 	return 0;
 }
 
-static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
+static struct page *kimage_alloc_page(struct kimage *image,
+					unsigned int gfp_mask,
+					unsigned long destination)
 {
 	/*
 	 * Here we implement safeguards to ensure that a source page
@@ -679,11 +695,11 @@ static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mas
 
 		/* Allocate a page, if we run out of memory give up */
 		page = kimage_alloc_pages(gfp_mask, 0);
-		if (!page) {
+		if (!page)
 			return 0;
-		}
 		/* If the page cannot be used file it away */
-		if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+		if (page_to_pfn(page) >
+				(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 			list_add(&page->lru, &image->unuseable_pages);
 			continue;
 		}
@@ -694,7 +710,8 @@ static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mas
 			break;
 
 		/* If the page is not a destination page use it */
-		if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
+		if (!kimage_is_destination_range(image, addr,
+						  addr + PAGE_SIZE))
 			break;
 
 		/*
@@ -727,11 +744,12 @@ static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mas
 			list_add(&page->lru, &image->dest_pages);
 		}
 	}
+
 	return page;
 }
 
 static int kimage_load_normal_segment(struct kimage *image,
-	struct kexec_segment *segment)
+					 struct kexec_segment *segment)
 {
 	unsigned long maddr;
 	unsigned long ubytes, mbytes;
@@ -745,34 +763,36 @@ static int kimage_load_normal_segment(struct kimage *image,
 	maddr = segment->mem;
 
 	result = kimage_set_destination(image, maddr);
-	if (result < 0) {
+	if (result < 0)
 		goto out;
-	}
-	while(mbytes) {
+
+	while (mbytes) {
 		struct page *page;
 		char *ptr;
 		size_t uchunk, mchunk;
+
 		page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
 		if (page == 0) {
 			result  = -ENOMEM;
 			goto out;
 		}
-		result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
-		if (result < 0) {
+		result = kimage_add_page(image, page_to_pfn(page)
+								<< PAGE_SHIFT);
+		if (result < 0)
 			goto out;
-		}
+
 		ptr = kmap(page);
 		/* Start with a clear page */
 		memset(ptr, 0, PAGE_SIZE);
 		ptr += maddr & ~PAGE_MASK;
 		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
-		if (mchunk > mbytes) {
+		if (mchunk > mbytes)
 			mchunk = mbytes;
-		}
+
 		uchunk = mchunk;
-		if (uchunk > ubytes) {
+		if (uchunk > ubytes)
 			uchunk = ubytes;
-		}
+
 		result = copy_from_user(ptr, buf, uchunk);
 		kunmap(page);
 		if (result) {
@@ -784,12 +804,12 @@ static int kimage_load_normal_segment(struct kimage *image,
 		buf    += mchunk;
 		mbytes -= mchunk;
 	}
- out:
+out:
 	return result;
 }
 
 static int kimage_load_crash_segment(struct kimage *image,
-	struct kexec_segment *segment)
+					struct kexec_segment *segment)
 {
 	/* For crash dumps kernels we simply copy the data from
 	 * user space to it's destination.
@@ -805,10 +825,11 @@ static int kimage_load_crash_segment(struct kimage *image,
 	ubytes = segment->bufsz;
 	mbytes = segment->memsz;
 	maddr = segment->mem;
-	while(mbytes) {
+	while (mbytes) {
 		struct page *page;
 		char *ptr;
 		size_t uchunk, mchunk;
+
 		page = pfn_to_page(maddr >> PAGE_SHIFT);
 		if (page == 0) {
 			result  = -ENOMEM;
@@ -817,9 +838,9 @@ static int kimage_load_crash_segment(struct kimage *image,
 		ptr = kmap(page);
 		ptr += maddr & ~PAGE_MASK;
 		mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
-		if (mchunk > mbytes) {
+		if (mchunk > mbytes)
 			mchunk = mbytes;
-		}
+
 		uchunk = mchunk;
 		if (uchunk > ubytes) {
 			uchunk = ubytes;
@@ -837,15 +858,16 @@ static int kimage_load_crash_segment(struct kimage *image,
 		buf    += mchunk;
 		mbytes -= mchunk;
 	}
- out:
+out:
 	return result;
 }
 
 static int kimage_load_segment(struct kimage *image,
-	struct kexec_segment *segment)
+				struct kexec_segment *segment)
 {
 	int result = -ENOMEM;
-	switch(image->type) {
+
+	switch (image->type) {
 	case KEXEC_TYPE_DEFAULT:
 		result = kimage_load_normal_segment(image, segment);
 		break;
@@ -853,6 +875,7 @@ static int kimage_load_segment(struct kimage *image,
 		result = kimage_load_crash_segment(image, segment);
 		break;
 	}
+
 	return result;
 }
 
@@ -885,9 +908,9 @@ static struct kimage *kexec_crash_image = NULL;
  */
 static int kexec_lock = 0;
 
-asmlinkage long sys_kexec_load(unsigned long entry,
-	unsigned long nr_segments, struct kexec_segment __user *segments,
-	unsigned long flags)
+asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
+				struct kexec_segment __user *segments,
+				unsigned long flags)
 {
 	struct kimage **dest_image, *image;
 	int locked;
@@ -907,9 +930,7 @@ asmlinkage long sys_kexec_load(unsigned long entry,
 	/* Verify we are on the appropriate architecture */
 	if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
 		((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
-	{
 		return -EINVAL;
-	}
 
 	/* Put an artificial cap on the number
 	 * of segments passed to kexec_load.
@@ -929,58 +950,59 @@ asmlinkage long sys_kexec_load(unsigned long entry,
 	 * KISS: always take the mutex.
 	 */
 	locked = xchg(&kexec_lock, 1);
-	if (locked) {
+	if (locked)
 		return -EBUSY;
-	}
+
 	dest_image = &kexec_image;
-	if (flags & KEXEC_ON_CRASH) {
+	if (flags & KEXEC_ON_CRASH)
 		dest_image = &kexec_crash_image;
-	}
 	if (nr_segments > 0) {
 		unsigned long i;
+
 		/* Loading another kernel to reboot into */
-		if ((flags & KEXEC_ON_CRASH) == 0) {
-			result = kimage_normal_alloc(&image, entry, nr_segments, segments);
-		}
+		if ((flags & KEXEC_ON_CRASH) == 0)
+			result = kimage_normal_alloc(&image, entry,
+							nr_segments, segments);
 		/* Loading another kernel to switch to if this one crashes */
 		else if (flags & KEXEC_ON_CRASH) {
 			/* Free any current crash dump kernel before
 			 * we corrupt it.
 			 */
 			kimage_free(xchg(&kexec_crash_image, NULL));
-			result = kimage_crash_alloc(&image, entry, nr_segments, segments);
+			result = kimage_crash_alloc(&image, entry,
+						     nr_segments, segments);
 		}
-		if (result) {
+		if (result)
 			goto out;
-		}
+
 		result = machine_kexec_prepare(image);
-		if (result) {
+		if (result)
 			goto out;
-		}
-		for(i = 0; i < nr_segments; i++) {
+
+		for (i = 0; i < nr_segments; i++) {
 			result = kimage_load_segment(image, &image->segment[i]);
-			if (result) {
+			if (result)
 				goto out;
-			}
 		}
 		result = kimage_terminate(image);
-		if (result) {
+		if (result)
 			goto out;
-		}
 	}
 	/* Install the new kernel, and  Uninstall the old */
 	image = xchg(dest_image, image);
 
- out:
+out:
 	xchg(&kexec_lock, 0); /* Release the mutex */
 	kimage_free(image);
+
 	return result;
 }
 
 #ifdef CONFIG_COMPAT
 asmlinkage long compat_sys_kexec_load(unsigned long entry,
-	unsigned long nr_segments, struct compat_kexec_segment __user *segments,
-	unsigned long flags)
+				unsigned long nr_segments,
+				struct compat_kexec_segment __user *segments,
+				unsigned long flags)
 {
 	struct compat_kexec_segment in;
 	struct kexec_segment out, __user *ksegments;
@@ -989,20 +1011,17 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
 	/* Don't allow clients that don't understand the native
 	 * architecture to do anything.
 	 */
-	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) {
+	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
 		return -EINVAL;
-	}
 
-	if (nr_segments > KEXEC_SEGMENT_MAX) {
+	if (nr_segments > KEXEC_SEGMENT_MAX)
 		return -EINVAL;
-	}
 
 	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
 	for (i=0; i < nr_segments; i++) {
 		result = copy_from_user(&in, &segments[i], sizeof(in));
-		if (result) {
+		if (result)
 			return -EFAULT;
-		}
 
 		out.buf   = compat_ptr(in.buf);
 		out.bufsz = in.bufsz;
@@ -1010,9 +1029,8 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
 		out.memsz = in.memsz;
 
 		result = copy_to_user(&ksegments[i], &out, sizeof(out));
-		if (result) {
+		if (result)
 			return -EFAULT;
-		}
 	}
 
 	return sys_kexec_load(entry, nr_segments, ksegments, flags);
-- 
cgit v1.2.3-59-g8ed1b


From 8c0e33c133021ee241e9d51255b9fb18eb34ef0e Mon Sep 17 00:00:00 2001
From: Nick Wilson <njw@osdl.org>
Date: Sat, 25 Jun 2005 14:59:00 -0700
Subject: [PATCH] Use ALIGN to remove duplicate code

This patch makes use of ALIGN() to remove duplicate round-up code.

Signed-off-by: Nick Wilson <njw@osdl.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/a.out.h | 2 +-
 kernel/resource.c     | 2 +-
 lib/bitmap.c          | 3 +--
 mm/bootmem.c          | 6 +++---
 4 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/a.out.h b/include/linux/a.out.h
index af8a1dfa5c32..f913cc3e1b0d 100644
--- a/include/linux/a.out.h
+++ b/include/linux/a.out.h
@@ -138,7 +138,7 @@ enum machine_type {
 #endif
 #endif
 
-#define _N_SEGMENT_ROUND(x) (((x) + SEGMENT_SIZE - 1) & ~(SEGMENT_SIZE - 1))
+#define _N_SEGMENT_ROUND(x) ALIGN(x, SEGMENT_SIZE)
 
 #define _N_TXTENDADDR(x) (N_TXTADDR(x)+(x).a_text)
 
diff --git a/kernel/resource.c b/kernel/resource.c
index 52f696f11adf..26967e042201 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -263,7 +263,7 @@ static int find_resource(struct resource *root, struct resource *new,
 			new->start = min;
 		if (new->end > max)
 			new->end = max;
-		new->start = (new->start + align - 1) & ~(align - 1);
+		new->start = ALIGN(new->start, align);
 		if (alignf)
 			alignf(alignf_data, new, size, align);
 		if (new->start < new->end && new->end - new->start >= size - 1) {
diff --git a/lib/bitmap.c b/lib/bitmap.c
index d1388a5ce89c..fb9371fdd44a 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -289,7 +289,6 @@ EXPORT_SYMBOL(__bitmap_weight);
 
 #define CHUNKSZ				32
 #define nbits_to_hold_value(val)	fls(val)
-#define roundup_power2(val,modulus)	(((val) + (modulus) - 1) & ~((modulus) - 1))
 #define unhex(c)			(isdigit(c) ? (c - '0') : (toupper(c) - 'A' + 10))
 #define BASEDEC 10		/* fancier cpuset lists input in decimal */
 
@@ -316,7 +315,7 @@ int bitmap_scnprintf(char *buf, unsigned int buflen,
 	if (chunksz == 0)
 		chunksz = CHUNKSZ;
 
-	i = roundup_power2(nmaskbits, CHUNKSZ) - CHUNKSZ;
+	i = ALIGN(nmaskbits, CHUNKSZ) - CHUNKSZ;
 	for (; i >= 0; i -= CHUNKSZ) {
 		chunkmask = ((1ULL << chunksz) - 1);
 		word = i / BITS_PER_LONG;
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 45275f1f8947..c1330cc19783 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -65,7 +65,7 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
 	pgdat->pgdat_next = pgdat_list;
 	pgdat_list = pgdat;
 
-	mapsize = (mapsize + (sizeof(long) - 1UL)) & ~(sizeof(long) - 1UL);
+	mapsize = ALIGN(mapsize, sizeof(long));
 	bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
 	bdata->node_boot_start = (start << PAGE_SHIFT);
 	bdata->node_low_pfn = end;
@@ -186,7 +186,7 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	} else
 		preferred = 0;
 
-	preferred = ((preferred + align - 1) & ~(align - 1)) >> PAGE_SHIFT;
+	preferred = ALIGN(preferred, align) >> PAGE_SHIFT;
 	preferred += offset;
 	areasize = (size+PAGE_SIZE-1)/PAGE_SIZE;
 	incr = align >> PAGE_SHIFT ? : 1;
@@ -227,7 +227,7 @@ found:
 	 */
 	if (align < PAGE_SIZE &&
 	    bdata->last_offset && bdata->last_pos+1 == start) {
-		offset = (bdata->last_offset+align-1) & ~(align-1);
+		offset = ALIGN(bdata->last_offset, align);
 		BUG_ON(offset > PAGE_SIZE);
 		remaining_size = PAGE_SIZE-offset;
 		if (size < remaining_size) {
-- 
cgit v1.2.3-59-g8ed1b


From 681ea4b930768444e9d88651c1362b0bf6d2a42b Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sat, 25 Jun 2005 14:59:04 -0700
Subject: [PATCH] drivers/char/nvram.c: possible cleanups

This patch contains the following possible cleanups:
- make the needlessly global function __nvram_set_checksum static
- #if 0 the unused global function nvram_set_checksum
- remove the EXPORT_SYMBOL's for both functions

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/nvram.c  | 6 +++---
 include/linux/nvram.h | 2 --
 2 files changed, 3 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/nvram.c b/drivers/char/nvram.c
index f63a3fd7ca6f..1af733d07321 100644
--- a/drivers/char/nvram.c
+++ b/drivers/char/nvram.c
@@ -211,12 +211,13 @@ nvram_check_checksum(void)
 	return rv;
 }
 
-void
+static void
 __nvram_set_checksum(void)
 {
 	mach_set_checksum();
 }
 
+#if 0
 void
 nvram_set_checksum(void)
 {
@@ -226,6 +227,7 @@ nvram_set_checksum(void)
 	__nvram_set_checksum();
 	spin_unlock_irqrestore(&rtc_lock, flags);
 }
+#endif  /*  0  */
 
 /*
  * The are the file operation function for user access to /dev/nvram
@@ -921,6 +923,4 @@ EXPORT_SYMBOL(__nvram_write_byte);
 EXPORT_SYMBOL(nvram_write_byte);
 EXPORT_SYMBOL(__nvram_check_checksum);
 EXPORT_SYMBOL(nvram_check_checksum);
-EXPORT_SYMBOL(__nvram_set_checksum);
-EXPORT_SYMBOL(nvram_set_checksum);
 MODULE_ALIAS_MISCDEV(NVRAM_MINOR);
diff --git a/include/linux/nvram.h b/include/linux/nvram.h
index b031e41b5e0d..9189829c131c 100644
--- a/include/linux/nvram.h
+++ b/include/linux/nvram.h
@@ -20,8 +20,6 @@ extern void __nvram_write_byte(unsigned char c, int i);
 extern void nvram_write_byte(unsigned char c, int i);
 extern int __nvram_check_checksum(void);
 extern int nvram_check_checksum(void);
-extern void __nvram_set_checksum(void);
-extern void nvram_set_checksum(void);
 #endif
 
 #endif  /* _LINUX_NVRAM_H */
-- 
cgit v1.2.3-59-g8ed1b


From 93d17d3d84b7147e8f07aeeb15ec01aa92c6b564 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Sat, 25 Jun 2005 14:59:10 -0700
Subject: [PATCH] drivers/block/ll_rw_blk.c: cleanups

This patch contains the following cleanups:
- make needlessly global code static
- remove the following unused global functions:
  - blkdev_scsi_issue_flush_fn
  - __blk_attempt_remerge
- remove the following unused EXPORT_SYMBOL's:
  - blk_phys_contig_segment
  - blk_hw_contig_segment
  - blkdev_scsi_issue_flush_fn
  - __blk_attempt_remerge

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/ll_rw_blk.c | 67 ++++++-----------------------------------------
 include/linux/blkdev.h    |  6 -----
 2 files changed, 8 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index fd94ea27d594..fc86d53fe783 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -37,6 +37,7 @@
 
 static void blk_unplug_work(void *data);
 static void blk_unplug_timeout(unsigned long data);
+static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
 
 /*
  * For the allocated request tables
@@ -1137,7 +1138,7 @@ new_hw_segment:
 }
 
 
-int blk_phys_contig_segment(request_queue_t *q, struct bio *bio,
+static int blk_phys_contig_segment(request_queue_t *q, struct bio *bio,
 				   struct bio *nxt)
 {
 	if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
@@ -1158,9 +1159,7 @@ int blk_phys_contig_segment(request_queue_t *q, struct bio *bio,
 	return 0;
 }
 
-EXPORT_SYMBOL(blk_phys_contig_segment);
-
-int blk_hw_contig_segment(request_queue_t *q, struct bio *bio,
+static int blk_hw_contig_segment(request_queue_t *q, struct bio *bio,
 				 struct bio *nxt)
 {
 	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
@@ -1176,8 +1175,6 @@ int blk_hw_contig_segment(request_queue_t *q, struct bio *bio,
 	return 1;
 }
 
-EXPORT_SYMBOL(blk_hw_contig_segment);
-
 /*
  * map a request to scatterlist, return number of sg entries setup. Caller
  * must make sure sg can hold rq->nr_phys_segments entries
@@ -1825,7 +1822,7 @@ static inline int ioc_batching(request_queue_t *q, struct io_context *ioc)
  * is the behaviour we want though - once it gets a wakeup it should be given
  * a nice run.
  */
-void ioc_set_batching(request_queue_t *q, struct io_context *ioc)
+static void ioc_set_batching(request_queue_t *q, struct io_context *ioc)
 {
 	if (!ioc || ioc_batching(q, ioc))
 		return;
@@ -2254,45 +2251,7 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 
 EXPORT_SYMBOL(blkdev_issue_flush);
 
-/**
- * blkdev_scsi_issue_flush_fn - issue flush for SCSI devices
- * @q:		device queue
- * @disk:	gendisk
- * @error_sector:	error offset
- *
- * Description:
- *    Devices understanding the SCSI command set, can use this function as
- *    a helper for issuing a cache flush. Note: driver is required to store
- *    the error offset (in case of error flushing) in ->sector of struct
- *    request.
- */
-int blkdev_scsi_issue_flush_fn(request_queue_t *q, struct gendisk *disk,
-			       sector_t *error_sector)
-{
-	struct request *rq = blk_get_request(q, WRITE, __GFP_WAIT);
-	int ret;
-
-	rq->flags |= REQ_BLOCK_PC | REQ_SOFTBARRIER;
-	rq->sector = 0;
-	memset(rq->cmd, 0, sizeof(rq->cmd));
-	rq->cmd[0] = 0x35;
-	rq->cmd_len = 12;
-	rq->data = NULL;
-	rq->data_len = 0;
-	rq->timeout = 60 * HZ;
-
-	ret = blk_execute_rq(q, disk, rq);
-
-	if (ret && error_sector)
-		*error_sector = rq->sector;
-
-	blk_put_request(rq);
-	return ret;
-}
-
-EXPORT_SYMBOL(blkdev_scsi_issue_flush_fn);
-
-void drive_stat_acct(struct request *rq, int nr_sectors, int new_io)
+static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io)
 {
 	int rw = rq_data_dir(rq);
 
@@ -2551,16 +2510,6 @@ void blk_attempt_remerge(request_queue_t *q, struct request *rq)
 
 EXPORT_SYMBOL(blk_attempt_remerge);
 
-/*
- * Non-locking blk_attempt_remerge variant.
- */
-void __blk_attempt_remerge(request_queue_t *q, struct request *rq)
-{
-	attempt_back_merge(q, rq);
-}
-
-EXPORT_SYMBOL(__blk_attempt_remerge);
-
 static int __make_request(request_queue_t *q, struct bio *bio)
 {
 	struct request *req, *freereq = NULL;
@@ -2971,7 +2920,7 @@ void submit_bio(int rw, struct bio *bio)
 
 EXPORT_SYMBOL(submit_bio);
 
-void blk_recalc_rq_segments(struct request *rq)
+static void blk_recalc_rq_segments(struct request *rq)
 {
 	struct bio *bio, *prevbio = NULL;
 	int nr_phys_segs, nr_hw_segs;
@@ -3013,7 +2962,7 @@ void blk_recalc_rq_segments(struct request *rq)
 	rq->nr_hw_segments = nr_hw_segs;
 }
 
-void blk_recalc_rq_sectors(struct request *rq, int nsect)
+static void blk_recalc_rq_sectors(struct request *rq, int nsect)
 {
 	if (blk_fs_request(rq)) {
 		rq->hard_sector += nsect;
@@ -3601,7 +3550,7 @@ static struct sysfs_ops queue_sysfs_ops = {
 	.store	= queue_attr_store,
 };
 
-struct kobj_type queue_ktype = {
+static struct kobj_type queue_ktype = {
 	.sysfs_ops	= &queue_sysfs_ops,
 	.default_attrs	= default_attrs,
 };
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 60272141ff19..b54a0348a890 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -539,15 +539,12 @@ extern void generic_make_request(struct bio *bio);
 extern void blk_put_request(struct request *);
 extern void blk_end_sync_rq(struct request *rq);
 extern void blk_attempt_remerge(request_queue_t *, struct request *);
-extern void __blk_attempt_remerge(request_queue_t *, struct request *);
 extern struct request *blk_get_request(request_queue_t *, int, int);
 extern void blk_insert_request(request_queue_t *, struct request *, int, void *);
 extern void blk_requeue_request(request_queue_t *, struct request *);
 extern void blk_plug_device(request_queue_t *);
 extern int blk_remove_plug(request_queue_t *);
 extern void blk_recount_segments(request_queue_t *, struct bio *);
-extern int blk_phys_contig_segment(request_queue_t *q, struct bio *, struct bio *);
-extern int blk_hw_contig_segment(request_queue_t *q, struct bio *, struct bio *);
 extern int scsi_cmd_ioctl(struct file *, struct gendisk *, unsigned int, void __user *);
 extern void blk_start_queue(request_queue_t *q);
 extern void blk_stop_queue(request_queue_t *q);
@@ -631,7 +628,6 @@ extern void blk_queue_dma_alignment(request_queue_t *, int);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 extern void blk_queue_ordered(request_queue_t *, int);
 extern void blk_queue_issue_flush_fn(request_queue_t *, issue_flush_fn *);
-extern int blkdev_scsi_issue_flush_fn(request_queue_t *, struct gendisk *, sector_t *);
 extern struct request *blk_start_pre_flush(request_queue_t *,struct request *);
 extern int blk_complete_barrier_rq(request_queue_t *, struct request *, int);
 extern int blk_complete_barrier_rq_locked(request_queue_t *, struct request *, int);
@@ -675,8 +671,6 @@ extern int blkdev_issue_flush(struct block_device *, sector_t *);
 
 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
 
-extern void drive_stat_acct(struct request *, int, int);
-
 static inline int queue_hardsect_size(request_queue_t *q)
 {
 	int retval = 512;
-- 
cgit v1.2.3-59-g8ed1b


From 3e1d1d28d99dabe63c64f7f40f1ca1d646de1f73 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <christoph@lameter.com>
Date: Fri, 24 Jun 2005 23:13:50 -0700
Subject: [PATCH] Cleanup patch for process freezing

1. Establish a simple API for process freezing, defined in include/linux/sched.h:

   frozen(process)		Check for frozen process
   freezing(process)		Check if a process is being frozen
   freeze(process)		Tell a process to freeze (go to refrigerator)
   thaw_process(process)	Restart process
   frozen_process(process)	Process is frozen now

2. Remove all references to PF_FREEZE and PF_FROZEN from all
   kernel sources except sched.h

3. Fix numerous locations where try_to_freeze is manually done by a driver

4. Remove the argument that is no longer necessary from two function calls.

5. Some whitespace cleanup

6. Close a potential race in refrigerator(): there was an open window in which
   PF_FREEZE had already been cleared but PF_FROZEN was not yet set, and
   recalc_sigpending() does not check PF_FROZEN.

This patch does not address the problem that freeze_processes() violates the rule
that a task may only modify its own flags: it sets PF_FREEZE on other tasks. This is
not clean in an SMP environment. freeze(process) is therefore not SMP safe!

Signed-off-by: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/power/kernel_threads.txt    |  3 +-
 Documentation/power/swsusp.txt            |  3 +-
 arch/frv/kernel/signal.c                  |  4 +-
 arch/h8300/kernel/signal.c                |  4 +-
 arch/i386/kernel/io_apic.c                |  2 +-
 arch/i386/kernel/signal.c                 |  4 +-
 arch/m32r/kernel/signal.c                 |  4 +-
 arch/ppc/kernel/signal.c                  |  3 +-
 arch/x86_64/kernel/signal.c               |  2 +-
 drivers/block/pktcdvd.c                   |  3 +-
 drivers/ieee1394/ieee1394_core.c          |  4 +-
 drivers/ieee1394/nodemgr.c                |  2 +-
 drivers/input/gameport/gameport.c         |  2 +-
 drivers/input/serio/serio.c               |  2 +-
 drivers/macintosh/therm_adt746x.c         |  4 +-
 drivers/md/md.c                           |  3 +-
 drivers/media/dvb/dvb-core/dvb_frontend.c |  3 +-
 drivers/media/video/msp3400.c             |  3 +-
 drivers/media/video/video-buf-dvb.c       |  3 +-
 drivers/net/8139too.c                     |  2 +-
 drivers/net/irda/sir_kthread.c            |  3 +-
 drivers/net/irda/stir4200.c               |  4 +-
 drivers/net/wireless/airo.c               |  2 +-
 drivers/pcmcia/cs.c                       |  2 +-
 drivers/pnp/pnpbios/core.c                |  2 +-
 drivers/usb/core/hub.c                    |  2 +-
 drivers/usb/gadget/file_storage.c         |  3 +-
 drivers/usb/storage/usb.c                 |  4 +-
 drivers/w1/w1.c                           |  4 +-
 fs/afs/kafsasyncd.c                       |  2 +-
 fs/afs/kafstimod.c                        |  2 +-
 fs/jbd/journal.c                          |  4 +-
 fs/jfs/jfs_logmgr.c                       |  4 +-
 fs/jfs/jfs_txnmgr.c                       |  8 ++--
 fs/lockd/clntproc.c                       |  2 +-
 fs/xfs/linux-2.6/xfs_buf.c                |  4 +-
 fs/xfs/linux-2.6/xfs_super.c              |  2 +-
 include/linux/sched.h                     | 73 +++++++++++++++++++++++++------
 kernel/power/process.c                    | 26 +++++------
 kernel/sched.c                            |  3 +-
 kernel/signal.c                           |  5 +--
 mm/pdflush.c                              |  2 +-
 mm/vmscan.c                               |  4 +-
 net/rxrpc/krxiod.c                        |  2 +-
 net/rxrpc/krxsecd.c                       |  2 +-
 net/rxrpc/krxtimod.c                      |  2 +-
 net/sunrpc/svcsock.c                      |  6 +--
 47 files changed, 126 insertions(+), 113 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/power/kernel_threads.txt b/Documentation/power/kernel_threads.txt
index 60b548105edf..fb57784986b1 100644
--- a/Documentation/power/kernel_threads.txt
+++ b/Documentation/power/kernel_threads.txt
@@ -12,8 +12,7 @@ refrigerator. Code to do this looks like this:
 	do {
 		hub_events();
 		wait_event_interruptible(khubd_wait, !list_empty(&hub_event_list));
-		if (current->flags & PF_FREEZE)
-			refrigerator(PF_FREEZE);
+		try_to_freeze();
 	} while (!signal_pending(current));
 
 from drivers/usb/core/hub.c::hub_thread()
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
index c7c3459fde43..4e1627cc5b51 100644
--- a/Documentation/power/swsusp.txt
+++ b/Documentation/power/swsusp.txt
@@ -164,8 +164,7 @@ place where the thread is safe to be frozen (no kernel semaphores
 should be held at that point and it must be safe to sleep there), and
 add:
 
-            if (current->flags & PF_FREEZE)
-                    refrigerator(PF_FREEZE);
+            try_to_freeze();
 
 If the thread is needed for writing the image to storage, you should
 instead set the PF_NOFREEZE process flag when creating the thread.
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index d8d8f3d4304d..36a2dffc8ebd 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -536,10 +536,8 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
 	if (!user_mode(regs))
 		return 1;
 
-	if (current->flags & PF_FREEZE) {
-		refrigerator(0);
+	if (try_to_freeze())
 		goto no_signal;
-	}
 
 	if (!oldset)
 		oldset = &current->blocked;
diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c
index a4799d633ef4..5aab87eae1f9 100644
--- a/arch/h8300/kernel/signal.c
+++ b/arch/h8300/kernel/signal.c
@@ -517,10 +517,8 @@ asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset)
 	if ((regs->ccr & 0x10))
 		return 1;
 
-	if (current->flags & PF_FREEZE) {
-		refrigerator(0);
+	if (try_to_freeze())
 		goto no_signal;
-	}
 
 	current->thread.esp0 = (unsigned long) regs;
 
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 08540bc4ba3e..2451a3a99440 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -573,7 +573,7 @@ static int balanced_irq(void *unused)
 	for ( ; ; ) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		time_remaining = schedule_timeout(time_remaining);
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 		if (time_after(jiffies,
 				prev_balance_time+balanced_irq_interval)) {
 			do_irq_balance();
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index b9b8f4e20fad..ac5b1e975c5c 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -608,10 +608,8 @@ int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset)
 	if (!user_mode(regs))
 		return 1;
 
-	if (current->flags & PF_FREEZE) {
-		refrigerator(0);
+	if (try_to_freeze())
 		goto no_signal;
-	}
 
 	if (!oldset)
 		oldset = &current->blocked;
diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c
index 50311eb07a24..5aef7e406ef5 100644
--- a/arch/m32r/kernel/signal.c
+++ b/arch/m32r/kernel/signal.c
@@ -371,10 +371,8 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
 	if (!user_mode(regs))
 		return 1;
 
-	if (current->flags & PF_FREEZE) {
-		refrigerator(0);
+	if (try_to_freeze()) 
 		goto no_signal;
-	}
 
 	if (!oldset)
 		oldset = &current->blocked;
diff --git a/arch/ppc/kernel/signal.c b/arch/ppc/kernel/signal.c
index 7c8437da09d5..8aaeb6f4e750 100644
--- a/arch/ppc/kernel/signal.c
+++ b/arch/ppc/kernel/signal.c
@@ -705,8 +705,7 @@ int do_signal(sigset_t *oldset, struct pt_regs *regs)
 	unsigned long frame, newsp;
 	int signr, ret;
 
-	if (current->flags & PF_FREEZE) {
-		refrigerator(PF_FREEZE);
+	if (try_to_freeze()) {
 		signr = 0;
 		if (!signal_pending(current))
 			goto no_signal;
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index 98b7ba95d581..98590a989f3d 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -425,7 +425,7 @@ int do_signal(struct pt_regs *regs, sigset_t *oldset)
 	if (!user_mode(regs))
 		return 1;
 
-	if (try_to_freeze(0))
+	if (try_to_freeze())
 		goto no_signal;
 
 	if (!oldset)
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 7f3d78de265c..7b838342f0a3 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -1251,8 +1251,7 @@ static int kcdrwd(void *foobar)
 			VPRINTK("kcdrwd: wake up\n");
 
 			/* make swsusp happy with our thread */
-			if (current->flags & PF_FREEZE)
-				refrigerator(PF_FREEZE);
+			try_to_freeze();
 
 			list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
 				if (!pkt->sleep_time)
diff --git a/drivers/ieee1394/ieee1394_core.c b/drivers/ieee1394/ieee1394_core.c
index 2d9a9b74e687..629070b83a33 100644
--- a/drivers/ieee1394/ieee1394_core.c
+++ b/drivers/ieee1394/ieee1394_core.c
@@ -1041,10 +1041,8 @@ static int hpsbpkt_thread(void *__hi)
 
 	while (1) {
 		if (down_interruptible(&khpsbpkt_sig)) {
-			if (current->flags & PF_FREEZE) {
-				refrigerator(0);
+			if (try_to_freeze())
 				continue;
-			}
 			printk("khpsbpkt: received unexpected signal?!\n" );
 			break;
 		}
diff --git a/drivers/ieee1394/nodemgr.c b/drivers/ieee1394/nodemgr.c
index 32abb6dda888..9a46c3b44bf8 100644
--- a/drivers/ieee1394/nodemgr.c
+++ b/drivers/ieee1394/nodemgr.c
@@ -1510,7 +1510,7 @@ static int nodemgr_host_thread(void *__hi)
 
 		if (down_interruptible(&hi->reset_sem) ||
 		    down_interruptible(&nodemgr_serialize)) {
-			if (try_to_freeze(PF_FREEZE))
+			if (try_to_freeze())
 				continue;
 			printk("NodeMgr: received unexpected signal?!\n" );
 			break;
diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index e152d0fa0cdd..c77a82e46055 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -439,7 +439,7 @@ static int gameport_thread(void *nothing)
 	do {
 		gameport_handle_events();
 		wait_event_interruptible(gameport_wait, !list_empty(&gameport_event_list));
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 	} while (!signal_pending(current));
 
 	printk(KERN_DEBUG "gameport: kgameportd exiting\n");
diff --git a/drivers/input/serio/serio.c b/drivers/input/serio/serio.c
index feab4970406e..341824c48529 100644
--- a/drivers/input/serio/serio.c
+++ b/drivers/input/serio/serio.c
@@ -344,7 +344,7 @@ static int serio_thread(void *nothing)
 	do {
 		serio_handle_events();
 		wait_event_interruptible(serio_wait, !list_empty(&serio_event_list));
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 	} while (!signal_pending(current));
 
 	printk(KERN_DEBUG "serio: kseriod exiting\n");
diff --git a/drivers/macintosh/therm_adt746x.c b/drivers/macintosh/therm_adt746x.c
index 5ba190ce14a0..c9ca1118e449 100644
--- a/drivers/macintosh/therm_adt746x.c
+++ b/drivers/macintosh/therm_adt746x.c
@@ -328,9 +328,7 @@ static int monitor_task(void *arg)
 	struct thermostat* th = arg;
 
 	while(!kthread_should_stop()) {
-		if (current->flags & PF_FREEZE)
-			refrigerator(PF_FREEZE);
-
+		try_to_freeze();
 		msleep_interruptible(2000);
 
 #ifndef DEBUG
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0c6b5b6baff6..3802f7a17f16 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2976,8 +2976,7 @@ static int md_thread(void * arg)
 		wait_event_interruptible_timeout(thread->wqueue,
 						 test_bit(THREAD_WAKEUP, &thread->flags),
 						 thread->timeout);
-		if (current->flags & PF_FREEZE)
-			refrigerator(PF_FREEZE);
+		try_to_freeze();
 
 		clear_bit(THREAD_WAKEUP, &thread->flags);
 
diff --git a/drivers/media/dvb/dvb-core/dvb_frontend.c b/drivers/media/dvb/dvb-core/dvb_frontend.c
index d6b7a9de471e..f11daae91cd4 100644
--- a/drivers/media/dvb/dvb-core/dvb_frontend.c
+++ b/drivers/media/dvb/dvb-core/dvb_frontend.c
@@ -391,8 +391,7 @@ static int dvb_frontend_thread(void *data)
 			break;
 		}
 
-		if (current->flags & PF_FREEZE)
-			refrigerator(PF_FREEZE);
+		try_to_freeze();
 
 		if (down_interruptible(&fepriv->sem))
 			break;
diff --git a/drivers/media/video/msp3400.c b/drivers/media/video/msp3400.c
index 1b7d38e96f14..b4ee9dfe6d42 100644
--- a/drivers/media/video/msp3400.c
+++ b/drivers/media/video/msp3400.c
@@ -750,8 +750,7 @@ static int msp34xx_sleep(struct msp3400c *msp, int timeout)
 #endif
 		}
 	}
-	if (current->flags & PF_FREEZE)
-		refrigerator(PF_FREEZE);
+	try_to_freeze();
 	remove_wait_queue(&msp->wq, &wait);
 	return msp->restart;
 }
diff --git a/drivers/media/video/video-buf-dvb.c b/drivers/media/video/video-buf-dvb.c
index 5f870075b55e..15f5bb486963 100644
--- a/drivers/media/video/video-buf-dvb.c
+++ b/drivers/media/video/video-buf-dvb.c
@@ -62,8 +62,7 @@ static int videobuf_dvb_thread(void *data)
 			break;
 		if (kthread_should_stop())
 			break;
-		if (current->flags & PF_FREEZE)
-			refrigerator(PF_FREEZE);
+		try_to_freeze();
 
 		/* feed buffer data to demux */
 		if (buf->state == STATE_DONE)
diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c
index 047202c4d9a8..5a4a08a7c951 100644
--- a/drivers/net/8139too.c
+++ b/drivers/net/8139too.c
@@ -1606,7 +1606,7 @@ static int rtl8139_thread (void *data)
 		do {
 			timeout = interruptible_sleep_on_timeout (&tp->thr_wait, timeout);
 			/* make swsusp happy with our thread */
-			try_to_freeze(PF_FREEZE);
+			try_to_freeze();
 		} while (!signal_pending (current) && (timeout > 0));
 
 		if (signal_pending (current)) {
diff --git a/drivers/net/irda/sir_kthread.c b/drivers/net/irda/sir_kthread.c
index 18cea1099530..c65054364bca 100644
--- a/drivers/net/irda/sir_kthread.c
+++ b/drivers/net/irda/sir_kthread.c
@@ -135,8 +135,7 @@ static int irda_thread(void *startup)
 		remove_wait_queue(&irda_rq_queue.kick, &wait);
 
 		/* make swsusp happy with our thread */
-		if (current->flags & PF_FREEZE)
-			refrigerator(PF_FREEZE);
+		try_to_freeze();
 
 		run_irda_queue();
 	}
diff --git a/drivers/net/irda/stir4200.c b/drivers/net/irda/stir4200.c
index 66f488c13717..15f207323d97 100644
--- a/drivers/net/irda/stir4200.c
+++ b/drivers/net/irda/stir4200.c
@@ -763,7 +763,7 @@ static int stir_transmit_thread(void *arg)
 	{
 #ifdef CONFIG_PM
 		/* if suspending, then power off and wait */
-		if (unlikely(current->flags & PF_FREEZE)) {
+		if (unlikely(freezing(current))) {
 			if (stir->receiving)
 				receive_stop(stir);
 			else
@@ -771,7 +771,7 @@ static int stir_transmit_thread(void *arg)
 
 			write_reg(stir, REG_CTRL1, CTRL1_TXPWD|CTRL1_RXPWD);
 
-			refrigerator(PF_FREEZE);
+			refrigerator();
 
 			if (change_speed(stir, stir->speed))
 				break;
diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
index fb10a2db63ad..d72e0385e4f2 100644
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -2918,7 +2918,7 @@ static int airo_thread(void *data) {
 			flush_signals(current);
 
 		/* make swsusp happy with our thread */
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
 		if (test_bit(JOB_DIE, &ai->flags))
 			break;
diff --git a/drivers/pcmcia/cs.c b/drivers/pcmcia/cs.c
index d136b3c8fac9..48e4f04530d8 100644
--- a/drivers/pcmcia/cs.c
+++ b/drivers/pcmcia/cs.c
@@ -718,7 +718,7 @@ static int pccardd(void *__skt)
 		}
 
 		schedule();
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
 		if (!skt->thread)
 			break;
diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c
index e939c93a931c..778a324028f4 100644
--- a/drivers/pnp/pnpbios/core.c
+++ b/drivers/pnp/pnpbios/core.c
@@ -182,7 +182,7 @@ static int pnp_dock_thread(void * unused)
 		msleep_interruptible(2000);
 
 		if(signal_pending(current)) {
-			if (try_to_freeze(PF_FREEZE))
+			if (try_to_freeze())
 				continue;
 			break;
 		}
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index d2d648ee8640..a8d879a85d04 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -2808,7 +2808,7 @@ static int hub_thread(void *__unused)
 	do {
 		hub_events();
 		wait_event_interruptible(khubd_wait, !list_empty(&hub_event_list)); 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 	} while (!signal_pending(current));
 
 	pr_debug ("%s: khubd exiting\n", usbcore_name);
diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c
index 037a7f163822..a9be85103d23 100644
--- a/drivers/usb/gadget/file_storage.c
+++ b/drivers/usb/gadget/file_storage.c
@@ -1554,8 +1554,7 @@ static int sleep_thread(struct fsg_dev *fsg)
 	rc = wait_event_interruptible(fsg->thread_wqh,
 			fsg->thread_wakeup_needed);
 	fsg->thread_wakeup_needed = 0;
-	if (current->flags & PF_FREEZE)
-		refrigerator(PF_FREEZE);
+	try_to_freeze();
 	return (rc ? -EINTR : 0);
 }
 
diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c
index 35c1ca6b5a8e..77e7fc258aa2 100644
--- a/drivers/usb/storage/usb.c
+++ b/drivers/usb/storage/usb.c
@@ -847,10 +847,8 @@ retry:
 		wait_event_interruptible_timeout(us->delay_wait,
 				test_bit(US_FLIDX_DISCONNECTING, &us->flags),
 				delay_use * HZ);
-		if (current->flags & PF_FREEZE) {
-			refrigerator(PF_FREEZE);
+		if (try_to_freeze())
 			goto retry;
-		}
 	}
 
 	/* If the device is still connected, perform the scanning */
diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c
index b460927ec32a..312cf3220f12 100644
--- a/drivers/w1/w1.c
+++ b/drivers/w1/w1.c
@@ -646,7 +646,7 @@ static int w1_control(void *data)
 	while (!control_needs_exit || have_to_wait) {
 		have_to_wait = 0;
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 		msleep_interruptible(w1_timeout * 1000);
 
 		if (signal_pending(current))
@@ -725,7 +725,7 @@ int w1_process(void *data)
 	allow_signal(SIGTERM);
 
 	while (!test_bit(W1_MASTER_NEED_EXIT, &dev->flags)) {
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 		msleep_interruptible(w1_timeout * 1000);
 
 		if (signal_pending(current))
diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c
index 6fc88ae8ad94..7ac07d0d47b9 100644
--- a/fs/afs/kafsasyncd.c
+++ b/fs/afs/kafsasyncd.c
@@ -116,7 +116,7 @@ static int kafsasyncd(void *arg)
 		remove_wait_queue(&kafsasyncd_sleepq, &myself);
 		set_current_state(TASK_RUNNING);
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
 		/* discard pending signals */
 		afs_discard_my_signals();
diff --git a/fs/afs/kafstimod.c b/fs/afs/kafstimod.c
index 86e710dd057e..65bc05ab8182 100644
--- a/fs/afs/kafstimod.c
+++ b/fs/afs/kafstimod.c
@@ -91,7 +91,7 @@ static int kafstimod(void *arg)
 			complete_and_exit(&kafstimod_dead, 0);
 		}
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
 		/* discard pending signals */
 		afs_discard_my_signals();
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 1e6f2e2ad4a3..5e7b43949517 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -167,7 +167,7 @@ loop:
 	}
 
 	wake_up(&journal->j_wait_done_commit);
-	if (current->flags & PF_FREEZE) {
+	if (freezing(current)) {
 		/*
 		 * The simpler the better. Flushing journal isn't a
 		 * good idea, because that depends on threads that may
@@ -175,7 +175,7 @@ loop:
 		 */
 		jbd_debug(1, "Now suspending kjournald\n");
 		spin_unlock(&journal->j_state_lock);
-		refrigerator(PF_FREEZE);
+		refrigerator();
 		spin_lock(&journal->j_state_lock);
 	} else {
 		/*
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 7c8387ed4192..79d07624bfe1 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2359,9 +2359,9 @@ int jfsIOWait(void *arg)
 			lbmStartIO(bp);
 			spin_lock_irq(&log_redrive_lock);
 		}
-		if (current->flags & PF_FREEZE) {
+		if (freezing(current)) {
 			spin_unlock_irq(&log_redrive_lock);
-			refrigerator(PF_FREEZE);
+			refrigerator();
 		} else {
 			add_wait_queue(&jfs_IO_thread_wait, &wq);
 			set_current_state(TASK_INTERRUPTIBLE);
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index 8cbaaff1d5fa..121c981ff453 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2788,9 +2788,9 @@ int jfs_lazycommit(void *arg)
 		/* In case a wakeup came while all threads were active */
 		jfs_commit_thread_waking = 0;
 
-		if (current->flags & PF_FREEZE) {
+		if (freezing(current)) {
 			LAZY_UNLOCK(flags);
-			refrigerator(PF_FREEZE);
+			refrigerator();
 		} else {
 			DECLARE_WAITQUEUE(wq, current);
 
@@ -2987,9 +2987,9 @@ int jfs_sync(void *arg)
 		/* Add anon_list2 back to anon_list */
 		list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list);
 
-		if (current->flags & PF_FREEZE) {
+		if (freezing(current)) {
 			TXN_UNLOCK();
-			refrigerator(PF_FREEZE);
+			refrigerator();
 		} else {
 			DECLARE_WAITQUEUE(wq, current);
 
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index fd77ed1d710d..14b3ce87fa29 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -313,7 +313,7 @@ static int nlm_wait_on_grace(wait_queue_head_t *queue)
 	prepare_to_wait(queue, &wait, TASK_INTERRUPTIBLE);
 	if (!signalled ()) {
 		schedule_timeout(NLMCLNT_GRACE_WAIT);
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 		if (!signalled ())
 			status = 0;
 	}
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index c60e69431e11..df0cba239dd5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1771,9 +1771,9 @@ xfsbufd(
 
 	INIT_LIST_HEAD(&tmp);
 	do {
-		if (unlikely(current->flags & PF_FREEZE)) {
+		if (unlikely(freezing(current))) {
 			xfsbufd_force_sleep = 1;
-			refrigerator(PF_FREEZE);
+			refrigerator();
 		} else {
 			xfsbufd_force_sleep = 0;
 		}
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 5fe9af38aa20..f6dd7de25927 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -483,7 +483,7 @@ xfssyncd(
 		set_current_state(TASK_INTERRUPTIBLE);
 		timeleft = schedule_timeout(timeleft);
 		/* swsusp */
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 		if (vfsp->vfs_flag & VFS_UMOUNT)
 			break;
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c69682b0444..e7fd09b0557f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1245,33 +1245,78 @@ extern void normalize_rt_tasks(void);
 
 #endif
 
-/* try_to_freeze
- *
- * Checks whether we need to enter the refrigerator
- * and returns 1 if we did so.
- */
 #ifdef CONFIG_PM
-extern void refrigerator(unsigned long);
+/*
+ * Check if a process has been frozen
+ */
+static inline int frozen(struct task_struct *p)
+{
+	return p->flags & PF_FROZEN;
+}
+
+/*
+ * Check if there is a request to freeze a process
+ */
+static inline int freezing(struct task_struct *p)
+{
+	return p->flags & PF_FREEZE;
+}
+
+/*
+ * Request that a process be frozen
+ * FIXME: SMP problem. We may not modify other process' flags!
+ */
+static inline void freeze(struct task_struct *p)
+{
+	p->flags |= PF_FREEZE;
+}
+
+/*
+ * Wake up a frozen process
+ */
+static inline int thaw_process(struct task_struct *p)
+{
+	if (frozen(p)) {
+		p->flags &= ~PF_FROZEN;
+		wake_up_process(p);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * freezing is complete, mark process as frozen
+ */
+static inline void frozen_process(struct task_struct *p)
+{
+	p->flags = (p->flags & ~PF_FREEZE) | PF_FROZEN;
+}
+
+extern void refrigerator(void);
 extern int freeze_processes(void);
 extern void thaw_processes(void);
 
-static inline int try_to_freeze(unsigned long refrigerator_flags)
+static inline int try_to_freeze(void)
 {
-	if (unlikely(current->flags & PF_FREEZE)) {
-		refrigerator(refrigerator_flags);
+	if (freezing(current)) {
+		refrigerator();
 		return 1;
 	} else
 		return 0;
 }
 #else
-static inline void refrigerator(unsigned long flag) {}
+static inline int frozen(struct task_struct *p) { return 0; }
+static inline int freezing(struct task_struct *p) { return 0; }
+static inline void freeze(struct task_struct *p) { BUG(); }
+static inline int thaw_process(struct task_struct *p) { return 1; }
+static inline void frozen_process(struct task_struct *p) { BUG(); }
+
+static inline void refrigerator(void) {}
 static inline int freeze_processes(void) { BUG(); return 0; }
 static inline void thaw_processes(void) {}
 
-static inline int try_to_freeze(unsigned long refrigerator_flags)
-{
-	return 0;
-}
+static inline int try_to_freeze(void) { return 0; }
+
 #endif /* CONFIG_PM */
 #endif /* __KERNEL__ */
 
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 78d92dc6a1ed..0a086640bcfc 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -32,7 +32,7 @@ static inline int freezeable(struct task_struct * p)
 }
 
 /* Refrigerator is place where frozen processes are stored :-). */
-void refrigerator(unsigned long flag)
+void refrigerator(void)
 {
 	/* Hmm, should we be allowed to suspend when there are realtime
 	   processes around? */
@@ -41,14 +41,13 @@ void refrigerator(unsigned long flag)
 	current->state = TASK_UNINTERRUPTIBLE;
 	pr_debug("%s entered refrigerator\n", current->comm);
 	printk("=");
-	current->flags &= ~PF_FREEZE;
 
+	frozen_process(current);
 	spin_lock_irq(&current->sighand->siglock);
 	recalc_sigpending(); /* We sent fake signal, clean it up */
 	spin_unlock_irq(&current->sighand->siglock);
 
-	current->flags |= PF_FROZEN;
-	while (current->flags & PF_FROZEN)
+	while (frozen(current))
 		schedule();
 	pr_debug("%s left refrigerator\n", current->comm);
 	current->state = save;
@@ -57,10 +56,10 @@ void refrigerator(unsigned long flag)
 /* 0 = success, else # of processes that we failed to stop */
 int freeze_processes(void)
 {
-       int todo;
-       unsigned long start_time;
+	int todo;
+	unsigned long start_time;
 	struct task_struct *g, *p;
-	
+
 	printk( "Stopping tasks: " );
 	start_time = jiffies;
 	do {
@@ -70,14 +69,12 @@ int freeze_processes(void)
 			unsigned long flags;
 			if (!freezeable(p))
 				continue;
-			if ((p->flags & PF_FROZEN) ||
+			if ((frozen(p)) ||
 			    (p->state == TASK_TRACED) ||
 			    (p->state == TASK_STOPPED))
 				continue;
 
-			/* FIXME: smp problem here: we may not access other process' flags
-			   without locking */
-			p->flags |= PF_FREEZE;
+			freeze(p);
 			spin_lock_irqsave(&p->sighand->siglock, flags);
 			signal_wake_up(p, 0);
 			spin_unlock_irqrestore(&p->sighand->siglock, flags);
@@ -91,7 +88,7 @@ int freeze_processes(void)
 			return todo;
 		}
 	} while(todo);
-	
+
 	printk( "|\n" );
 	BUG_ON(in_atomic());
 	return 0;
@@ -106,10 +103,7 @@ void thaw_processes(void)
 	do_each_thread(g, p) {
 		if (!freezeable(p))
 			continue;
-		if (p->flags & PF_FROZEN) {
-			p->flags &= ~PF_FROZEN;
-			wake_up_process(p);
-		} else
+		if (!thaw_process(p))
 			printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
 	} while_each_thread(g, p);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 76080d142e3d..6fa9ea4ae44c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4174,8 +4174,7 @@ static int migration_thread(void * data)
 		struct list_head *head;
 		migration_req_t *req;
 
-		if (current->flags & PF_FREEZE)
-			refrigerator(PF_FREEZE);
+		try_to_freeze();
 
 		spin_lock_irq(&rq->lock);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index d1258729a5f9..ca1186eef938 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -213,7 +213,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
 fastcall void recalc_sigpending_tsk(struct task_struct *t)
 {
 	if (t->signal->group_stop_count > 0 ||
-	    (t->flags & PF_FREEZE) ||
+	    (freezing(t)) ||
 	    PENDING(&t->pending, &t->blocked) ||
 	    PENDING(&t->signal->shared_pending, &t->blocked))
 		set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -2231,8 +2231,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
 			current->state = TASK_INTERRUPTIBLE;
 			timeout = schedule_timeout(timeout);
 
-			if (current->flags & PF_FREEZE)
-				refrigerator(PF_FREEZE);
+			try_to_freeze();
 			spin_lock_irq(&current->sighand->siglock);
 			sig = dequeue_signal(current, &these, &info);
 			current->blocked = current->real_blocked;
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 38ce279cc8cd..d6781951267e 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -105,7 +105,7 @@ static int __pdflush(struct pdflush_work *my_work)
 		spin_unlock_irq(&pdflush_lock);
 
 		schedule();
-		if (try_to_freeze(PF_FREEZE)) {
+		if (try_to_freeze()) {
 			spin_lock_irq(&pdflush_lock);
 			continue;
 		}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4b8e62a19370..1fa312a8db77 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1216,8 +1216,8 @@ static int kswapd(void *p)
 	order = 0;
 	for ( ; ; ) {
 		unsigned long new_order;
-		if (current->flags & PF_FREEZE)
-			refrigerator(PF_FREEZE);
+
+		try_to_freeze();
 
 		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 		new_order = pgdat->kswapd_max_order;
diff --git a/net/rxrpc/krxiod.c b/net/rxrpc/krxiod.c
index 2b537f425a17..dada34a77b21 100644
--- a/net/rxrpc/krxiod.c
+++ b/net/rxrpc/krxiod.c
@@ -138,7 +138,7 @@ static int rxrpc_krxiod(void *arg)
 
 		_debug("### End Work");
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
                 /* discard pending signals */
 		rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c
index 6020c89d9228..1aadd026d354 100644
--- a/net/rxrpc/krxsecd.c
+++ b/net/rxrpc/krxsecd.c
@@ -107,7 +107,7 @@ static int rxrpc_krxsecd(void *arg)
 
 		_debug("### End Inbound Calls");
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
                 /* discard pending signals */
 		rxrpc_discard_my_signals();
diff --git a/net/rxrpc/krxtimod.c b/net/rxrpc/krxtimod.c
index 249c2b0290bb..3ac81cdd1211 100644
--- a/net/rxrpc/krxtimod.c
+++ b/net/rxrpc/krxtimod.c
@@ -90,7 +90,7 @@ static int krxtimod(void *arg)
 			complete_and_exit(&krxtimod_dead, 0);
 		}
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
 		/* discard pending signals */
 		rxrpc_discard_my_signals();
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 05907035bc96..56db8f13e6cb 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1185,8 +1185,8 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
 	arg->page_len = (pages-2)*PAGE_SIZE;
 	arg->len = (pages-1)*PAGE_SIZE;
 	arg->tail[0].iov_len = 0;
-	
-	try_to_freeze(PF_FREEZE);
+
+	try_to_freeze();
 	if (signalled())
 		return -EINTR;
 
@@ -1227,7 +1227,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout)
 
 		schedule_timeout(timeout);
 
-		try_to_freeze(PF_FREEZE);
+		try_to_freeze();
 
 		spin_lock_bh(&serv->sv_lock);
 		remove_wait_queue(&rqstp->rq_wait, &wait);
-- 
cgit v1.2.3-59-g8ed1b


From 169a3e66637c667b43dab7c319ffd5c99804cad8 Mon Sep 17 00:00:00 2001
From: Jay Vosburgh <fubar@us.ibm.com>
Date: Sun, 26 Jun 2005 17:54:11 -0400
Subject: bonding: xor/802.3ad improved slave hash

Add support for alternate slave selection algorithms to bonding
balance-xor and 802.3ad modes.  Default mode (what we have now: xor of
MAC addresses) is "layer2", new choice is "layer3+4", using IP and port
information for hashing to select peer.

Originally submitted by Jason Gabler for balance-xor mode;
modified by Jay Vosburgh to additionally support 802.3ad mode.  Jason's
original comment is as follows:

The attached patch to the Linux Etherchannel Bonding driver modifies the
driver's "balance-xor" mode as follows:

      - alternate hashing policy support for mode 2
        * Added kernel parameter "xmit_policy" to allow the specification
          of different hashing policies for mode 2.  The original mode 2
          policy is the default, now found in xmit_hash_policy_layer2().
        * Added xmit_hash_policy_layer34()

This patch was inspired by hashing policies implemented by Cisco,
Foundry and IBM, which are explained in
Foundry documentation found at:
http://www.foundrynet.com/services/documentation/sribcg/Trunking.html#112750

Signed-off-by: Jason Gabler <jygabler@lbl.gov>
Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
---
 drivers/net/bonding/bond_3ad.c  |   3 +-
 drivers/net/bonding/bond_main.c | 107 ++++++++++++++++++++++++++++++++++++----
 drivers/net/bonding/bonding.h   |  10 +++-
 include/linux/if_bonding.h      |   7 +++
 4 files changed, 114 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 6233c4ffb805..a2e8dda5afac 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2346,7 +2346,6 @@ int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev)
 {
 	struct slave *slave, *start_at;
 	struct bonding *bond = dev->priv;
-	struct ethhdr *data = (struct ethhdr *)skb->data;
 	int slave_agg_no;
 	int slaves_in_agg;
 	int agg_id;
@@ -2377,7 +2376,7 @@ int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev)
 		goto out;
 	}
 
-	slave_agg_no = (data->h_dest[5]^bond->dev->dev_addr[5]) % slaves_in_agg;
+	slave_agg_no = bond->xmit_hash_policy(skb, dev, slaves_in_agg);
 
 	bond_for_each_slave(bond, slave, i) {
 		struct aggregator *agg = SLAVE_AD_INFO(slave).port.aggregator;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 545f6fe025a8..2c930da90a85 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -479,6 +479,14 @@
  * 	- Support for generating gratuitous ARPs in active-backup mode.
  * 	  Includes support for VLAN tagging all bonding-generated ARPs
  * 	  as needed.  Set version to 2.6.2.
+ * 2005/06/08 - Jason Gabler <jygabler at lbl dot gov>
+ *	- alternate hashing policy support for mode 2
+ *	  * Added kernel parameter "xmit_hash_policy" to allow the selection
+ *	    of different hashing policies for mode 2.  The original mode 2
+ *	    policy is the default, now found in xmit_hash_policy_layer2().
+ *	  * Added xmit_hash_policy_layer34()
+ *	- Modified by Jay Vosburgh <fubar@us.ibm.com> to also support mode 4.
+ *	  Set version to 2.6.3.
  */
 
 //#define BONDING_DEBUG 1
@@ -493,7 +501,10 @@
 #include <linux/ptrace.h>
 #include <linux/ioport.h>
 #include <linux/in.h>
+#include <net/ip.h>
 #include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/init.h>
@@ -541,6 +552,7 @@ static int use_carrier	= 1;
 static char *mode	= NULL;
 static char *primary	= NULL;
 static char *lacp_rate	= NULL;
+static char *xmit_hash_policy = NULL;
 static int arp_interval = BOND_LINK_ARP_INTERV;
 static char *arp_ip_target[BOND_MAX_ARP_TARGETS] = { NULL, };
 
@@ -560,6 +572,8 @@ module_param(primary, charp, 0);
 MODULE_PARM_DESC(primary, "Primary network device to use");
 module_param(lacp_rate, charp, 0);
 MODULE_PARM_DESC(lacp_rate, "LACPDU tx rate to request from 802.3ad partner (slow/fast)");
+module_param(xmit_hash_policy, charp, 0);
+MODULE_PARM_DESC(xmit_hash_policy, "XOR hashing method : 0 for layer 2 (default), 1 for layer 3+4");
 module_param(arp_interval, int, 0);
 MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");
 module_param_array(arp_ip_target, charp, NULL, 0);
@@ -579,6 +593,7 @@ static struct proc_dir_entry *bond_proc_dir = NULL;
 static u32 arp_target[BOND_MAX_ARP_TARGETS] = { 0, } ;
 static int arp_ip_count	= 0;
 static int bond_mode	= BOND_MODE_ROUNDROBIN;
+static int xmit_hashtype= BOND_XMIT_POLICY_LAYER2;
 static int lacp_fast	= 0;
 static int app_abi_ver	= 0;
 static int orig_app_abi_ver = -1; /* This is used to save the first ABI version
@@ -588,7 +603,6 @@ static int orig_app_abi_ver = -1; /* This is used to save the first ABI version
 				   * command comes from an application using
 				   * another ABI version.
 				   */
-
 struct bond_parm_tbl {
 	char *modename;
 	int mode;
@@ -611,9 +625,15 @@ static struct bond_parm_tbl bond_mode_tbl[] = {
 {	NULL,			-1},
 };
 
+static struct bond_parm_tbl xmit_hashtype_tbl[] = {
+{	"layer2",		BOND_XMIT_POLICY_LAYER2},
+{	"layer3+4",		BOND_XMIT_POLICY_LAYER34},
+{	NULL,			-1},
+};
+
 /*-------------------------- Forward declarations ---------------------------*/
 
-static inline void bond_set_mode_ops(struct net_device *bond_dev, int mode);
+static inline void bond_set_mode_ops(struct bonding *bond, int mode);
 static void bond_send_gratuitous_arp(struct bonding *bond);
 
 /*---------------------------- General routines -----------------------------*/
@@ -3724,6 +3744,46 @@ static void bond_unregister_lacpdu(struct bonding *bond)
 	dev_remove_pack(&(BOND_AD_INFO(bond).ad_pkt_type));
 }
 
+/*---------------------------- Hashing Policies -----------------------------*/
+
+/*
+ * Hash for the output device based upon layer 3 and layer 4 data. If
+ * the packet is a frag or not TCP or UDP, just use layer 3 data.  If it is
+ * altogether not IP, mimic bond_xmit_hash_policy_l2()
+ */
+static int bond_xmit_hash_policy_l34(struct sk_buff *skb,
+				    struct net_device *bond_dev, int count)
+{
+	struct ethhdr *data = (struct ethhdr *)skb->data;
+	struct iphdr *iph = skb->nh.iph;
+	u16 *layer4hdr = (u16 *)((u32 *)iph + iph->ihl);
+	int layer4_xor = 0;
+
+	if (skb->protocol == __constant_htons(ETH_P_IP)) {
+		if (!(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) &&
+		    (iph->protocol == IPPROTO_TCP ||
+		     iph->protocol == IPPROTO_UDP)) {
+			layer4_xor = htons((*layer4hdr ^ *(layer4hdr + 1)));
+		}
+		return (layer4_xor ^
+			((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;
+
+	}
+
+	return (data->h_dest[5] ^ bond_dev->dev_addr[5]) % count;
+}
+
+/*
+ * Hash for the output device based upon layer 2 data
+ */
+static int bond_xmit_hash_policy_l2(struct sk_buff *skb,
+				   struct net_device *bond_dev, int count)
+{
+	struct ethhdr *data = (struct ethhdr *)skb->data;
+
+	return (data->h_dest[5] ^ bond_dev->dev_addr[5]) % count;
+}
+
 /*-------------------------- Device entry points ----------------------------*/
 
 static int bond_open(struct net_device *bond_dev)
@@ -4310,14 +4370,13 @@ out:
 }
 
 /*
- * in XOR mode, we determine the output device by performing xor on
- * the source and destination hw adresses.  If this device is not
- * enabled, find the next slave following this xor slave.
+ * In bond_xmit_xor(), we determine the output device by using a pre-
+ * determined xmit_hash_policy().  If the selected device is not enabled,
+ * find the next active slave.
  */
 static int bond_xmit_xor(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = bond_dev->priv;
-	struct ethhdr *data = (struct ethhdr *)skb->data;
 	struct slave *slave, *start_at;
 	int slave_no;
 	int i;
@@ -4329,7 +4388,7 @@ static int bond_xmit_xor(struct sk_buff *skb, struct net_device *bond_dev)
 		goto out;
 	}
 
-	slave_no = (data->h_dest[5]^bond_dev->dev_addr[5]) % bond->slave_cnt;
+	slave_no = bond->xmit_hash_policy(skb, bond_dev, bond->slave_cnt);
 
 	bond_for_each_slave(bond, slave, i) {
 		slave_no--;
@@ -4425,8 +4484,10 @@ out:
 /*
  * set bond mode specific net device operations
  */
-static inline void bond_set_mode_ops(struct net_device *bond_dev, int mode)
+static inline void bond_set_mode_ops(struct bonding *bond, int mode)
 {
+	struct net_device *bond_dev = bond->dev;
+
 	switch (mode) {
 	case BOND_MODE_ROUNDROBIN:
 		bond_dev->hard_start_xmit = bond_xmit_roundrobin;
@@ -4436,12 +4497,20 @@ static inline void bond_set_mode_ops(struct net_device *bond_dev, int mode)
 		break;
 	case BOND_MODE_XOR:
 		bond_dev->hard_start_xmit = bond_xmit_xor;
+		if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34)
+			bond->xmit_hash_policy = bond_xmit_hash_policy_l34;
+		else
+			bond->xmit_hash_policy = bond_xmit_hash_policy_l2;
 		break;
 	case BOND_MODE_BROADCAST:
 		bond_dev->hard_start_xmit = bond_xmit_broadcast;
 		break;
 	case BOND_MODE_8023AD:
 		bond_dev->hard_start_xmit = bond_3ad_xmit_xor;
+		if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34)
+			bond->xmit_hash_policy = bond_xmit_hash_policy_l34;
+		else
+			bond->xmit_hash_policy = bond_xmit_hash_policy_l2;
 		break;
 	case BOND_MODE_TLB:
 	case BOND_MODE_ALB:
@@ -4490,7 +4559,7 @@ static int __init bond_init(struct net_device *bond_dev, struct bond_params *par
 	bond_dev->change_mtu = bond_change_mtu;
 	bond_dev->set_mac_address = bond_set_mac_address;
 
-	bond_set_mode_ops(bond_dev, bond->params.mode);
+	bond_set_mode_ops(bond, bond->params.mode);
 
 	bond_dev->destructor = free_netdev;
 
@@ -4601,6 +4670,25 @@ static int bond_check_params(struct bond_params *params)
 		}
 	}
 
+	if (xmit_hash_policy) {
+		if ((bond_mode != BOND_MODE_XOR) &&
+		    (bond_mode != BOND_MODE_8023AD)) {
+			printk(KERN_INFO DRV_NAME
+			       ": xor_mode param is irrelevant in mode %s\n",
+			       bond_mode_name(bond_mode));
+		} else {
+			xmit_hashtype = bond_parse_parm(xmit_hash_policy,
+							xmit_hashtype_tbl);
+			if (xmit_hashtype == -1) {
+				printk(KERN_ERR DRV_NAME
+			       	": Error: Invalid xmit_hash_policy \"%s\"\n",
+			       	xmit_hash_policy == NULL ? "NULL" :
+				       xmit_hash_policy);
+				return -EINVAL;
+			}
+		}
+	}
+
 	if (lacp_rate) {
 		if (bond_mode != BOND_MODE_8023AD) {
 			printk(KERN_INFO DRV_NAME
@@ -4812,6 +4900,7 @@ static int bond_check_params(struct bond_params *params)
 
 	/* fill params struct with the proper values */
 	params->mode = bond_mode;
+	params->xmit_policy = xmit_hashtype;
 	params->miimon = miimon;
 	params->arp_interval = arp_interval;
 	params->updelay = updelay;
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index 6558af22eda4..d27f377b3eeb 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -25,6 +25,10 @@
  *
  * 2003/12/01 - Shmulik Hen <shmulik.hen at intel dot com>
  *	- Code cleanup and style changes
+ *
+ * 2005/05/05 - Jason Gabler <jygabler at lbl dot gov>
+ *      - added "xmit_policy" kernel parameter for alternate hashing policy
+ *	  support for mode 2
  */
 
 #ifndef _LINUX_BONDING_H
@@ -36,8 +40,8 @@
 #include "bond_3ad.h"
 #include "bond_alb.h"
 
-#define DRV_VERSION	"2.6.2"
-#define DRV_RELDATE	"June 5, 2005"
+#define DRV_VERSION	"2.6.3"
+#define DRV_RELDATE	"June 8, 2005"
 #define DRV_NAME	"bonding"
 #define DRV_DESCRIPTION	"Ethernet Channel Bonding Driver"
 
@@ -137,6 +141,7 @@
 
 struct bond_params {
 	int mode;
+	int xmit_policy;
 	int miimon;
 	int arp_interval;
 	int use_carrier;
@@ -198,6 +203,7 @@ struct bonding {
 #endif /* CONFIG_PROC_FS */
 	struct   list_head bond_list;
 	struct   dev_mc_list *mc_list;
+	int      (*xmit_hash_policy)(struct sk_buff *, struct net_device *, int);
 	u32      master_ip;
 	u16      flags;
 	struct   ad_bond_info ad_info;
diff --git a/include/linux/if_bonding.h b/include/linux/if_bonding.h
index 57024ce2c74f..84598fa2e9de 100644
--- a/include/linux/if_bonding.h
+++ b/include/linux/if_bonding.h
@@ -35,6 +35,9 @@
  *
  * 2003/12/01 - Shmulik Hen <shmulik.hen at intel dot com>
  *	- Code cleanup and style changes
+ *
+ * 2005/05/05 - Jason Gabler <jygabler at lbl dot gov>
+ *      - added definitions for various XOR hashing policies
  */
 
 #ifndef _LINUX_IF_BONDING_H
@@ -80,6 +83,10 @@
 
 #define BOND_DEFAULT_MAX_BONDS  1   /* Default maximum number of devices to support */
 
+/* hashing types */
+#define BOND_XMIT_POLICY_LAYER2		0 /* layer 2 (MAC only), default */
+#define BOND_XMIT_POLICY_LAYER34	1 /* layer 3+4 (IP ^ TCP/UDP ports) */
+
 typedef struct ifbond {
 	__s32 bond_mode;
 	__s32 num_slaves;
-- 
cgit v1.2.3-59-g8ed1b


From 32e9e25ef20789c24ffa1f41489a13932cf82c77 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 26 Jun 2005 15:28:10 -0700
Subject: [ATALK]: Include asm/byteorder.h in linux/atalk.h

We're using __be16 in userland visible types, so we
have to include asm/byteorder.h so that works.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/atalk.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/atalk.h b/include/linux/atalk.h
index 09a1451c1159..911c09cb9bf9 100644
--- a/include/linux/atalk.h
+++ b/include/linux/atalk.h
@@ -1,6 +1,8 @@
 #ifndef __LINUX_ATALK_H__
 #define __LINUX_ATALK_H__
 
+#include <asm/byteorder.h>
+
 /*
  * AppleTalk networking structures
  *
-- 
cgit v1.2.3-59-g8ed1b


From f49d16ef2d6f008119d4ee2c895781fb229bad68 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Sun, 26 Jun 2005 11:36:52 +0200
Subject: [PATCH] forcedeth: Add support for new device id

Add the PCI device ids for the two MCP55 Ethernet controllers
(NVENET_14 and NVENET_15) to forcedeth and bump the driver version to 0.35.
---
 drivers/net/forcedeth.c | 17 ++++++++++++++++-
 include/linux/pci_ids.h |  2 ++
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index b471d1a8ffdc..64f0f697c958 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -84,6 +84,7 @@
  *	0.32: 16 Apr 2005: RX_ERROR4 handling added.
  *	0.33: 16 May 2005: Support for MCP51 added.
  *	0.34: 18 Jun 2005: Add DEV_NEED_LINKTIMER to all nForce nics.
+ *	0.35: 26 Jun 2005: Support for MCP55 added.
  *
  * Known bugs:
  * We suspect that on some hardware no TX done interrupts are generated.
@@ -95,7 +96,7 @@
  * DEV_NEED_TIMERIRQ will not harm you on sane hardware, only generating a few
  * superfluous timer interrupts from the nic.
  */
-#define FORCEDETH_VERSION		"0.34"
+#define FORCEDETH_VERSION		"0.35"
 #define DRV_NAME			"forcedeth"
 
 #include <linux/module.h>
@@ -2284,6 +2285,20 @@ static struct pci_device_id pci_tbl[] = {
 		.subdevice = PCI_ANY_ID,
 		.driver_data = DEV_NEED_LASTPACKET1|DEV_IRQMASK_2|DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER,
 	},
+	{	/* MCP55 Ethernet Controller */
+		.vendor = PCI_VENDOR_ID_NVIDIA,
+		.device = PCI_DEVICE_ID_NVIDIA_NVENET_14,
+		.subvendor = PCI_ANY_ID,
+		.subdevice = PCI_ANY_ID,
+		.driver_data = DEV_NEED_LASTPACKET1|DEV_IRQMASK_2|DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER,
+	},
+	{	/* MCP55 Ethernet Controller */
+		.vendor = PCI_VENDOR_ID_NVIDIA,
+		.device = PCI_DEVICE_ID_NVIDIA_NVENET_15,
+		.subvendor = PCI_ANY_ID,
+		.subdevice = PCI_ANY_ID,
+		.driver_data = DEV_NEED_LASTPACKET1|DEV_IRQMASK_2|DEV_NEED_TIMERIRQ|DEV_NEED_LINKTIMER,
+	},
 	{0,},
 };
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index bf608808a60c..3af7450278b7 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1284,6 +1284,8 @@
 #define PCI_DEVICE_ID_NVIDIA_GEFORCE_FX_GO5700_2    0x0348
 #define PCI_DEVICE_ID_NVIDIA_QUADRO_FX_GO1000       0x034C
 #define PCI_DEVICE_ID_NVIDIA_QUADRO_FX_1100         0x034E
+#define PCI_DEVICE_ID_NVIDIA_NVENET_14              0x0372
+#define PCI_DEVICE_ID_NVIDIA_NVENET_15              0x0373
 
 #define PCI_VENDOR_ID_IMS		0x10e0
 #define PCI_DEVICE_ID_IMS_8849		0x8849
-- 
cgit v1.2.3-59-g8ed1b


From ec9f47cd6a14ca069bb7552a984c0a338fc7262b Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Mon, 27 Jun 2005 11:12:54 +0100
Subject: [PATCH] Serial: Split 8250 port table

Add separate files for the different 8250 ISA-based serial boards.

Looking across all the various architectures, it seems reasonable that
we can key the availability of the configuration options for these
beasts to the bus-related symbols (iow, CONFIG_ISA).  We also standardise
the base baud/uart clock rate for these boards - I'm sure that isn't
architecture specific, but is solely dependent on the crystal fitted
on the board (which should be the same no matter what type of machine
it's fitted into.)

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/serial/8250.c          | 25 +++++---------
 drivers/serial/8250_accent.c   | 47 ++++++++++++++++++++++++++
 drivers/serial/8250_boca.c     | 61 ++++++++++++++++++++++++++++++++++
 drivers/serial/8250_fourport.c | 53 +++++++++++++++++++++++++++++
 drivers/serial/8250_hub6.c     | 58 ++++++++++++++++++++++++++++++++
 drivers/serial/8250_mca.c      | 64 +++++++++++++++++++++++++++++++++++
 drivers/serial/Kconfig         | 75 +++++++++++++++++++++++++++++++++---------
 drivers/serial/Makefile        |  5 +++
 include/linux/serial_8250.h    |  1 +
 9 files changed, 357 insertions(+), 32 deletions(-)
 create mode 100644 drivers/serial/8250_accent.c
 create mode 100644 drivers/serial/8250_boca.c
 create mode 100644 drivers/serial/8250_fourport.c
 create mode 100644 drivers/serial/8250_hub6.c
 create mode 100644 drivers/serial/8250_mca.c

(limited to 'include/linux')

diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index d8b9d2b8c200..34e75bc8f4cc 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -77,23 +77,9 @@ static unsigned int share_irqs = SERIAL8250_SHARE_IRQS;
  */
 #define is_real_interrupt(irq)	((irq) != 0)
 
-/*
- * This converts from our new CONFIG_ symbols to the symbols
- * that asm/serial.h expects.  You _NEED_ to comment out the
- * linux/config.h include contained inside asm/serial.h for
- * this to work.
- */
-#undef CONFIG_SERIAL_MANY_PORTS
-#undef CONFIG_SERIAL_DETECT_IRQ
-#undef CONFIG_SERIAL_MULTIPORT
-#undef CONFIG_HUB6
-
 #ifdef CONFIG_SERIAL_8250_DETECT_IRQ
 #define CONFIG_SERIAL_DETECT_IRQ 1
 #endif
-#ifdef CONFIG_SERIAL_8250_MULTIPORT
-#define CONFIG_SERIAL_MULTIPORT 1
-#endif
 #ifdef CONFIG_SERIAL_8250_MANY_PORTS
 #define CONFIG_SERIAL_MANY_PORTS 1
 #endif
@@ -2323,10 +2309,11 @@ static int __devinit serial8250_probe(struct device *dev)
 {
 	struct plat_serial8250_port *p = dev->platform_data;
 	struct uart_port port;
+	int ret, i;
 
 	memset(&port, 0, sizeof(struct uart_port));
 
-	for (; p && p->flags != 0; p++) {
+	for (i = 0; p && p->flags != 0; p++, i++) {
 		port.iobase	= p->iobase;
 		port.membase	= p->membase;
 		port.irq	= p->irq;
@@ -2335,10 +2322,16 @@ static int __devinit serial8250_probe(struct device *dev)
 		port.iotype	= p->iotype;
 		port.flags	= p->flags;
 		port.mapbase	= p->mapbase;
+		port.hub6	= p->hub6;
 		port.dev	= dev;
 		if (share_irqs)
 			port.flags |= UPF_SHARE_IRQ;
-		serial8250_register_port(&port);
+		ret = serial8250_register_port(&port);
+		if (ret < 0) {
+			dev_err(dev, "unable to register port at index %d "
+				"(IO%lx MEM%lx IRQ%d): %d\n", i,
+				p->iobase, p->mapbase, p->irq, ret);
+		}
 	}
 	return 0;
 }
diff --git a/drivers/serial/8250_accent.c b/drivers/serial/8250_accent.c
new file mode 100644
index 000000000000..1f2c276063ef
--- /dev/null
+++ b/drivers/serial/8250_accent.c
@@ -0,0 +1,47 @@
+/*
+ *  linux/drivers/serial/8250_accent.c
+ *
+ *  Copyright (C) 2005 Russell King.
+ *  Data taken from include/asm-i386/serial.h
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/serial_8250.h>
+
+#define PORT(_base,_irq)				\
+	{						\
+		.iobase		= _base,		\
+		.irq		= _irq,			\
+		.uartclk	= 1843200,		\
+		.iotype		= UPIO_PORT,		\
+		.flags		= UPF_BOOT_AUTOCONF,	\
+	}
+
+static struct plat_serial8250_port accent_data[] = {
+	PORT(0x330, 4),
+	PORT(0x338, 4),
+	{ },
+};
+
+static struct platform_device accent_device = {
+	.name			= "serial8250",
+	.id			= 2,
+	.dev			= {
+		.platform_data	= accent_data,
+	},
+};
+
+static int __init accent_init(void)
+{
+	return platform_device_register(&accent_device);
+}
+
+module_init(accent_init);
+
+MODULE_AUTHOR("Russell King");
+MODULE_DESCRIPTION("8250 serial probe module for Accent Async cards");
+MODULE_LICENSE("GPL");
diff --git a/drivers/serial/8250_boca.c b/drivers/serial/8250_boca.c
new file mode 100644
index 000000000000..465c9ea1e7a3
--- /dev/null
+++ b/drivers/serial/8250_boca.c
@@ -0,0 +1,61 @@
+/*
+ *  linux/drivers/serial/8250_boca.c
+ *
+ *  Copyright (C) 2005 Russell King.
+ *  Data taken from include/asm-i386/serial.h
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/serial_8250.h>
+
+#define PORT(_base,_irq)				\
+	{						\
+		.iobase		= _base,		\
+		.irq		= _irq,			\
+		.uartclk	= 1843200,		\
+		.iotype		= UPIO_PORT,		\
+		.flags		= UPF_BOOT_AUTOCONF,	\
+	}
+
+static struct plat_serial8250_port boca_data[] = {
+	PORT(0x100, 12),
+	PORT(0x108, 12),
+	PORT(0x110, 12),
+	PORT(0x118, 12),
+	PORT(0x120, 12),
+	PORT(0x128, 12),
+	PORT(0x130, 12),
+	PORT(0x138, 12),
+	PORT(0x140, 12),
+	PORT(0x148, 12),
+	PORT(0x150, 12),
+	PORT(0x158, 12),
+	PORT(0x160, 12),
+	PORT(0x168, 12),
+	PORT(0x170, 12),
+	PORT(0x178, 12),
+	{ },
+};
+
+static struct platform_device boca_device = {
+	.name			= "serial8250",
+	.id			= 3,
+	.dev			= {
+		.platform_data	= boca_data,
+	},
+};
+
+static int __init boca_init(void)
+{
+	return platform_device_register(&boca_device);
+}
+
+module_init(boca_init);
+
+MODULE_AUTHOR("Russell King");
+MODULE_DESCRIPTION("8250 serial probe module for Boca cards");
+MODULE_LICENSE("GPL");
diff --git a/drivers/serial/8250_fourport.c b/drivers/serial/8250_fourport.c
new file mode 100644
index 000000000000..e9b4d908ef42
--- /dev/null
+++ b/drivers/serial/8250_fourport.c
@@ -0,0 +1,53 @@
+/*
+ *  linux/drivers/serial/8250_fourport.c
+ *
+ *  Copyright (C) 2005 Russell King.
+ *  Data taken from include/asm-i386/serial.h
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/serial_8250.h>
+
+#define PORT(_base,_irq)						\
+	{								\
+		.iobase		= _base,				\
+		.irq		= _irq,					\
+		.uartclk	= 1843200,				\
+		.iotype		= UPIO_PORT,				\
+		.flags		= UPF_BOOT_AUTOCONF | UPF_FOURPORT,	\
+	}
+
+static struct plat_serial8250_port fourport_data[] = {
+	PORT(0x1a0, 9),
+	PORT(0x1a8, 9),
+	PORT(0x1b0, 9),
+	PORT(0x1b8, 9),
+	PORT(0x2a0, 5),
+	PORT(0x2a8, 5),
+	PORT(0x2b0, 5),
+	PORT(0x2b8, 5),
+	{ },
+};
+
+static struct platform_device fourport_device = {
+	.name			= "serial8250",
+	.id			= 1,
+	.dev			= {
+		.platform_data	= fourport_data,
+	},
+};
+
+static int __init fourport_init(void)
+{
+	return platform_device_register(&fourport_device);
+}
+
+module_init(fourport_init);
+
+MODULE_AUTHOR("Russell King");
+MODULE_DESCRIPTION("8250 serial probe module for AST Fourport cards");
+MODULE_LICENSE("GPL");
diff --git a/drivers/serial/8250_hub6.c b/drivers/serial/8250_hub6.c
new file mode 100644
index 000000000000..77f396f84b4c
--- /dev/null
+++ b/drivers/serial/8250_hub6.c
@@ -0,0 +1,58 @@
+/*
+ *  linux/drivers/serial/8250_hub6.c
+ *
+ *  Copyright (C) 2005 Russell King.
+ *  Data taken from include/asm-i386/serial.h
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/serial_8250.h>
+
+#define HUB6(card,port)							\
+	{								\
+		.iobase		= 0x302,				\
+		.irq		= 3,					\
+		.uartclk	= 1843200,				\
+		.iotype		= UPIO_HUB6,				\
+		.flags		= UPF_BOOT_AUTOCONF,			\
+		.hub6		= (card) << 6 | (port) << 3 | 1,	\
+	}
+
+static struct plat_serial8250_port hub6_data[] = {
+	HUB6(0,0),
+	HUB6(0,1),
+	HUB6(0,2),
+	HUB6(0,3),
+	HUB6(0,4),
+	HUB6(0,5),
+	HUB6(1,0),
+	HUB6(1,1),
+	HUB6(1,2),
+	HUB6(1,3),
+	HUB6(1,4),
+	HUB6(1,5),
+	{ },
+};
+
+static struct platform_device hub6_device = {
+	.name			= "serial8250",
+	.id			= 4,
+	.dev			= {
+		.platform_data	= hub6_data,
+	},
+};
+
+static int __init hub6_init(void)
+{
+	return platform_device_register(&hub6_device);
+}
+
+module_init(hub6_init);
+
+MODULE_AUTHOR("Russell King");
+MODULE_DESCRIPTION("8250 serial probe module for Hub6 cards");
+MODULE_LICENSE("GPL");
diff --git a/drivers/serial/8250_mca.c b/drivers/serial/8250_mca.c
new file mode 100644
index 000000000000..f0c40d68b8c1
--- /dev/null
+++ b/drivers/serial/8250_mca.c
@@ -0,0 +1,64 @@
+/*
+ *  linux/drivers/serial/8250_mca.c
+ *
+ *  Copyright (C) 2005 Russell King.
+ *  Data taken from include/asm-i386/serial.h
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mca.h>
+#include <linux/serial_8250.h>
+
+/*
+ * FIXME: Should we be doing AUTO_IRQ here?
+ */
+#ifdef CONFIG_SERIAL_8250_DETECT_IRQ
+#define MCA_FLAGS	UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | UPF_AUTO_IRQ
+#else
+#define MCA_FLAGS	UPF_BOOT_AUTOCONF | UPF_SKIP_TEST
+#endif
+
+#define PORT(_base,_irq)			\
+	{					\
+		.iobase		= _base,	\
+		.irq		= _irq,		\
+		.uartclk	= 1843200,	\
+		.iotype		= UPIO_PORT,	\
+		.flags		= MCA_FLAGS,	\
+	}
+
+static struct plat_serial8250_port mca_data[] = {
+	PORT(0x3220, 3),
+	PORT(0x3228, 3),
+	PORT(0x4220, 3),
+	PORT(0x4228, 3),
+	PORT(0x5220, 3),
+	PORT(0x5228, 3),
+	{ },
+};
+
+static struct platform_device mca_device = {
+	.name			= "serial8250",
+	.id			= 5,
+	.dev			= {
+		.platform_data	= mca_data,
+	},
+};
+
+static int __init mca_init(void)
+{
+	if (!MCA_bus)
+		return -ENODEV;
+	return platform_device_register(&mca_device);
+}
+
+module_init(mca_init);
+
+MODULE_AUTHOR("Russell King");
+MODULE_DESCRIPTION("8250 serial probe module for MCA ports");
+MODULE_LICENSE("GPL");
diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index 25fcef2c42de..e879bce160df 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -89,11 +89,11 @@ config SERIAL_8250_NR_UARTS
 	int "Maximum number of non-legacy 8250/16550 serial ports"
 	depends on SERIAL_8250
 	default "4"
-	---help---
-	  Set this to the number of non-legacy serial ports you want
-	  the driver to support.  This includes any ports discovered
-	  via ACPI or PCI enumeration and any ports that may be added
-	  at run-time via hot-plug.
+	help
+	  Set this to the number of serial ports you want the driver
+	  to support.  This includes any ports discovered via ACPI or
+	  PCI enumeration and any ports that may be added at run-time
+	  via hot-plug, or any ISA multi-port serial cards.
 
 config SERIAL_8250_EXTENDED
 	bool "Extended 8250/16550 serial driver options"
@@ -141,31 +141,74 @@ config SERIAL_8250_DETECT_IRQ
 
 	  If unsure, say N.
 
-config SERIAL_8250_MULTIPORT
-	bool "Support special multiport boards"
-	depends on SERIAL_8250_EXTENDED
-	help
-	  Some multiport serial ports have special ports which are used to
-	  signal when there are any serial ports on the board which need
-	  servicing. Say Y here to enable the serial driver to take advantage
-	  of those special I/O ports.
-
 config SERIAL_8250_RSA
 	bool "Support RSA serial ports"
 	depends on SERIAL_8250_EXTENDED
 	help
 	  ::: To be written :::
 
-comment "Non-8250 serial port support"
+#
+# Multi-port serial cards
+#
+
+config SERIAL_8250_FOURPORT
+	tristate "Support Fourport cards"
+	depends on SERIAL_8250 != n && ISA && SERIAL_8250_MANY_PORTS
+	help
+	  Say Y here if you have an AST FourPort serial board.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called 8250_fourport.
+
+config SERIAL_8250_ACCENT
+	tristate "Support Accent cards"
+	depends on SERIAL_8250 != n && ISA && SERIAL_8250_MANY_PORTS
+	help
+	  Say Y here if you have an Accent Async serial board.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called 8250_accent.
+
+
+config SERIAL_8250_BOCA
+	tristate "Support Boca cards"
+	depends on SERIAL_8250 != n && ISA && SERIAL_8250_MANY_PORTS
+	help
+	  Say Y here if you have a Boca serial board.  Please read the Boca
+	  mini-HOWTO, available from <http://www.tldp.org/docs.html#howto>
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called 8250_boca.
+
+
+config SERIAL_8250_HUB6
+	tristate "Support Hub6 cards"
+	depends on SERIAL_8250 != n && ISA && SERIAL_8250_MANY_PORTS
+	help
+	  Say Y here if you have a HUB6 serial board.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called 8250_hub6.
+
+config SERIAL_8250_MCA
+	tristate "Support 8250-type ports on MCA buses"
+	depends on SERIAL_8250 != n && MCA
+	help
+	  Say Y here if you have MCA serial ports.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called 8250_mca.
 
 config SERIAL_8250_ACORN
 	tristate "Acorn expansion card serial port support"
-	depends on ARM && ARCH_ACORN && SERIAL_8250
+	depends on ARCH_ACORN && SERIAL_8250
 	help
 	  If you have an Atomwide Serial card or Serial Port card for an Acorn
 	  system, say Y to this option.  The driver can handle 1, 2, or 3 port
 	  cards.  If unsure, say N.
 
+comment "Non-8250 serial port support"
+
 config SERIAL_AMBA_PL010
 	tristate "ARM AMBA PL010 serial port support"
 	depends on ARM_AMBA
diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile
index 8f1cdde7dbed..65bd4381685e 100644
--- a/drivers/serial/Makefile
+++ b/drivers/serial/Makefile
@@ -17,6 +17,11 @@ obj-$(CONFIG_SERIAL_8250) += 8250.o $(serial-8250-y)
 obj-$(CONFIG_SERIAL_8250_CS) += serial_cs.o
 obj-$(CONFIG_SERIAL_8250_ACORN) += 8250_acorn.o
 obj-$(CONFIG_SERIAL_8250_CONSOLE) += 8250_early.o
+obj-$(CONFIG_SERIAL_8250_FOURPORT) += 8250_fourport.o
+obj-$(CONFIG_SERIAL_8250_ACCENT) += 8250_accent.o
+obj-$(CONFIG_SERIAL_8250_BOCA) += 8250_boca.o
+obj-$(CONFIG_SERIAL_8250_HUB6) += 8250_hub6.o
+obj-$(CONFIG_SERIAL_8250_MCA) += 8250_mca.o
 obj-$(CONFIG_SERIAL_AMBA_PL010) += amba-pl010.o
 obj-$(CONFIG_SERIAL_AMBA_PL011) += amba-pl011.o
 obj-$(CONFIG_SERIAL_CLPS711X) += clps711x.o
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 823181af6ddf..3e3c1fa35b06 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -22,6 +22,7 @@ struct plat_serial8250_port {
 	unsigned int	uartclk;	/* UART clock rate */
 	unsigned char	regshift;	/* register shift */
 	unsigned char	iotype;		/* UPIO_* */
+	unsigned char	hub6;
 	unsigned int	flags;		/* UPF_* flags */
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 22e2c507c301c3dbbcf91b4948b88f78842ee6c9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Mon, 27 Jun 2005 10:55:12 +0200
Subject: [PATCH] Update cfq io scheduler to time sliced design

This updates the CFQ io scheduler to the new time sliced design (cfq
v3).  It provides full process fairness, while giving excellent
aggregate system throughput even for many competing processes.  It
supports io priorities, either inherited from the cpu nice value or set
directly with the ioprio_get/set syscalls.  The latter closely mimic
set/getpriority.

This import is based on my latest from -mm.

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/syscall_table.S |    2 +
 arch/ia64/kernel/entry.S         |    4 +-
 arch/ppc/kernel/misc.S           |    2 +
 drivers/block/as-iosched.c       |    5 +-
 drivers/block/cfq-iosched.c      | 1910 +++++++++++++++++++++++++-------------
 drivers/block/deadline-iosched.c |    3 +-
 drivers/block/elevator.c         |    9 +-
 drivers/block/ll_rw_blk.c        |   59 +-
 fs/Makefile                      |    1 +
 fs/ioprio.c                      |  172 ++++
 fs/reiserfs/journal.c            |   12 +
 include/asm-i386/unistd.h        |    4 +-
 include/asm-ia64/unistd.h        |    2 +
 include/asm-ppc/unistd.h         |    4 +-
 include/asm-x86_64/unistd.h      |    6 +-
 include/linux/bio.h              |   14 +
 include/linux/blkdev.h           |   25 +-
 include/linux/elevator.h         |    8 +-
 include/linux/fs.h               |   19 +
 include/linux/init_task.h        |    2 +
 include/linux/ioprio.h           |   87 ++
 include/linux/sched.h            |    6 +-
 include/linux/writeback.h        |    6 +-
 kernel/exit.c                    |    2 +
 kernel/fork.c                    |    5 +
 kernel/sched.c                   |    8 -
 26 files changed, 1685 insertions(+), 692 deletions(-)
 create mode 100644 fs/ioprio.c
 create mode 100644 include/linux/ioprio.h

(limited to 'include/linux')

diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 442a6e937b19..3db9a04aec6e 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -289,3 +289,5 @@ ENTRY(sys_call_table)
 	.long sys_add_key
 	.long sys_request_key
 	.long sys_keyctl
+	.long sys_ioprio_set
+	.long sys_ioprio_get		/* 290 */
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index b1d5d3d5276c..785a51b0ad8e 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1577,8 +1577,8 @@ sys_call_table:
 	data8 sys_add_key
 	data8 sys_request_key
 	data8 sys_keyctl
-	data8 sys_ni_syscall
-	data8 sys_ni_syscall			// 1275
+	data8 sys_ioprio_set
+	data8 sys_ioprio_get			// 1275
 	data8 sys_set_zone_reclaim
 	data8 sys_ni_syscall
 	data8 sys_ni_syscall
diff --git a/arch/ppc/kernel/misc.S b/arch/ppc/kernel/misc.S
index b6a63a49a232..191a8def3bdb 100644
--- a/arch/ppc/kernel/misc.S
+++ b/arch/ppc/kernel/misc.S
@@ -1449,3 +1449,5 @@ _GLOBAL(sys_call_table)
 	.long sys_request_key		/* 270 */
 	.long sys_keyctl
 	.long sys_waitid
+	.long sys_ioprio_set
+	.long sys_ioprio_get
diff --git a/drivers/block/as-iosched.c b/drivers/block/as-iosched.c
index 3410b4d294b9..91aeb678135d 100644
--- a/drivers/block/as-iosched.c
+++ b/drivers/block/as-iosched.c
@@ -1806,7 +1806,8 @@ static void as_put_request(request_queue_t *q, struct request *rq)
 	rq->elevator_private = NULL;
 }
 
-static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
+static int as_set_request(request_queue_t *q, struct request *rq,
+			  struct bio *bio, int gfp_mask)
 {
 	struct as_data *ad = q->elevator->elevator_data;
 	struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask);
@@ -1827,7 +1828,7 @@ static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
 	return 1;
 }
 
-static int as_may_queue(request_queue_t *q, int rw)
+static int as_may_queue(request_queue_t *q, int rw, struct bio *bio)
 {
 	int ret = ELV_MQUEUE_MAY;
 	struct as_data *ad = q->elevator->elevator_data;
diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c
index 3ac47dde64da..35f6e569d5e5 100644
--- a/drivers/block/cfq-iosched.c
+++ b/drivers/block/cfq-iosched.c
@@ -21,22 +21,33 @@
 #include <linux/hash.h>
 #include <linux/rbtree.h>
 #include <linux/mempool.h>
-
-static unsigned long max_elapsed_crq;
-static unsigned long max_elapsed_dispatch;
+#include <linux/ioprio.h>
+#include <linux/writeback.h>
 
 /*
  * tunables
  */
 static int cfq_quantum = 4;		/* max queue in one round of service */
 static int cfq_queued = 8;		/* minimum rq allocate limit per-queue*/
-static int cfq_service = HZ;		/* period over which service is avg */
-static int cfq_fifo_expire_r = HZ / 2;	/* fifo timeout for sync requests */
-static int cfq_fifo_expire_w = 5 * HZ;	/* fifo timeout for async requests */
-static int cfq_fifo_rate = HZ / 8;	/* fifo expiry rate */
+static int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
 static int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
 static int cfq_back_penalty = 2;	/* penalty of a backwards seek */
 
+static int cfq_slice_sync = HZ / 10;
+static int cfq_slice_async = HZ / 50;
+static int cfq_slice_async_rq = 2;
+static int cfq_slice_idle = HZ / 50;
+
+#define CFQ_IDLE_GRACE		(HZ / 10)
+#define CFQ_SLICE_SCALE		(5)
+
+#define CFQ_KEY_ASYNC		(0)
+
+/*
+ * disable queueing at the driver/hardware level
+ */
+static int cfq_max_depth = 1;
+
 /*
  * for the hash of cfqq inside the cfqd
  */
@@ -55,6 +66,7 @@ static int cfq_back_penalty = 2;	/* penalty of a backwards seek */
 #define list_entry_hash(ptr)	hlist_entry((ptr), struct cfq_rq, hash)
 
 #define list_entry_cfqq(ptr)	list_entry((ptr), struct cfq_queue, cfq_list)
+#define list_entry_fifo(ptr)	list_entry((ptr), struct request, queuelist)
 
 #define RQ_DATA(rq)		(rq)->elevator_private
 
@@ -75,78 +87,101 @@ static int cfq_back_penalty = 2;	/* penalty of a backwards seek */
 #define rb_entry_crq(node)	rb_entry((node), struct cfq_rq, rb_node)
 #define rq_rb_key(rq)		(rq)->sector
 
-/*
- * threshold for switching off non-tag accounting
- */
-#define CFQ_MAX_TAG		(4)
-
-/*
- * sort key types and names
- */
-enum {
-	CFQ_KEY_PGID,
-	CFQ_KEY_TGID,
-	CFQ_KEY_UID,
-	CFQ_KEY_GID,
-	CFQ_KEY_LAST,
-};
-
-static char *cfq_key_types[] = { "pgid", "tgid", "uid", "gid", NULL };
-
 static kmem_cache_t *crq_pool;
 static kmem_cache_t *cfq_pool;
 static kmem_cache_t *cfq_ioc_pool;
 
+#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
+#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
+#define cfq_class_be(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_BE)
+#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
+
+#define cfq_cfqq_sync(cfqq)	((cfqq)->key != CFQ_KEY_ASYNC)
+
+/*
+ * Per block device queue structure
+ */
 struct cfq_data {
-	struct list_head rr_list;
+	atomic_t ref;
+	request_queue_t *queue;
+
+	/*
+	 * rr list of queues with requests and the count of them
+	 */
+	struct list_head rr_list[CFQ_PRIO_LISTS];
+	struct list_head busy_rr;
+	struct list_head cur_rr;
+	struct list_head idle_rr;
+	unsigned int busy_queues;
+
+	/*
+	 * non-ordered list of empty cfqq's
+	 */
 	struct list_head empty_list;
 
+	/*
+	 * cfqq lookup hash
+	 */
 	struct hlist_head *cfq_hash;
-	struct hlist_head *crq_hash;
 
-	/* queues on rr_list (ie they have pending requests */
-	unsigned int busy_queues;
+	/*
+	 * global crq hash for all queues
+	 */
+	struct hlist_head *crq_hash;
 
 	unsigned int max_queued;
 
-	atomic_t ref;
+	mempool_t *crq_pool;
 
-	int key_type;
+	int rq_in_driver;
 
-	mempool_t *crq_pool;
+	/*
+	 * schedule slice state info
+	 */
+	/*
+	 * idle window management
+	 */
+	struct timer_list idle_slice_timer;
+	struct work_struct unplug_work;
 
-	request_queue_t *queue;
+	struct cfq_queue *active_queue;
+	struct cfq_io_context *active_cic;
+	int cur_prio, cur_end_prio;
+	unsigned int dispatch_slice;
+
+	struct timer_list idle_class_timer;
 
 	sector_t last_sector;
+	unsigned long last_end_request;
 
-	int rq_in_driver;
+	unsigned int rq_starved;
 
 	/*
 	 * tunables, see top of file
 	 */
 	unsigned int cfq_quantum;
 	unsigned int cfq_queued;
-	unsigned int cfq_fifo_expire_r;
-	unsigned int cfq_fifo_expire_w;
-	unsigned int cfq_fifo_batch_expire;
+	unsigned int cfq_fifo_expire[2];
 	unsigned int cfq_back_penalty;
 	unsigned int cfq_back_max;
-	unsigned int find_best_crq;
-
-	unsigned int cfq_tagged;
+	unsigned int cfq_slice[2];
+	unsigned int cfq_slice_async_rq;
+	unsigned int cfq_slice_idle;
+	unsigned int cfq_max_depth;
 };
 
+/*
+ * Per process-grouping structure
+ */
 struct cfq_queue {
 	/* reference count */
 	atomic_t ref;
 	/* parent cfq_data */
 	struct cfq_data *cfqd;
-	/* hash of mergeable requests */
+	/* cfqq lookup hash */
 	struct hlist_node cfq_hash;
 	/* hash key */
-	unsigned long key;
-	/* whether queue is on rr (or empty) list */
-	int on_rr;
+	unsigned int key;
 	/* on either rr or empty list of cfqd */
 	struct list_head cfq_list;
 	/* sorted list of pending requests */
@@ -158,21 +193,35 @@ struct cfq_queue {
 	/* currently allocated requests */
 	int allocated[2];
 	/* fifo list of requests in sort_list */
-	struct list_head fifo[2];
-	/* last time fifo expired */
-	unsigned long last_fifo_expire;
-
-	int key_type;
-
-	unsigned long service_start;
-	unsigned long service_used;
+	struct list_head fifo;
 
-	unsigned int max_rate;
+	unsigned long slice_start;
+	unsigned long slice_end;
+	unsigned long slice_left;
+	unsigned long service_last;
 
 	/* number of requests that have been handed to the driver */
 	int in_flight;
-	/* number of currently allocated requests */
-	int alloc_limit[2];
+
+	/* io prio of this group */
+	unsigned short ioprio, org_ioprio;
+	unsigned short ioprio_class, org_ioprio_class;
+
+	/* whether queue is on rr (or empty) list */
+	unsigned on_rr : 1;
+	/* idle slice, waiting for new request submission */
+	unsigned wait_request : 1;
+	/* set when wait_request gets set, reset on first rq alloc */
+	unsigned must_alloc : 1;
+	/* only gets one must_alloc per slice */
+	unsigned must_alloc_slice : 1;
+	/* idle slice, request added, now waiting to dispatch it */
+	unsigned must_dispatch : 1;
+	/* fifo expire per-slice */
+	unsigned fifo_expire : 1;
+
+	unsigned idle_window : 1;
+	unsigned prio_changed : 1;
 };
 
 struct cfq_rq {
@@ -184,42 +233,17 @@ struct cfq_rq {
 	struct cfq_queue *cfq_queue;
 	struct cfq_io_context *io_context;
 
-	unsigned long service_start;
-	unsigned long queue_start;
-
-	unsigned int in_flight : 1;
-	unsigned int accounted : 1;
-	unsigned int is_sync   : 1;
-	unsigned int is_write  : 1;
+	unsigned in_flight : 1;
+	unsigned accounted : 1;
+	unsigned is_sync   : 1;
+	unsigned requeued  : 1;
 };
 
-static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned long);
+static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int);
 static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *);
-static void cfq_update_next_crq(struct cfq_rq *);
 static void cfq_put_cfqd(struct cfq_data *cfqd);
 
-/*
- * what the fairness is based on (ie how processes are grouped and
- * differentiated)
- */
-static inline unsigned long
-cfq_hash_key(struct cfq_data *cfqd, struct task_struct *tsk)
-{
-	/*
-	 * optimize this so that ->key_type is the offset into the struct
-	 */
-	switch (cfqd->key_type) {
-		case CFQ_KEY_PGID:
-			return process_group(tsk);
-		default:
-		case CFQ_KEY_TGID:
-			return tsk->tgid;
-		case CFQ_KEY_UID:
-			return tsk->uid;
-		case CFQ_KEY_GID:
-			return tsk->gid;
-	}
-}
+#define process_sync(tsk)	((tsk)->flags & PF_SYNCWRITE)
 
 /*
  * lots of deadline iosched dupes, can be abstracted later...
@@ -235,16 +259,12 @@ static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
 
 	if (q->last_merge == crq->request)
 		q->last_merge = NULL;
-
-	cfq_update_next_crq(crq);
 }
 
 static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
 {
 	const int hash_idx = CFQ_MHASH_FN(rq_hash_key(crq->request));
 
-	BUG_ON(!hlist_unhashed(&crq->hash));
-
 	hlist_add_head(&crq->hash, &cfqd->crq_hash[hash_idx]);
 }
 
@@ -257,8 +277,6 @@ static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
 		struct cfq_rq *crq = list_entry_hash(entry);
 		struct request *__rq = crq->request;
 
-		BUG_ON(hlist_unhashed(&crq->hash));
-
 		if (!rq_mergeable(__rq)) {
 			cfq_del_crq_hash(crq);
 			continue;
@@ -287,36 +305,16 @@ cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
 		return crq2;
 	if (crq2 == NULL)
 		return crq1;
+	if (crq1->requeued)
+		return crq1;
+	if (crq2->requeued)
+		return crq2;
 
 	s1 = crq1->request->sector;
 	s2 = crq2->request->sector;
 
 	last = cfqd->last_sector;
 
-#if 0
-	if (!list_empty(&cfqd->queue->queue_head)) {
-		struct list_head *entry = &cfqd->queue->queue_head;
-		unsigned long distance = ~0UL;
-		struct request *rq;
-
-		while ((entry = entry->prev) != &cfqd->queue->queue_head) {
-			rq = list_entry_rq(entry);
-
-			if (blk_barrier_rq(rq))
-				break;
-
-			if (distance < abs(s1 - rq->sector + rq->nr_sectors)) {
-				distance = abs(s1 - rq->sector +rq->nr_sectors);
-				last = rq->sector + rq->nr_sectors;
-			}
-			if (distance < abs(s2 - rq->sector + rq->nr_sectors)) {
-				distance = abs(s2 - rq->sector +rq->nr_sectors);
-				last = rq->sector + rq->nr_sectors;
-			}
-		}
-	}
-#endif
-
 	/*
 	 * by definition, 1KiB is 2 sectors
 	 */
@@ -377,11 +375,13 @@ cfq_find_next_crq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	struct cfq_rq *crq_next = NULL, *crq_prev = NULL;
 	struct rb_node *rbnext, *rbprev;
 
-	if (!ON_RB(&last->rb_node))
-		return NULL;
-
-	if ((rbnext = rb_next(&last->rb_node)) == NULL)
+	if (ON_RB(&last->rb_node))
+		rbnext = rb_next(&last->rb_node);
+	else {
 		rbnext = rb_first(&cfqq->sort_list);
+		if (rbnext == &last->rb_node)
+			rbnext = NULL;
+	}
 
 	rbprev = rb_prev(&last->rb_node);
 
@@ -401,67 +401,53 @@ static void cfq_update_next_crq(struct cfq_rq *crq)
 		cfqq->next_crq = cfq_find_next_crq(cfqq->cfqd, cfqq, crq);
 }
 
-static int cfq_check_sort_rr_list(struct cfq_queue *cfqq)
+static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
 {
-	struct list_head *head = &cfqq->cfqd->rr_list;
-	struct list_head *next, *prev;
-
-	/*
-	 * list might still be ordered
-	 */
-	next = cfqq->cfq_list.next;
-	if (next != head) {
-		struct cfq_queue *cnext = list_entry_cfqq(next);
+	struct cfq_data *cfqd = cfqq->cfqd;
+	struct list_head *list, *entry;
 
-		if (cfqq->service_used > cnext->service_used)
-			return 1;
-	}
+	BUG_ON(!cfqq->on_rr);
 
-	prev = cfqq->cfq_list.prev;
-	if (prev != head) {
-		struct cfq_queue *cprev = list_entry_cfqq(prev);
+	list_del(&cfqq->cfq_list);
 
-		if (cfqq->service_used < cprev->service_used)
-			return 1;
+	if (cfq_class_rt(cfqq))
+		list = &cfqd->cur_rr;
+	else if (cfq_class_idle(cfqq))
+		list = &cfqd->idle_rr;
+	else {
+		/*
+		 * if cfqq has requests in flight, don't allow it to be
+		 * found in cfq_set_active_queue before it has finished them.
+		 * this is done to increase fairness between a process that
+		 * has lots of io pending vs one that only generates one
+		 * sporadically or synchronously
+		 */
+		if (cfqq->in_flight)
+			list = &cfqd->busy_rr;
+		else
+			list = &cfqd->rr_list[cfqq->ioprio];
 	}
 
-	return 0;
-}
-
-static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue)
-{
-	struct list_head *entry = &cfqq->cfqd->rr_list;
-
-	if (!cfqq->on_rr)
-		return;
-	if (!new_queue && !cfq_check_sort_rr_list(cfqq))
+	/*
+	 * if queue was preempted, just add to front to be fair. busy_rr
+	 * isn't sorted.
+	 */
+	if (preempted || list == &cfqd->busy_rr) {
+		list_add(&cfqq->cfq_list, list);
 		return;
-
-	list_del(&cfqq->cfq_list);
+	}
 
 	/*
-	 * sort by our mean service_used, sub-sort by in-flight requests
+	 * sort by when queue was last serviced
 	 */
-	while ((entry = entry->prev) != &cfqq->cfqd->rr_list) {
+	entry = list;
+	while ((entry = entry->prev) != list) {
 		struct cfq_queue *__cfqq = list_entry_cfqq(entry);
 
-		if (cfqq->service_used > __cfqq->service_used)
+		if (!__cfqq->service_last)
+			break;
+		if (time_before(__cfqq->service_last, cfqq->service_last))
 			break;
-		else if (cfqq->service_used == __cfqq->service_used) {
-			struct list_head *prv;
-
-			while ((prv = entry->prev) != &cfqq->cfqd->rr_list) {
-				__cfqq = list_entry_cfqq(prv);
-
-				WARN_ON(__cfqq->service_used > cfqq->service_used);
-				if (cfqq->service_used != __cfqq->service_used)
-					break;
-				if (cfqq->in_flight > __cfqq->in_flight)
-					break;
-
-				entry = prv;
-			}
-		}
 	}
 
 	list_add(&cfqq->cfq_list, entry);
@@ -469,28 +455,24 @@ static void cfq_sort_rr_list(struct cfq_queue *cfqq, int new_queue)
 
 /*
  * add to busy list of queues for service, trying to be fair in ordering
- * the pending list according to requests serviced
+ * the pending list according to last request service
  */
 static inline void
-cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq, int requeue)
 {
-	/*
-	 * it's currently on the empty list
-	 */
+	BUG_ON(cfqq->on_rr);
 	cfqq->on_rr = 1;
 	cfqd->busy_queues++;
 
-	if (time_after(jiffies, cfqq->service_start + cfq_service))
-		cfqq->service_used >>= 3;
-
-	cfq_sort_rr_list(cfqq, 1);
+	cfq_resort_rr_list(cfqq, requeue);
 }
 
 static inline void
 cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
-	list_move(&cfqq->cfq_list, &cfqd->empty_list);
+	BUG_ON(!cfqq->on_rr);
 	cfqq->on_rr = 0;
+	list_move(&cfqq->cfq_list, &cfqd->empty_list);
 
 	BUG_ON(!cfqd->busy_queues);
 	cfqd->busy_queues--;
@@ -505,16 +487,17 @@ static inline void cfq_del_crq_rb(struct cfq_rq *crq)
 
 	if (ON_RB(&crq->rb_node)) {
 		struct cfq_data *cfqd = cfqq->cfqd;
+		const int sync = crq->is_sync;
 
-		BUG_ON(!cfqq->queued[crq->is_sync]);
+		BUG_ON(!cfqq->queued[sync]);
+		cfqq->queued[sync]--;
 
 		cfq_update_next_crq(crq);
 
-		cfqq->queued[crq->is_sync]--;
 		rb_erase(&crq->rb_node, &cfqq->sort_list);
 		RB_CLEAR_COLOR(&crq->rb_node);
 
-		if (RB_EMPTY(&cfqq->sort_list) && cfqq->on_rr)
+		if (cfqq->on_rr && RB_EMPTY(&cfqq->sort_list))
 			cfq_del_cfqq_rr(cfqd, cfqq);
 	}
 }
@@ -562,7 +545,7 @@ static void cfq_add_crq_rb(struct cfq_rq *crq)
 	rb_insert_color(&crq->rb_node, &cfqq->sort_list);
 
 	if (!cfqq->on_rr)
-		cfq_add_cfqq_rr(cfqd, cfqq);
+		cfq_add_cfqq_rr(cfqd, cfqq, crq->requeued);
 
 	/*
 	 * check if this request is a better next-serve candidate
@@ -581,11 +564,10 @@ cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
 	cfq_add_crq_rb(crq);
 }
 
-static struct request *
-cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
+static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
+
 {
-	const unsigned long key = cfq_hash_key(cfqd, current);
-	struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, key);
+	struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid);
 	struct rb_node *n;
 
 	if (!cfqq)
@@ -609,20 +591,23 @@ out:
 
 static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
 {
+	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_rq *crq = RQ_DATA(rq);
 
 	if (crq) {
 		struct cfq_queue *cfqq = crq->cfq_queue;
 
-		if (cfqq->cfqd->cfq_tagged) {
-			cfqq->service_used--;
-			cfq_sort_rr_list(cfqq, 0);
-		}
-
 		if (crq->accounted) {
 			crq->accounted = 0;
-			cfqq->cfqd->rq_in_driver--;
+			WARN_ON(!cfqd->rq_in_driver);
+			cfqd->rq_in_driver--;
 		}
+		if (crq->in_flight) {
+			crq->in_flight = 0;
+			WARN_ON(!cfqq->in_flight);
+			cfqq->in_flight--;
+		}
+		crq->requeued = 1;
 	}
 }
 
@@ -640,11 +625,10 @@ static void cfq_remove_request(request_queue_t *q, struct request *rq)
 	struct cfq_rq *crq = RQ_DATA(rq);
 
 	if (crq) {
-		cfq_remove_merge_hints(q, crq);
 		list_del_init(&rq->queuelist);
+		cfq_del_crq_rb(crq);
+		cfq_remove_merge_hints(q, crq);
 
-		if (crq->cfq_queue)
-			cfq_del_crq_rb(crq);
 	}
 }
 
@@ -662,21 +646,15 @@ cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
 	}
 
 	__rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
-	if (__rq) {
-		BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
-
-		if (elv_rq_merge_ok(__rq, bio)) {
-			ret = ELEVATOR_BACK_MERGE;
-			goto out;
-		}
+	if (__rq && elv_rq_merge_ok(__rq, bio)) {
+		ret = ELEVATOR_BACK_MERGE;
+		goto out;
 	}
 
 	__rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
-	if (__rq) {
-		if (elv_rq_merge_ok(__rq, bio)) {
-			ret = ELEVATOR_FRONT_MERGE;
-			goto out;
-		}
+	if (__rq && elv_rq_merge_ok(__rq, bio)) {
+		ret = ELEVATOR_FRONT_MERGE;
+		goto out;
 	}
 
 	return ELEVATOR_NO_MERGE;
@@ -709,20 +687,194 @@ static void
 cfq_merged_requests(request_queue_t *q, struct request *rq,
 		    struct request *next)
 {
-	struct cfq_rq *crq = RQ_DATA(rq);
-	struct cfq_rq *cnext = RQ_DATA(next);
-
 	cfq_merged_request(q, rq);
 
-	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist)) {
-		if (time_before(cnext->queue_start, crq->queue_start)) {
-			list_move(&rq->queuelist, &next->queuelist);
-			crq->queue_start = cnext->queue_start;
+	/*
+	 * reposition in fifo if next is older than rq
+	 */
+	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
+	    time_before(next->start_time, rq->start_time))
+		list_move(&rq->queuelist, &next->queuelist);
+
+	cfq_remove_request(q, next);
+}
+
+static inline void
+__cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	if (cfqq) {
+		/*
+		 * stop potential idle class queues waiting service
+		 */
+		del_timer(&cfqd->idle_class_timer);
+
+		cfqq->slice_start = jiffies;
+		cfqq->slice_end = 0;
+		cfqq->slice_left = 0;
+		cfqq->must_alloc_slice = 0;
+		cfqq->fifo_expire = 0;
+	}
+
+	cfqd->active_queue = cfqq;
+}
+
+/*
+ * 0
+ * 0,1
+ * 0,1,2
+ * 0,1,2,3
+ * 0,1,2,3,4
+ * 0,1,2,3,4,5
+ * 0,1,2,3,4,5,6
+ * 0,1,2,3,4,5,6,7
+ */
+static int cfq_get_next_prio_level(struct cfq_data *cfqd)
+{
+	int prio, wrap;
+
+	prio = -1;
+	wrap = 0;
+	do {
+		int p;
+
+		for (p = cfqd->cur_prio; p <= cfqd->cur_end_prio; p++) {
+			if (!list_empty(&cfqd->rr_list[p])) {
+				prio = p;
+				break;
+			}
+		}
+
+		if (prio != -1)
+			break;
+		cfqd->cur_prio = 0;
+		if (++cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
+			cfqd->cur_end_prio = 0;
+			if (wrap)
+				break;
+			wrap = 1;
 		}
+	} while (1);
+
+	if (unlikely(prio == -1))
+		return -1;
+
+	BUG_ON(prio >= CFQ_PRIO_LISTS);
+
+	list_splice_init(&cfqd->rr_list[prio], &cfqd->cur_rr);
+
+	cfqd->cur_prio = prio + 1;
+	if (cfqd->cur_prio > cfqd->cur_end_prio) {
+		cfqd->cur_end_prio = cfqd->cur_prio;
+		cfqd->cur_prio = 0;
+	}
+	if (cfqd->cur_end_prio == CFQ_PRIO_LISTS) {
+		cfqd->cur_prio = 0;
+		cfqd->cur_end_prio = 0;
 	}
 
-	cfq_update_next_crq(cnext);
-	cfq_remove_request(q, next);
+	return prio;
+}
+
+static void cfq_set_active_queue(struct cfq_data *cfqd)
+{
+	struct cfq_queue *cfqq = NULL;
+
+	/*
+	 * if current list is non-empty, grab first entry. if it is empty,
+	 * get next prio level and grab first entry then if any are spliced
+	 */
+	if (!list_empty(&cfqd->cur_rr) || cfq_get_next_prio_level(cfqd) != -1)
+		cfqq = list_entry_cfqq(cfqd->cur_rr.next);
+
+	/*
+	 * if we have idle queues and no rt or be queues had pending
+	 * requests, either allow immediate service if the grace period
+	 * has passed or arm the idle grace timer
+	 */
+	if (!cfqq && !list_empty(&cfqd->idle_rr)) {
+		unsigned long end = cfqd->last_end_request + CFQ_IDLE_GRACE;
+
+		if (time_after_eq(jiffies, end))
+			cfqq = list_entry_cfqq(cfqd->idle_rr.next);
+		else
+			mod_timer(&cfqd->idle_class_timer, end);
+	}
+
+	__cfq_set_active_queue(cfqd, cfqq);
+}
+
+/*
+ * current cfqq expired its slice (or was too idle), select new one
+ */
+static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted)
+{
+	struct cfq_queue *cfqq = cfqd->active_queue;
+
+	if (cfqq) {
+		unsigned long now = jiffies;
+
+		if (cfqq->wait_request)
+			del_timer(&cfqd->idle_slice_timer);
+
+		if (!preempted && !cfqq->in_flight)
+			cfqq->service_last = now;
+
+		cfqq->must_dispatch = 0;
+		cfqq->wait_request = 0;
+
+		/*
+		 * store what was left of this slice, if the queue idled out
+		 * or was preempted
+		 */
+		if (time_after(now, cfqq->slice_end))
+			cfqq->slice_left = now - cfqq->slice_end;
+		else
+			cfqq->slice_left = 0;
+
+		if (cfqq->on_rr)
+			cfq_resort_rr_list(cfqq, preempted);
+
+		cfqd->active_queue = NULL;
+
+		if (cfqd->active_cic) {
+			put_io_context(cfqd->active_cic->ioc);
+			cfqd->active_cic = NULL;
+		}
+	}
+
+	cfqd->dispatch_slice = 0;
+}
+
+static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+
+{
+	WARN_ON(!RB_EMPTY(&cfqq->sort_list));
+	WARN_ON(cfqq != cfqd->active_queue);
+
+	/*
+	 * idle is disabled, either manually or by past process history
+	 */
+	if (!cfqd->cfq_slice_idle)
+		return 0;
+	if (!cfqq->idle_window)
+		return 0;
+	/*
+	 * task has exited, don't wait
+	 */
+	if (cfqd->active_cic && !cfqd->active_cic->ioc->task)
+		return 0;
+
+	cfqq->wait_request = 1;
+	cfqq->must_alloc = 1;
+
+	if (!timer_pending(&cfqd->idle_slice_timer)) {
+		unsigned long slice_left = cfqq->slice_end - 1;
+
+		cfqd->idle_slice_timer.expires = min(jiffies + cfqd->cfq_slice_idle, slice_left);
+		add_timer(&cfqd->idle_slice_timer);
+	}
+
+	return 1;
 }
 
 /*
@@ -738,31 +890,39 @@ static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq)
 	struct request *__rq;
 	sector_t last;
 
-	cfq_del_crq_rb(crq);
-	cfq_remove_merge_hints(q, crq);
 	list_del(&crq->request->queuelist);
 
 	last = cfqd->last_sector;
-	while ((entry = entry->prev) != head) {
-		__rq = list_entry_rq(entry);
+	list_for_each_entry_reverse(__rq, head, queuelist) {
+		struct cfq_rq *__crq = RQ_DATA(__rq);
 
-		if (blk_barrier_rq(crq->request))
+		if (blk_barrier_rq(__rq))
+			break;
+		if (!blk_fs_request(__rq))
 			break;
-		if (!blk_fs_request(crq->request))
+		if (__crq->requeued)
 			break;
 
-		if (crq->request->sector > __rq->sector)
+		if (__rq->sector <= crq->request->sector)
 			break;
 		if (__rq->sector > last && crq->request->sector < last) {
-			last = crq->request->sector;
+			last = crq->request->sector + crq->request->nr_sectors;
 			break;
 		}
+		entry = &__rq->queuelist;
 	}
 
 	cfqd->last_sector = last;
+
+	cfqq->next_crq = cfq_find_next_crq(cfqd, cfqq, crq);
+
+	cfq_del_crq_rb(crq);
+	cfq_remove_merge_hints(q, crq);
+
 	crq->in_flight = 1;
+	crq->requeued = 0;
 	cfqq->in_flight++;
-	list_add(&crq->request->queuelist, entry);
+	list_add_tail(&crq->request->queuelist, entry);
 }
 
 /*
@@ -771,105 +931,176 @@ static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq)
 static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
 {
 	struct cfq_data *cfqd = cfqq->cfqd;
-	const int reads = !list_empty(&cfqq->fifo[0]);
-	const int writes = !list_empty(&cfqq->fifo[1]);
-	unsigned long now = jiffies;
+	struct request *rq;
 	struct cfq_rq *crq;
 
-	if (time_before(now, cfqq->last_fifo_expire + cfqd->cfq_fifo_batch_expire))
+	if (cfqq->fifo_expire)
 		return NULL;
 
-	crq = RQ_DATA(list_entry(cfqq->fifo[0].next, struct request, queuelist));
-	if (reads && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_r)) {
-		cfqq->last_fifo_expire = now;
-		return crq;
-	}
+	if (!list_empty(&cfqq->fifo)) {
+		int fifo = cfq_cfqq_sync(cfqq);
 
-	crq = RQ_DATA(list_entry(cfqq->fifo[1].next, struct request, queuelist));
-	if (writes && time_after(now, crq->queue_start + cfqd->cfq_fifo_expire_w)) {
-		cfqq->last_fifo_expire = now;
-		return crq;
+		crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next));
+		rq = crq->request;
+		if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) {
+			cfqq->fifo_expire = 1;
+			return crq;
+		}
 	}
 
 	return NULL;
 }
 
 /*
- * dispatch a single request from given queue
+ * Scale schedule slice based on io priority
  */
+static inline int
+cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	const int base_slice = cfqd->cfq_slice[cfq_cfqq_sync(cfqq)];
+
+	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
+
+	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - cfqq->ioprio));
+}
+
 static inline void
-cfq_dispatch_request(request_queue_t *q, struct cfq_data *cfqd,
-		     struct cfq_queue *cfqq)
+cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
-	struct cfq_rq *crq;
+	cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies;
+}
 
-	/*
-	 * follow expired path, else get first next available
-	 */
-	if ((crq = cfq_check_fifo(cfqq)) == NULL) {
-		if (cfqd->find_best_crq)
-			crq = cfqq->next_crq;
-		else
-			crq = rb_entry_crq(rb_first(&cfqq->sort_list));
-	}
+static inline int
+cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	const int base_rq = cfqd->cfq_slice_async_rq;
 
-	cfqd->last_sector = crq->request->sector + crq->request->nr_sectors;
+	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
 
-	/*
-	 * finally, insert request into driver list
-	 */
-	cfq_dispatch_sort(q, crq);
+	return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
 }
 
-static int cfq_dispatch_requests(request_queue_t *q, int max_dispatch)
+/*
+ * get next queue for service
+ */
+static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd, int force)
 {
-	struct cfq_data *cfqd = q->elevator->elevator_data;
+	unsigned long now = jiffies;
 	struct cfq_queue *cfqq;
-	struct list_head *entry, *tmp;
-	int queued, busy_queues, first_round;
 
-	if (list_empty(&cfqd->rr_list))
-		return 0;
+	cfqq = cfqd->active_queue;
+	if (!cfqq)
+		goto new_queue;
 
-	queued = 0;
-	first_round = 1;
-restart:
-	busy_queues = 0;
-	list_for_each_safe(entry, tmp, &cfqd->rr_list) {
-		cfqq = list_entry_cfqq(entry);
+	/*
+	 * slice has expired
+	 */
+	if (!cfqq->must_dispatch && time_after(jiffies, cfqq->slice_end))
+		goto new_queue;
 
-		BUG_ON(RB_EMPTY(&cfqq->sort_list));
+	/*
+	 * if queue has requests, dispatch one. if not, check if
+	 * enough slice is left to wait for one
+	 */
+	if (!RB_EMPTY(&cfqq->sort_list))
+		goto keep_queue;
+	else if (!force && cfq_cfqq_sync(cfqq) &&
+		 time_before(now, cfqq->slice_end)) {
+		if (cfq_arm_slice_timer(cfqd, cfqq))
+			return NULL;
+	}
+
+new_queue:
+	cfq_slice_expired(cfqd, 0);
+	cfq_set_active_queue(cfqd);
+keep_queue:
+	return cfqd->active_queue;
+}
+
+static int
+__cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+			int max_dispatch)
+{
+	int dispatched = 0;
+
+	BUG_ON(RB_EMPTY(&cfqq->sort_list));
+
+	do {
+		struct cfq_rq *crq;
 
 		/*
-		 * first round of queueing, only select from queues that
-		 * don't already have io in-flight
+		 * follow expired path, else get first next available
 		 */
-		if (first_round && cfqq->in_flight)
-			continue;
+		if ((crq = cfq_check_fifo(cfqq)) == NULL)
+			crq = cfqq->next_crq;
+
+		/*
+		 * finally, insert request into driver dispatch list
+		 */
+		cfq_dispatch_sort(cfqd->queue, crq);
 
-		cfq_dispatch_request(q, cfqd, cfqq);
+		cfqd->dispatch_slice++;
+		dispatched++;
 
-		if (!RB_EMPTY(&cfqq->sort_list))
-			busy_queues++;
+		if (!cfqd->active_cic) {
+			atomic_inc(&crq->io_context->ioc->refcount);
+			cfqd->active_cic = crq->io_context;
+		}
 
-		queued++;
-	}
+		if (RB_EMPTY(&cfqq->sort_list))
+			break;
+
+	} while (dispatched < max_dispatch);
+
+	/*
+	 * if slice end isn't set yet, set it. if at least one request was
+	 * sync, use the sync time slice value
+	 */
+	if (!cfqq->slice_end)
+		cfq_set_prio_slice(cfqd, cfqq);
+
+	/*
+	 * expire an async queue immediately if it has used up its slice. idle
+	 * queue always expire after 1 dispatch round.
+	 */
+	if ((!cfq_cfqq_sync(cfqq) &&
+	    cfqd->dispatch_slice >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
+	    cfq_class_idle(cfqq))
+		cfq_slice_expired(cfqd, 0);
+
+	return dispatched;
+}
+
+static int
+cfq_dispatch_requests(request_queue_t *q, int max_dispatch, int force)
+{
+	struct cfq_data *cfqd = q->elevator->elevator_data;
+	struct cfq_queue *cfqq;
+
+	if (!cfqd->busy_queues)
+		return 0;
+
+	cfqq = cfq_select_queue(cfqd, force);
+	if (cfqq) {
+		cfqq->wait_request = 0;
+		cfqq->must_dispatch = 0;
+		del_timer(&cfqd->idle_slice_timer);
+
+		if (cfq_class_idle(cfqq))
+			max_dispatch = 1;
 
-	if ((queued < max_dispatch) && (busy_queues || first_round)) {
-		first_round = 0;
-		goto restart;
+		return __cfq_dispatch_requests(cfqd, cfqq, max_dispatch);
 	}
 
-	return queued;
+	return 0;
 }
 
 static inline void cfq_account_dispatch(struct cfq_rq *crq)
 {
 	struct cfq_queue *cfqq = crq->cfq_queue;
 	struct cfq_data *cfqd = cfqq->cfqd;
-	unsigned long now, elapsed;
 
-	if (!blk_fs_request(crq->request))
+	if (unlikely(!blk_fs_request(crq->request)))
 		return;
 
 	/*
@@ -879,65 +1110,34 @@ static inline void cfq_account_dispatch(struct cfq_rq *crq)
 	if (crq->accounted)
 		return;
 
-	now = jiffies;
-	if (cfqq->service_start == ~0UL)
-		cfqq->service_start = now;
-
-	/*
-	 * on drives with tagged command queueing, command turn-around time
-	 * doesn't necessarily reflect the time spent processing this very
-	 * command inside the drive. so do the accounting differently there,
-	 * by just sorting on the number of requests
-	 */
-	if (cfqd->cfq_tagged) {
-		if (time_after(now, cfqq->service_start + cfq_service)) {
-			cfqq->service_start = now;
-			cfqq->service_used /= 10;
-		}
-
-		cfqq->service_used++;
-		cfq_sort_rr_list(cfqq, 0);
-	}
-
-	elapsed = now - crq->queue_start;
-	if (elapsed > max_elapsed_dispatch)
-		max_elapsed_dispatch = elapsed;
-
 	crq->accounted = 1;
-	crq->service_start = now;
-
-	if (++cfqd->rq_in_driver >= CFQ_MAX_TAG && !cfqd->cfq_tagged) {
-		cfqq->cfqd->cfq_tagged = 1;
-		printk("cfq: depth %d reached, tagging now on\n", CFQ_MAX_TAG);
-	}
+	cfqd->rq_in_driver++;
 }
 
 static inline void
 cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq)
 {
 	struct cfq_data *cfqd = cfqq->cfqd;
+	unsigned long now;
 
 	if (!crq->accounted)
 		return;
 
+	now = jiffies;
+
 	WARN_ON(!cfqd->rq_in_driver);
 	cfqd->rq_in_driver--;
 
-	if (!cfqd->cfq_tagged) {
-		unsigned long now = jiffies;
-		unsigned long duration = now - crq->service_start;
+	if (!cfq_class_idle(cfqq))
+		cfqd->last_end_request = now;
 
-		if (time_after(now, cfqq->service_start + cfq_service)) {
-			cfqq->service_start = now;
-			cfqq->service_used >>= 3;
-		}
-
-		cfqq->service_used += duration;
-		cfq_sort_rr_list(cfqq, 0);
-
-		if (duration > max_elapsed_crq)
-			max_elapsed_crq = duration;
+	if (!cfqq->in_flight && cfqq->on_rr) {
+		cfqq->service_last = now;
+		cfq_resort_rr_list(cfqq, 0);
 	}
+
+	if (crq->is_sync)
+		crq->io_context->last_end_request = now;
 }
 
 static struct request *cfq_next_request(request_queue_t *q)
@@ -950,7 +1150,15 @@ static struct request *cfq_next_request(request_queue_t *q)
 dispatch:
 		rq = list_entry_rq(q->queue_head.next);
 
-		if ((crq = RQ_DATA(rq)) != NULL) {
+		crq = RQ_DATA(rq);
+		if (crq) {
+			/*
+			 * if idle window is disabled, allow queue buildup
+			 */
+			if (!crq->in_flight && !crq->cfq_queue->idle_window &&
+			    cfqd->rq_in_driver >= cfqd->cfq_max_depth)
+				return NULL;
+
 			cfq_remove_merge_hints(q, crq);
 			cfq_account_dispatch(crq);
 		}
@@ -958,7 +1166,7 @@ dispatch:
 		return rq;
 	}
 
-	if (cfq_dispatch_requests(q, cfqd->cfq_quantum))
+	if (cfq_dispatch_requests(q, cfqd->cfq_quantum, 0))
 		goto dispatch;
 
 	return NULL;
@@ -972,14 +1180,22 @@ dispatch:
  */
 static void cfq_put_queue(struct cfq_queue *cfqq)
 {
-	BUG_ON(!atomic_read(&cfqq->ref));
+	struct cfq_data *cfqd = cfqq->cfqd;
+
+	BUG_ON(atomic_read(&cfqq->ref) <= 0);
 
 	if (!atomic_dec_and_test(&cfqq->ref))
 		return;
 
 	BUG_ON(rb_first(&cfqq->sort_list));
+	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
 	BUG_ON(cfqq->on_rr);
 
+	if (unlikely(cfqd->active_queue == cfqq)) {
+		cfq_slice_expired(cfqd, 0);
+		kblockd_schedule_work(&cfqd->unplug_work);
+	}
+
 	cfq_put_cfqd(cfqq->cfqd);
 
 	/*
@@ -991,7 +1207,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 }
 
 static inline struct cfq_queue *
-__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval)
+__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, const int hashval)
 {
 	struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
 	struct hlist_node *entry, *next;
@@ -1007,94 +1223,220 @@ __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key, const int hashval)
 }
 
 static struct cfq_queue *
-cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned long key)
+cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key)
 {
 	return __cfq_find_cfq_hash(cfqd, key, hash_long(key, CFQ_QHASH_SHIFT));
 }
 
-static inline void
-cfq_rehash_cfqq(struct cfq_data *cfqd, struct cfq_queue **cfqq,
-		struct cfq_io_context *cic)
+static void cfq_free_io_context(struct cfq_io_context *cic)
 {
-	unsigned long hashkey = cfq_hash_key(cfqd, current);
-	unsigned long hashval = hash_long(hashkey, CFQ_QHASH_SHIFT);
-	struct cfq_queue *__cfqq;
-	unsigned long flags;
+	struct cfq_io_context *__cic;
+	struct list_head *entry, *next;
 
-	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
-
-	hlist_del(&(*cfqq)->cfq_hash);
-
-	__cfqq = __cfq_find_cfq_hash(cfqd, hashkey, hashval);
-	if (!__cfqq || __cfqq == *cfqq) {
-		__cfqq = *cfqq;
-		hlist_add_head(&__cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
-		__cfqq->key_type = cfqd->key_type;
-	} else {
-		atomic_inc(&__cfqq->ref);
-		cic->cfqq = __cfqq;
-		cfq_put_queue(*cfqq);
-		*cfqq = __cfqq;
+	list_for_each_safe(entry, next, &cic->list) {
+		__cic = list_entry(entry, struct cfq_io_context, list);
+		kmem_cache_free(cfq_ioc_pool, __cic);
 	}
 
-	cic->cfqq = __cfqq;
-	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
+	kmem_cache_free(cfq_ioc_pool, cic);
 }
 
-static void cfq_free_io_context(struct cfq_io_context *cic)
+/*
+ * Called with interrupts disabled
+ */
+static void cfq_exit_single_io_context(struct cfq_io_context *cic)
 {
-	kmem_cache_free(cfq_ioc_pool, cic);
+	struct cfq_data *cfqd = cic->cfqq->cfqd;
+	request_queue_t *q = cfqd->queue;
+
+	WARN_ON(!irqs_disabled());
+
+	spin_lock(q->queue_lock);
+
+	if (unlikely(cic->cfqq == cfqd->active_queue)) {
+		cfq_slice_expired(cfqd, 0);
+		kblockd_schedule_work(&cfqd->unplug_work);
+	}
+
+	cfq_put_queue(cic->cfqq);
+	cic->cfqq = NULL;
+	spin_unlock(q->queue_lock);
 }
 
 /*
- * locking hierarchy is: io_context lock -> queue locks
+ * Another task may update the task cic list, if it is doing a queue lookup
+ * on its behalf. cfq_cic_lock excludes such concurrent updates
  */
 static void cfq_exit_io_context(struct cfq_io_context *cic)
 {
-	struct cfq_queue *cfqq = cic->cfqq;
-	struct list_head *entry = &cic->list;
-	request_queue_t *q;
+	struct cfq_io_context *__cic;
+	struct list_head *entry;
 	unsigned long flags;
 
+	local_irq_save(flags);
+
 	/*
 	 * put the reference this task is holding to the various queues
 	 */
-	spin_lock_irqsave(&cic->ioc->lock, flags);
-	while ((entry = cic->list.next) != &cic->list) {
-		struct cfq_io_context *__cic;
-
+	list_for_each(entry, &cic->list) {
 		__cic = list_entry(entry, struct cfq_io_context, list);
-		list_del(entry);
-
-		q = __cic->cfqq->cfqd->queue;
-		spin_lock(q->queue_lock);
-		cfq_put_queue(__cic->cfqq);
-		spin_unlock(q->queue_lock);
+		cfq_exit_single_io_context(__cic);
 	}
 
-	q = cfqq->cfqd->queue;
-	spin_lock(q->queue_lock);
-	cfq_put_queue(cfqq);
-	spin_unlock(q->queue_lock);
-
-	cic->cfqq = NULL;
-	spin_unlock_irqrestore(&cic->ioc->lock, flags);
+	cfq_exit_single_io_context(cic);
+	local_irq_restore(flags);
 }
 
-static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags)
+static struct cfq_io_context *
+cfq_alloc_io_context(struct cfq_data *cfqd, int gfp_mask)
 {
-	struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_flags);
+	struct cfq_io_context *cic = kmem_cache_alloc(cfq_ioc_pool, gfp_mask);
 
 	if (cic) {
-		cic->dtor = cfq_free_io_context;
-		cic->exit = cfq_exit_io_context;
 		INIT_LIST_HEAD(&cic->list);
 		cic->cfqq = NULL;
+		cic->key = NULL;
+		cic->last_end_request = jiffies;
+		cic->ttime_total = 0;
+		cic->ttime_samples = 0;
+		cic->ttime_mean = 0;
+		cic->dtor = cfq_free_io_context;
+		cic->exit = cfq_exit_io_context;
 	}
 
 	return cic;
 }
 
+static void cfq_init_prio_data(struct cfq_queue *cfqq)
+{
+	struct task_struct *tsk = current;
+	int ioprio_class;
+
+	if (!cfqq->prio_changed)
+		return;
+
+	ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
+	switch (ioprio_class) {
+		default:
+			printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
+		case IOPRIO_CLASS_NONE:
+			/*
+			 * no prio set, place us in the middle of the BE classes
+			 */
+			cfqq->ioprio = task_nice_ioprio(tsk);
+			cfqq->ioprio_class = IOPRIO_CLASS_BE;
+			break;
+		case IOPRIO_CLASS_RT:
+			cfqq->ioprio = task_ioprio(tsk);
+			cfqq->ioprio_class = IOPRIO_CLASS_RT;
+			break;
+		case IOPRIO_CLASS_BE:
+			cfqq->ioprio = task_ioprio(tsk);
+			cfqq->ioprio_class = IOPRIO_CLASS_BE;
+			break;
+		case IOPRIO_CLASS_IDLE:
+			cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
+			cfqq->ioprio = 7;
+			cfqq->idle_window = 0;
+			break;
+	}
+
+	/*
+	 * keep track of original prio settings in case we have to temporarily
+	 * elevate the priority of this queue
+	 */
+	cfqq->org_ioprio = cfqq->ioprio;
+	cfqq->org_ioprio_class = cfqq->ioprio_class;
+
+	if (cfqq->on_rr)
+		cfq_resort_rr_list(cfqq, 0);
+
+	cfqq->prio_changed = 0;
+}
+
+static inline void changed_ioprio(struct cfq_queue *cfqq)
+{
+	if (cfqq) {
+		struct cfq_data *cfqd = cfqq->cfqd;
+
+		spin_lock(cfqd->queue->queue_lock);
+		cfqq->prio_changed = 1;
+		cfq_init_prio_data(cfqq);
+		spin_unlock(cfqd->queue->queue_lock);
+	}
+}
+
+/*
+ * callback from sys_ioprio_set, irqs are disabled
+ */
+static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
+{
+	struct cfq_io_context *cic = ioc->cic;
+
+	changed_ioprio(cic->cfqq);
+
+	list_for_each_entry(cic, &cic->list, list)
+		changed_ioprio(cic->cfqq);
+
+	return 0;
+}
+
+static struct cfq_queue *
+cfq_get_queue(struct cfq_data *cfqd, unsigned int key, int gfp_mask)
+{
+	const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
+	struct cfq_queue *cfqq, *new_cfqq = NULL;
+
+retry:
+	cfqq = __cfq_find_cfq_hash(cfqd, key, hashval);
+
+	if (!cfqq) {
+		if (new_cfqq) {
+			cfqq = new_cfqq;
+			new_cfqq = NULL;
+		} else if (gfp_mask & __GFP_WAIT) {
+			spin_unlock_irq(cfqd->queue->queue_lock);
+			new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
+			spin_lock_irq(cfqd->queue->queue_lock);
+			goto retry;
+		} else {
+			cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
+			if (!cfqq)
+				goto out;
+		}
+
+		memset(cfqq, 0, sizeof(*cfqq));
+
+		INIT_HLIST_NODE(&cfqq->cfq_hash);
+		INIT_LIST_HEAD(&cfqq->cfq_list);
+		RB_CLEAR_ROOT(&cfqq->sort_list);
+		INIT_LIST_HEAD(&cfqq->fifo);
+
+		cfqq->key = key;
+		hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
+		atomic_set(&cfqq->ref, 0);
+		cfqq->cfqd = cfqd;
+		atomic_inc(&cfqd->ref);
+		cfqq->service_last = 0;
+		/*
+		 * set ->slice_left to allow preemption for a new process
+		 */
+		cfqq->slice_left = 2 * cfqd->cfq_slice_idle;
+		cfqq->idle_window = 1;
+		cfqq->ioprio = -1;
+		cfqq->ioprio_class = -1;
+		cfqq->prio_changed = 1;
+	}
+
+	if (new_cfqq)
+		kmem_cache_free(cfq_pool, new_cfqq);
+
+	atomic_inc(&cfqq->ref);
+out:
+	WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
+	return cfqq;
+}
+
 /*
  * Setup general io context and cfq io context. There can be several cfq
  * io contexts per general io context, if this process is doing io to more
@@ -1102,39 +1444,39 @@ static struct cfq_io_context *cfq_alloc_io_context(int gfp_flags)
  * cfqq, so we don't need to worry about it disappearing
  */
 static struct cfq_io_context *
-cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags)
+cfq_get_io_context(struct cfq_data *cfqd, pid_t pid, int gfp_mask)
 {
-	struct cfq_data *cfqd = (*cfqq)->cfqd;
-	struct cfq_queue *__cfqq = *cfqq;
+	struct io_context *ioc = NULL;
 	struct cfq_io_context *cic;
-	struct io_context *ioc;
 
-	might_sleep_if(gfp_flags & __GFP_WAIT);
+	might_sleep_if(gfp_mask & __GFP_WAIT);
 
-	ioc = get_io_context(gfp_flags);
+	ioc = get_io_context(gfp_mask);
 	if (!ioc)
 		return NULL;
 
 	if ((cic = ioc->cic) == NULL) {
-		cic = cfq_alloc_io_context(gfp_flags);
+		cic = cfq_alloc_io_context(cfqd, gfp_mask);
 
 		if (cic == NULL)
 			goto err;
 
+		/*
+		 * manually increment generic io_context usage count, it
+		 * cannot go away since we are already holding one ref to it
+		 */
 		ioc->cic = cic;
+		ioc->set_ioprio = cfq_ioc_set_ioprio;
 		cic->ioc = ioc;
-		cic->cfqq = __cfqq;
-		atomic_inc(&__cfqq->ref);
+		cic->key = cfqd;
+		atomic_inc(&cfqd->ref);
 	} else {
 		struct cfq_io_context *__cic;
-		unsigned long flags;
 
 		/*
-		 * since the first cic on the list is actually the head
-		 * itself, need to check this here or we'll duplicate an
-		 * cic per ioc for no reason
+		 * the first cic on the list is actually the head itself
 		 */
-		if (cic->cfqq == __cfqq)
+		if (cic->key == cfqd)
 			goto out;
 
 		/*
@@ -1142,152 +1484,259 @@ cfq_get_io_context(struct cfq_queue **cfqq, int gfp_flags)
 		 * should be ok here, the list will usually not be more than
 		 * 1 or a few entries long
 		 */
-		spin_lock_irqsave(&ioc->lock, flags);
 		list_for_each_entry(__cic, &cic->list, list) {
 			/*
 			 * this process is already holding a reference to
 			 * this queue, so no need to get one more
 			 */
-			if (__cic->cfqq == __cfqq) {
+			if (__cic->key == cfqd) {
 				cic = __cic;
-				spin_unlock_irqrestore(&ioc->lock, flags);
 				goto out;
 			}
 		}
-		spin_unlock_irqrestore(&ioc->lock, flags);
 
 		/*
 		 * nope, process doesn't have a cic assoicated with this
 		 * cfqq yet. get a new one and add to list
 		 */
-		__cic = cfq_alloc_io_context(gfp_flags);
+		__cic = cfq_alloc_io_context(cfqd, gfp_mask);
 		if (__cic == NULL)
 			goto err;
 
 		__cic->ioc = ioc;
-		__cic->cfqq = __cfqq;
-		atomic_inc(&__cfqq->ref);
-		spin_lock_irqsave(&ioc->lock, flags);
+		__cic->key = cfqd;
+		atomic_inc(&cfqd->ref);
 		list_add(&__cic->list, &cic->list);
-		spin_unlock_irqrestore(&ioc->lock, flags);
-
 		cic = __cic;
-		*cfqq = __cfqq;
 	}
 
 out:
-	/*
-	 * if key_type has been changed on the fly, we lazily rehash
-	 * each queue at lookup time
-	 */
-	if ((*cfqq)->key_type != cfqd->key_type)
-		cfq_rehash_cfqq(cfqd, cfqq, cic);
-
 	return cic;
 err:
 	put_io_context(ioc);
 	return NULL;
 }
 
-static struct cfq_queue *
-__cfq_get_queue(struct cfq_data *cfqd, unsigned long key, int gfp_mask)
+static void
+cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
 {
-	const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
-	struct cfq_queue *cfqq, *new_cfqq = NULL;
+	unsigned long elapsed, ttime;
 
-retry:
-	cfqq = __cfq_find_cfq_hash(cfqd, key, hashval);
+	/*
+	 * if this context already has stuff queued, thinktime is from
+	 * last queue not last end
+	 */
+#if 0
+	if (time_after(cic->last_end_request, cic->last_queue))
+		elapsed = jiffies - cic->last_end_request;
+	else
+		elapsed = jiffies - cic->last_queue;
+#else
+		elapsed = jiffies - cic->last_end_request;
+#endif
 
-	if (!cfqq) {
-		if (new_cfqq) {
-			cfqq = new_cfqq;
-			new_cfqq = NULL;
-		} else {
-			spin_unlock_irq(cfqd->queue->queue_lock);
-			new_cfqq = kmem_cache_alloc(cfq_pool, gfp_mask);
-			spin_lock_irq(cfqd->queue->queue_lock);
+	ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
 
-			if (!new_cfqq && !(gfp_mask & __GFP_WAIT))
-				goto out;
-
-			goto retry;
-		}
+	cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
+	cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
+	cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
+}
 
-		memset(cfqq, 0, sizeof(*cfqq));
+#define sample_valid(samples)	((samples) > 80)
 
-		INIT_HLIST_NODE(&cfqq->cfq_hash);
-		INIT_LIST_HEAD(&cfqq->cfq_list);
-		RB_CLEAR_ROOT(&cfqq->sort_list);
-		INIT_LIST_HEAD(&cfqq->fifo[0]);
-		INIT_LIST_HEAD(&cfqq->fifo[1]);
+/*
+ * Disable idle window if the process thinks too long or seeks so much that
+ * it doesn't matter
+ */
+static void
+cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+		       struct cfq_io_context *cic)
+{
+	int enable_idle = cfqq->idle_window;
 
-		cfqq->key = key;
-		hlist_add_head(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
-		atomic_set(&cfqq->ref, 0);
-		cfqq->cfqd = cfqd;
-		atomic_inc(&cfqd->ref);
-		cfqq->key_type = cfqd->key_type;
-		cfqq->service_start = ~0UL;
+	if (!cic->ioc->task || !cfqd->cfq_slice_idle)
+		enable_idle = 0;
+	else if (sample_valid(cic->ttime_samples)) {
+		if (cic->ttime_mean > cfqd->cfq_slice_idle)
+			enable_idle = 0;
+		else
+			enable_idle = 1;
 	}
 
-	if (new_cfqq)
-		kmem_cache_free(cfq_pool, new_cfqq);
+	cfqq->idle_window = enable_idle;
+}
 
-	atomic_inc(&cfqq->ref);
-out:
-	WARN_ON((gfp_mask & __GFP_WAIT) && !cfqq);
-	return cfqq;
+
+/*
+ * Check if new_cfqq should preempt the currently active queue. Return 0 for
+ * no (or if we aren't sure); returning 1 will cause a preempt.
+ */
+static int
+cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
+		   struct cfq_rq *crq)
+{
+	struct cfq_queue *cfqq = cfqd->active_queue;
+
+	if (cfq_class_idle(new_cfqq))
+		return 0;
+
+	if (!cfqq)
+		return 1;
+
+	if (cfq_class_idle(cfqq))
+		return 1;
+	if (!new_cfqq->wait_request)
+		return 0;
+	/*
+	 * if it doesn't have slice left, forget it
+	 */
+	if (new_cfqq->slice_left < cfqd->cfq_slice_idle)
+		return 0;
+	if (crq->is_sync && !cfq_cfqq_sync(cfqq))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * cfqq preempts the active queue. if we allowed preempt with no slice left,
+ * let it have half of its nominal slice.
+ */
+static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	struct cfq_queue *__cfqq, *next;
+
+	list_for_each_entry_safe(__cfqq, next, &cfqd->cur_rr, cfq_list)
+		cfq_resort_rr_list(__cfqq, 1);
+
+	if (!cfqq->slice_left)
+		cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2;
+
+	cfqq->slice_end = cfqq->slice_left + jiffies;
+	cfq_slice_expired(cfqd, 1);
+	__cfq_set_active_queue(cfqd, cfqq);
+}
+
+/*
+ * should really be a ll_rw_blk.c helper
+ */
+static void cfq_start_queueing(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	request_queue_t *q = cfqd->queue;
+
+	if (!blk_queue_plugged(q))
+		q->request_fn(q);
+	else
+		__generic_unplug_device(q);
+}
+
+/*
+ * Called when a new fs request (crq) is added (to cfqq). Check if there's
+ * something we should do about it
+ */
+static void
+cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+		 struct cfq_rq *crq)
+{
+	const int sync = crq->is_sync;
+
+	cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
+
+	if (sync) {
+		struct cfq_io_context *cic = crq->io_context;
+
+		cfq_update_io_thinktime(cfqd, cic);
+		cfq_update_idle_window(cfqd, cfqq, cic);
+
+		cic->last_queue = jiffies;
+	}
+
+	if (cfqq == cfqd->active_queue) {
+		/*
+		 * if we are waiting for a request for this queue, let it rip
+		 * immediately and flag that we must not expire this queue
+		 * just now
+		 */
+		if (cfqq->wait_request) {
+			cfqq->must_dispatch = 1;
+			del_timer(&cfqd->idle_slice_timer);
+			cfq_start_queueing(cfqd, cfqq);
+		}
+	} else if (cfq_should_preempt(cfqd, cfqq, crq)) {
+		/*
+		 * not the active queue - expire current slice if it is
+		 * idle and has expired its mean thinktime or this new queue
+		 * has some old slice time left and is of higher priority
+		 */
+		cfq_preempt_queue(cfqd, cfqq);
+		cfqq->must_dispatch = 1;
+		cfq_start_queueing(cfqd, cfqq);
+	}
 }
 
-static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq)
+static void cfq_enqueue(struct cfq_data *cfqd, struct request *rq)
 {
-	crq->is_sync = 0;
-	if (rq_data_dir(crq->request) == READ || current->flags & PF_SYNCWRITE)
-		crq->is_sync = 1;
+	struct cfq_rq *crq = RQ_DATA(rq);
+	struct cfq_queue *cfqq = crq->cfq_queue;
+
+	cfq_init_prio_data(cfqq);
 
 	cfq_add_crq_rb(crq);
-	crq->queue_start = jiffies;
 
-	list_add_tail(&crq->request->queuelist, &crq->cfq_queue->fifo[crq->is_sync]);
+	list_add_tail(&rq->queuelist, &cfqq->fifo);
+
+	if (rq_mergeable(rq)) {
+		cfq_add_crq_hash(cfqd, crq);
+
+		if (!cfqd->queue->last_merge)
+			cfqd->queue->last_merge = rq;
+	}
+
+	cfq_crq_enqueued(cfqd, cfqq, crq);
 }
 
 static void
 cfq_insert_request(request_queue_t *q, struct request *rq, int where)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_rq *crq = RQ_DATA(rq);
 
 	switch (where) {
 		case ELEVATOR_INSERT_BACK:
-			while (cfq_dispatch_requests(q, cfqd->cfq_quantum))
+			while (cfq_dispatch_requests(q, INT_MAX, 1))
 				;
 			list_add_tail(&rq->queuelist, &q->queue_head);
+			/*
+			 * If we were idling with pending requests on
+			 * inactive cfqqs, force dispatching will
+			 * remove the idle timer and the queue won't
+			 * be kicked by __make_request() afterward.
+			 * Kick it here.
+			 */
+			kblockd_schedule_work(&cfqd->unplug_work);
 			break;
 		case ELEVATOR_INSERT_FRONT:
 			list_add(&rq->queuelist, &q->queue_head);
 			break;
 		case ELEVATOR_INSERT_SORT:
 			BUG_ON(!blk_fs_request(rq));
-			cfq_enqueue(cfqd, crq);
+			cfq_enqueue(cfqd, rq);
 			break;
 		default:
 			printk("%s: bad insert point %d\n", __FUNCTION__,where);
 			return;
 	}
+}
 
-	if (rq_mergeable(rq)) {
-		cfq_add_crq_hash(cfqd, crq);
-
-		if (!q->last_merge)
-			q->last_merge = rq;
-	}
+static inline int cfq_pending_requests(struct cfq_data *cfqd)
+{
+	return !list_empty(&cfqd->queue->queue_head) || cfqd->busy_queues;
 }
 
 static int cfq_queue_empty(request_queue_t *q)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 
-	return list_empty(&q->queue_head) && list_empty(&cfqd->rr_list);
+	return !cfq_pending_requests(cfqd);
 }
 
 static void cfq_completed_request(request_queue_t *q, struct request *rq)
@@ -1332,51 +1781,132 @@ cfq_latter_request(request_queue_t *q, struct request *rq)
 	return NULL;
 }
 
-static int cfq_may_queue(request_queue_t *q, int rw)
+/*
+ * we temporarily boost lower priority queues if they are holding fs exclusive
+ * resources. they are boosted to normal prio (CLASS_BE/4)
+ */
+static void cfq_prio_boost(struct cfq_queue *cfqq)
 {
-	struct cfq_data *cfqd = q->elevator->elevator_data;
-	struct cfq_queue *cfqq;
-	int ret = ELV_MQUEUE_MAY;
+	const int ioprio_class = cfqq->ioprio_class;
+	const int ioprio = cfqq->ioprio;
 
-	if (current->flags & PF_MEMALLOC)
-		return ELV_MQUEUE_MAY;
+	if (has_fs_excl()) {
+		/*
+		 * boost idle prio on transactions that would lock out other
+		 * users of the filesystem
+		 */
+		if (cfq_class_idle(cfqq))
+			cfqq->ioprio_class = IOPRIO_CLASS_BE;
+		if (cfqq->ioprio > IOPRIO_NORM)
+			cfqq->ioprio = IOPRIO_NORM;
+	} else {
+		/*
+		 * check if we need to unboost the queue
+		 */
+		if (cfqq->ioprio_class != cfqq->org_ioprio_class)
+			cfqq->ioprio_class = cfqq->org_ioprio_class;
+		if (cfqq->ioprio != cfqq->org_ioprio)
+			cfqq->ioprio = cfqq->org_ioprio;
+	}
 
-	cfqq = cfq_find_cfq_hash(cfqd, cfq_hash_key(cfqd, current));
-	if (cfqq) {
-		int limit = cfqd->max_queued;
+	/*
+	 * refile between round-robin lists if we moved the priority class
+	 */
+	if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) &&
+	    cfqq->on_rr)
+		cfq_resort_rr_list(cfqq, 0);
+}
 
-		if (cfqq->allocated[rw] < cfqd->cfq_queued)
-			return ELV_MQUEUE_MUST;
+static inline pid_t cfq_queue_pid(struct task_struct *task, int rw)
+{
+	if (rw == READ || process_sync(task))
+		return task->pid;
 
-		if (cfqd->busy_queues)
-			limit = q->nr_requests / cfqd->busy_queues;
+	return CFQ_KEY_ASYNC;
+}
 
-		if (limit < cfqd->cfq_queued)
-			limit = cfqd->cfq_queued;
-		else if (limit > cfqd->max_queued)
-			limit = cfqd->max_queued;
+static inline int
+__cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+		struct task_struct *task, int rw)
+{
+	if (cfqq->wait_request && cfqq->must_alloc)
+		return ELV_MQUEUE_MUST;
 
-		if (cfqq->allocated[rw] >= limit) {
-			if (limit > cfqq->alloc_limit[rw])
-				cfqq->alloc_limit[rw] = limit;
+	return ELV_MQUEUE_MAY;
+#if 0
+	if (!cfqq || task->flags & PF_MEMALLOC)
+		return ELV_MQUEUE_MAY;
+	if (!cfqq->allocated[rw] || cfqq->must_alloc) {
+		if (cfqq->wait_request)
+			return ELV_MQUEUE_MUST;
 
-			ret = ELV_MQUEUE_NO;
+		/*
+		 * only allow 1 ELV_MQUEUE_MUST per slice, otherwise we
+		 * can quickly flood the queue with writes from a single task
+		 */
+		if (rw == READ || !cfqq->must_alloc_slice) {
+			cfqq->must_alloc_slice = 1;
+			return ELV_MQUEUE_MUST;
 		}
+
+		return ELV_MQUEUE_MAY;
 	}
+	if (cfq_class_idle(cfqq))
+		return ELV_MQUEUE_NO;
+	if (cfqq->allocated[rw] >= cfqd->max_queued) {
+		struct io_context *ioc = get_io_context(GFP_ATOMIC);
+		int ret = ELV_MQUEUE_NO;
 
-	return ret;
+		if (ioc && ioc->nr_batch_requests)
+			ret = ELV_MQUEUE_MAY;
+
+		put_io_context(ioc);
+		return ret;
+	}
+
+	return ELV_MQUEUE_MAY;
+#endif
+}
+
+static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio)
+{
+	struct cfq_data *cfqd = q->elevator->elevator_data;
+	struct task_struct *tsk = current;
+	struct cfq_queue *cfqq;
+
+	/*
+	 * don't force setup of a queue from here, as a call to may_queue
+	 * does not necessarily imply that a request actually will be queued.
+	 * so just look up a possibly existing queue, or return 'may queue'
+	 * if that fails
+	 */
+	cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw));
+	if (cfqq) {
+		cfq_init_prio_data(cfqq);
+		cfq_prio_boost(cfqq);
+
+		return __cfq_may_queue(cfqd, cfqq, tsk, rw);
+	}
+
+	return ELV_MQUEUE_MAY;
 }
 
 static void cfq_check_waiters(request_queue_t *q, struct cfq_queue *cfqq)
 {
+	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct request_list *rl = &q->rq;
-	const int write = waitqueue_active(&rl->wait[WRITE]);
-	const int read = waitqueue_active(&rl->wait[READ]);
 
-	if (read && cfqq->allocated[READ] < cfqq->alloc_limit[READ])
-		wake_up(&rl->wait[READ]);
-	if (write && cfqq->allocated[WRITE] < cfqq->alloc_limit[WRITE])
-		wake_up(&rl->wait[WRITE]);
+	if (cfqq->allocated[READ] <= cfqd->max_queued || cfqd->rq_starved) {
+		smp_mb();
+		if (waitqueue_active(&rl->wait[READ]))
+			wake_up(&rl->wait[READ]);
+	}
+
+	if (cfqq->allocated[WRITE] <= cfqd->max_queued || cfqd->rq_starved) {
+		smp_mb();
+		if (waitqueue_active(&rl->wait[WRITE]))
+			wake_up(&rl->wait[WRITE]);
+	}
 }
 
 /*
@@ -1389,69 +1919,59 @@ static void cfq_put_request(request_queue_t *q, struct request *rq)
 
 	if (crq) {
 		struct cfq_queue *cfqq = crq->cfq_queue;
+		const int rw = rq_data_dir(rq);
 
-		BUG_ON(q->last_merge == rq);
-		BUG_ON(!hlist_unhashed(&crq->hash));
-
-		if (crq->io_context)
-			put_io_context(crq->io_context->ioc);
+		BUG_ON(!cfqq->allocated[rw]);
+		cfqq->allocated[rw]--;
 
-		BUG_ON(!cfqq->allocated[crq->is_write]);
-		cfqq->allocated[crq->is_write]--;
+		put_io_context(crq->io_context->ioc);
 
 		mempool_free(crq, cfqd->crq_pool);
 		rq->elevator_private = NULL;
 
-		smp_mb();
 		cfq_check_waiters(q, cfqq);
 		cfq_put_queue(cfqq);
 	}
 }
 
 /*
- * Allocate cfq data structures associated with this request. A queue and
+ * Allocate cfq data structures associated with this request.
  */
-static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
+static int
+cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
+		int gfp_mask)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_io_context *cic;
 	const int rw = rq_data_dir(rq);
-	struct cfq_queue *cfqq, *saved_cfqq;
+	struct cfq_queue *cfqq;
 	struct cfq_rq *crq;
 	unsigned long flags;
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
+	cic = cfq_get_io_context(cfqd, cfq_queue_pid(current, rw), gfp_mask);
+
 	spin_lock_irqsave(q->queue_lock, flags);
 
-	cfqq = __cfq_get_queue(cfqd, cfq_hash_key(cfqd, current), gfp_mask);
-	if (!cfqq)
-		goto out_lock;
+	if (!cic)
+		goto queue_fail;
+
+	if (!cic->cfqq) {
+		cfqq = cfq_get_queue(cfqd, current->pid, gfp_mask);
+		if (!cfqq)
+			goto queue_fail;
 
-repeat:
-	if (cfqq->allocated[rw] >= cfqd->max_queued)
-		goto out_lock;
+		cic->cfqq = cfqq;
+	} else
+		cfqq = cic->cfqq;
 
 	cfqq->allocated[rw]++;
+	cfqq->must_alloc = 0;
+	cfqd->rq_starved = 0;
+	atomic_inc(&cfqq->ref);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
-	/*
-	 * if hashing type has changed, the cfq_queue might change here.
-	 */
-	saved_cfqq = cfqq;
-	cic = cfq_get_io_context(&cfqq, gfp_mask);
-	if (!cic)
-		goto err;
-
-	/*
-	 * repeat allocation checks on queue change
-	 */
-	if (unlikely(saved_cfqq != cfqq)) {
-		spin_lock_irqsave(q->queue_lock, flags);
-		saved_cfqq->allocated[rw]--;
-		goto repeat;
-	}
-
 	crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
 	if (crq) {
 		RB_CLEAR(&crq->rb_node);
@@ -1460,24 +1980,130 @@ repeat:
 		INIT_HLIST_NODE(&crq->hash);
 		crq->cfq_queue = cfqq;
 		crq->io_context = cic;
-		crq->service_start = crq->queue_start = 0;
-		crq->in_flight = crq->accounted = crq->is_sync = 0;
-		crq->is_write = rw;
+		crq->in_flight = crq->accounted = 0;
+		crq->is_sync = (rw == READ || process_sync(current));
+		crq->requeued = 0;
 		rq->elevator_private = crq;
-		cfqq->alloc_limit[rw] = 0;
 		return 0;
 	}
 
-	put_io_context(cic->ioc);
-err:
 	spin_lock_irqsave(q->queue_lock, flags);
 	cfqq->allocated[rw]--;
+	if (!(cfqq->allocated[0] + cfqq->allocated[1]))
+		cfqq->must_alloc = 1;
 	cfq_put_queue(cfqq);
-out_lock:
+queue_fail:
+	if (cic)
+		put_io_context(cic->ioc);
+	/*
+	 * mark us rq allocation starved. we need to kickstart the process
+	 * ourselves if there are no pending requests that can do it for us.
+	 * that would be an extremely rare OOM situation
+	 */
+	cfqd->rq_starved = 1;
+	kblockd_schedule_work(&cfqd->unplug_work);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 	return 1;
 }
 
+static void cfq_kick_queue(void *data)
+{
+	request_queue_t *q = data;
+	struct cfq_data *cfqd = q->elevator->elevator_data;
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	if (cfqd->rq_starved) {
+		struct request_list *rl = &q->rq;
+
+		/*
+		 * we aren't guaranteed to get a request after this, but we
+		 * have to be opportunistic
+		 */
+		smp_mb();
+		if (waitqueue_active(&rl->wait[READ]))
+			wake_up(&rl->wait[READ]);
+		if (waitqueue_active(&rl->wait[WRITE]))
+			wake_up(&rl->wait[WRITE]);
+	}
+
+	blk_remove_plug(q);
+	q->request_fn(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+/*
+ * Timer running if the active_queue is currently idling inside its time slice
+ */
+static void cfq_idle_slice_timer(unsigned long data)
+{
+	struct cfq_data *cfqd = (struct cfq_data *) data;
+	struct cfq_queue *cfqq;
+	unsigned long flags;
+
+	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
+
+	if ((cfqq = cfqd->active_queue) != NULL) {
+		unsigned long now = jiffies;
+
+		/*
+		 * expired
+		 */
+		if (time_after(now, cfqq->slice_end))
+			goto expire;
+
+		/*
+		 * only expire and reinvoke request handler, if there are
+		 * other queues with pending requests
+		 */
+		if (!cfq_pending_requests(cfqd)) {
+			cfqd->idle_slice_timer.expires = min(now + cfqd->cfq_slice_idle, cfqq->slice_end);
+			add_timer(&cfqd->idle_slice_timer);
+			goto out_cont;
+		}
+
+		/*
+		 * not expired and it has a request pending, let it dispatch
+		 */
+		if (!RB_EMPTY(&cfqq->sort_list)) {
+			cfqq->must_dispatch = 1;
+			goto out_kick;
+		}
+	}
+expire:
+	cfq_slice_expired(cfqd, 0);
+out_kick:
+	if (cfq_pending_requests(cfqd))
+		kblockd_schedule_work(&cfqd->unplug_work);
+out_cont:
+	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
+}
+
+/*
+ * Timer running if an idle class queue is waiting for service
+ */
+static void cfq_idle_class_timer(unsigned long data)
+{
+	struct cfq_data *cfqd = (struct cfq_data *) data;
+	unsigned long flags, end;
+
+	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
+
+	/*
+	 * race with a non-idle queue, reset timer
+	 */
+	end = cfqd->last_end_request + CFQ_IDLE_GRACE;
+	if (!time_after_eq(jiffies, end)) {
+		cfqd->idle_class_timer.expires = end;
+		add_timer(&cfqd->idle_class_timer);
+	} else
+		kblockd_schedule_work(&cfqd->unplug_work);
+
+	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
+}
+
+
 static void cfq_put_cfqd(struct cfq_data *cfqd)
 {
 	request_queue_t *q = cfqd->queue;
@@ -1485,6 +2111,8 @@ static void cfq_put_cfqd(struct cfq_data *cfqd)
 	if (!atomic_dec_and_test(&cfqd->ref))
 		return;
 
+	blk_sync_queue(q);
+
 	blk_put_queue(q);
 
 	mempool_destroy(cfqd->crq_pool);
@@ -1495,7 +2123,11 @@ static void cfq_put_cfqd(struct cfq_data *cfqd)
 
 static void cfq_exit_queue(elevator_t *e)
 {
-	cfq_put_cfqd(e->elevator_data);
+	struct cfq_data *cfqd = e->elevator_data;
+
+	del_timer_sync(&cfqd->idle_slice_timer);
+	del_timer_sync(&cfqd->idle_class_timer);
+	cfq_put_cfqd(cfqd);
 }
 
 static int cfq_init_queue(request_queue_t *q, elevator_t *e)
@@ -1508,7 +2140,13 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
 		return -ENOMEM;
 
 	memset(cfqd, 0, sizeof(*cfqd));
-	INIT_LIST_HEAD(&cfqd->rr_list);
+
+	for (i = 0; i < CFQ_PRIO_LISTS; i++)
+		INIT_LIST_HEAD(&cfqd->rr_list[i]);
+
+	INIT_LIST_HEAD(&cfqd->busy_rr);
+	INIT_LIST_HEAD(&cfqd->cur_rr);
+	INIT_LIST_HEAD(&cfqd->idle_rr);
 	INIT_LIST_HEAD(&cfqd->empty_list);
 
 	cfqd->crq_hash = kmalloc(sizeof(struct hlist_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
@@ -1533,25 +2171,32 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
 	cfqd->queue = q;
 	atomic_inc(&q->refcnt);
 
-	/*
-	 * just set it to some high value, we want anyone to be able to queue
-	 * some requests. fairness is handled differently
-	 */
-	q->nr_requests = 1024;
-	cfqd->max_queued = q->nr_requests / 16;
+	cfqd->max_queued = q->nr_requests / 4;
 	q->nr_batching = cfq_queued;
-	cfqd->key_type = CFQ_KEY_TGID;
-	cfqd->find_best_crq = 1;
+
+	init_timer(&cfqd->idle_slice_timer);
+	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
+	cfqd->idle_slice_timer.data = (unsigned long) cfqd;
+
+	init_timer(&cfqd->idle_class_timer);
+	cfqd->idle_class_timer.function = cfq_idle_class_timer;
+	cfqd->idle_class_timer.data = (unsigned long) cfqd;
+
+	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue, q);
+
 	atomic_set(&cfqd->ref, 1);
 
 	cfqd->cfq_queued = cfq_queued;
 	cfqd->cfq_quantum = cfq_quantum;
-	cfqd->cfq_fifo_expire_r = cfq_fifo_expire_r;
-	cfqd->cfq_fifo_expire_w = cfq_fifo_expire_w;
-	cfqd->cfq_fifo_batch_expire = cfq_fifo_rate;
+	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
+	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
 	cfqd->cfq_back_max = cfq_back_max;
 	cfqd->cfq_back_penalty = cfq_back_penalty;
-
+	cfqd->cfq_slice[0] = cfq_slice_async;
+	cfqd->cfq_slice[1] = cfq_slice_sync;
+	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
+	cfqd->cfq_slice_idle = cfq_slice_idle;
+	cfqd->cfq_max_depth = cfq_max_depth;
 	return 0;
 out_crqpool:
 	kfree(cfqd->cfq_hash);
@@ -1595,7 +2240,6 @@ fail:
 	return -ENOMEM;
 }
 
-
 /*
  * sysfs parts below -->
  */
@@ -1620,45 +2264,6 @@ cfq_var_store(unsigned int *var, const char *page, size_t count)
 	return count;
 }
 
-static ssize_t
-cfq_clear_elapsed(struct cfq_data *cfqd, const char *page, size_t count)
-{
-	max_elapsed_dispatch = max_elapsed_crq = 0;
-	return count;
-}
-
-static ssize_t
-cfq_set_key_type(struct cfq_data *cfqd, const char *page, size_t count)
-{
-	spin_lock_irq(cfqd->queue->queue_lock);
-	if (!strncmp(page, "pgid", 4))
-		cfqd->key_type = CFQ_KEY_PGID;
-	else if (!strncmp(page, "tgid", 4))
-		cfqd->key_type = CFQ_KEY_TGID;
-	else if (!strncmp(page, "uid", 3))
-		cfqd->key_type = CFQ_KEY_UID;
-	else if (!strncmp(page, "gid", 3))
-		cfqd->key_type = CFQ_KEY_GID;
-	spin_unlock_irq(cfqd->queue->queue_lock);
-	return count;
-}
-
-static ssize_t
-cfq_read_key_type(struct cfq_data *cfqd, char *page)
-{
-	ssize_t len = 0;
-	int i;
-
-	for (i = CFQ_KEY_PGID; i < CFQ_KEY_LAST; i++) {
-		if (cfqd->key_type == i)
-			len += sprintf(page+len, "[%s] ", cfq_key_types[i]);
-		else
-			len += sprintf(page+len, "%s ", cfq_key_types[i]);
-	}
-	len += sprintf(page+len, "\n");
-	return len;
-}
-
 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
 static ssize_t __FUNC(struct cfq_data *cfqd, char *page)		\
 {									\
@@ -1669,12 +2274,15 @@ static ssize_t __FUNC(struct cfq_data *cfqd, char *page)		\
 }
 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
 SHOW_FUNCTION(cfq_queued_show, cfqd->cfq_queued, 0);
-SHOW_FUNCTION(cfq_fifo_expire_r_show, cfqd->cfq_fifo_expire_r, 1);
-SHOW_FUNCTION(cfq_fifo_expire_w_show, cfqd->cfq_fifo_expire_w, 1);
-SHOW_FUNCTION(cfq_fifo_batch_expire_show, cfqd->cfq_fifo_batch_expire, 1);
-SHOW_FUNCTION(cfq_find_best_show, cfqd->find_best_crq, 0);
+SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
+SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
 SHOW_FUNCTION(cfq_back_max_show, cfqd->cfq_back_max, 0);
 SHOW_FUNCTION(cfq_back_penalty_show, cfqd->cfq_back_penalty, 0);
+SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
+SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
+SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
+SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
+SHOW_FUNCTION(cfq_max_depth_show, cfqd->cfq_max_depth, 0);
 #undef SHOW_FUNCTION
 
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
@@ -1694,12 +2302,15 @@ static ssize_t __FUNC(struct cfq_data *cfqd, const char *page, size_t count)	\
 }
 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
 STORE_FUNCTION(cfq_queued_store, &cfqd->cfq_queued, 1, UINT_MAX, 0);
-STORE_FUNCTION(cfq_fifo_expire_r_store, &cfqd->cfq_fifo_expire_r, 1, UINT_MAX, 1);
-STORE_FUNCTION(cfq_fifo_expire_w_store, &cfqd->cfq_fifo_expire_w, 1, UINT_MAX, 1);
-STORE_FUNCTION(cfq_fifo_batch_expire_store, &cfqd->cfq_fifo_batch_expire, 0, UINT_MAX, 1);
-STORE_FUNCTION(cfq_find_best_store, &cfqd->find_best_crq, 0, 1, 0);
+STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1);
+STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_back_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
 STORE_FUNCTION(cfq_back_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0);
+STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
+STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
+STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
+STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0);
+STORE_FUNCTION(cfq_max_depth_store, &cfqd->cfq_max_depth, 1, UINT_MAX, 0);
 #undef STORE_FUNCTION
 
 static struct cfq_fs_entry cfq_quantum_entry = {
@@ -1712,25 +2323,15 @@ static struct cfq_fs_entry cfq_queued_entry = {
 	.show = cfq_queued_show,
 	.store = cfq_queued_store,
 };
-static struct cfq_fs_entry cfq_fifo_expire_r_entry = {
+static struct cfq_fs_entry cfq_fifo_expire_sync_entry = {
 	.attr = {.name = "fifo_expire_sync", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_fifo_expire_r_show,
-	.store = cfq_fifo_expire_r_store,
+	.show = cfq_fifo_expire_sync_show,
+	.store = cfq_fifo_expire_sync_store,
 };
-static struct cfq_fs_entry cfq_fifo_expire_w_entry = {
+static struct cfq_fs_entry cfq_fifo_expire_async_entry = {
 	.attr = {.name = "fifo_expire_async", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_fifo_expire_w_show,
-	.store = cfq_fifo_expire_w_store,
-};
-static struct cfq_fs_entry cfq_fifo_batch_expire_entry = {
-	.attr = {.name = "fifo_batch_expire", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_fifo_batch_expire_show,
-	.store = cfq_fifo_batch_expire_store,
-};
-static struct cfq_fs_entry cfq_find_best_entry = {
-	.attr = {.name = "find_best_crq", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_find_best_show,
-	.store = cfq_find_best_store,
+	.show = cfq_fifo_expire_async_show,
+	.store = cfq_fifo_expire_async_store,
 };
 static struct cfq_fs_entry cfq_back_max_entry = {
 	.attr = {.name = "back_seek_max", .mode = S_IRUGO | S_IWUSR },
@@ -1742,27 +2343,43 @@ static struct cfq_fs_entry cfq_back_penalty_entry = {
 	.show = cfq_back_penalty_show,
 	.store = cfq_back_penalty_store,
 };
-static struct cfq_fs_entry cfq_clear_elapsed_entry = {
-	.attr = {.name = "clear_elapsed", .mode = S_IWUSR },
-	.store = cfq_clear_elapsed,
+static struct cfq_fs_entry cfq_slice_sync_entry = {
+	.attr = {.name = "slice_sync", .mode = S_IRUGO | S_IWUSR },
+	.show = cfq_slice_sync_show,
+	.store = cfq_slice_sync_store,
 };
-static struct cfq_fs_entry cfq_key_type_entry = {
-	.attr = {.name = "key_type", .mode = S_IRUGO | S_IWUSR },
-	.show = cfq_read_key_type,
-	.store = cfq_set_key_type,
+static struct cfq_fs_entry cfq_slice_async_entry = {
+	.attr = {.name = "slice_async", .mode = S_IRUGO | S_IWUSR },
+	.show = cfq_slice_async_show,
+	.store = cfq_slice_async_store,
+};
+static struct cfq_fs_entry cfq_slice_async_rq_entry = {
+	.attr = {.name = "slice_async_rq", .mode = S_IRUGO | S_IWUSR },
+	.show = cfq_slice_async_rq_show,
+	.store = cfq_slice_async_rq_store,
+};
+static struct cfq_fs_entry cfq_slice_idle_entry = {
+	.attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR },
+	.show = cfq_slice_idle_show,
+	.store = cfq_slice_idle_store,
+};
+static struct cfq_fs_entry cfq_max_depth_entry = {
+	.attr = {.name = "max_depth", .mode = S_IRUGO | S_IWUSR },
+	.show = cfq_max_depth_show,
+	.store = cfq_max_depth_store,
 };
-
 static struct attribute *default_attrs[] = {
 	&cfq_quantum_entry.attr,
 	&cfq_queued_entry.attr,
-	&cfq_fifo_expire_r_entry.attr,
-	&cfq_fifo_expire_w_entry.attr,
-	&cfq_fifo_batch_expire_entry.attr,
-	&cfq_key_type_entry.attr,
-	&cfq_find_best_entry.attr,
+	&cfq_fifo_expire_sync_entry.attr,
+	&cfq_fifo_expire_async_entry.attr,
 	&cfq_back_max_entry.attr,
 	&cfq_back_penalty_entry.attr,
-	&cfq_clear_elapsed_entry.attr,
+	&cfq_slice_sync_entry.attr,
+	&cfq_slice_async_entry.attr,
+	&cfq_slice_async_rq_entry.attr,
+	&cfq_slice_idle_entry.attr,
+	&cfq_max_depth_entry.attr,
 	NULL,
 };
 
@@ -1832,21 +2449,46 @@ static int __init cfq_init(void)
 {
 	int ret;
 
+	/*
+	 * could be 0 on HZ < 1000 setups
+	 */
+	if (!cfq_slice_async)
+		cfq_slice_async = 1;
+	if (!cfq_slice_idle)
+		cfq_slice_idle = 1;
+
 	if (cfq_slab_setup())
 		return -ENOMEM;
 
 	ret = elv_register(&iosched_cfq);
-	if (!ret) {
-		__module_get(THIS_MODULE);
-		return 0;
-	}
+	if (ret)
+		cfq_slab_kill();
 
-	cfq_slab_kill();
 	return ret;
 }
 
 static void __exit cfq_exit(void)
 {
+	struct task_struct *g, *p;
+	unsigned long flags;
+
+	read_lock_irqsave(&tasklist_lock, flags);
+
+	/*
+	 * iterate over each process in the system, removing our io_context
+	 */
+	do_each_thread(g, p) {
+		struct io_context *ioc = p->io_context;
+
+		if (ioc && ioc->cic) {
+			ioc->cic->exit(ioc->cic);
+			cfq_free_io_context(ioc->cic);
+			ioc->cic = NULL;
+		}
+	} while_each_thread(g, p);
+
+	read_unlock_irqrestore(&tasklist_lock, flags);
+
 	cfq_slab_kill();
 	elv_unregister(&iosched_cfq);
 }
diff --git a/drivers/block/deadline-iosched.c b/drivers/block/deadline-iosched.c
index 4bc2fea73273..ff5201e02153 100644
--- a/drivers/block/deadline-iosched.c
+++ b/drivers/block/deadline-iosched.c
@@ -760,7 +760,8 @@ static void deadline_put_request(request_queue_t *q, struct request *rq)
 }
 
 static int
-deadline_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
+deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
+		     int gfp_mask)
 {
 	struct deadline_data *dd = q->elevator->elevator_data;
 	struct deadline_rq *drq;
diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c
index f831f08f839c..98f0126a2deb 100644
--- a/drivers/block/elevator.c
+++ b/drivers/block/elevator.c
@@ -486,12 +486,13 @@ struct request *elv_former_request(request_queue_t *q, struct request *rq)
 	return NULL;
 }
 
-int elv_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
+int elv_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
+		    int gfp_mask)
 {
 	elevator_t *e = q->elevator;
 
 	if (e->ops->elevator_set_req_fn)
-		return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
+		return e->ops->elevator_set_req_fn(q, rq, bio, gfp_mask);
 
 	rq->elevator_private = NULL;
 	return 0;
@@ -505,12 +506,12 @@ void elv_put_request(request_queue_t *q, struct request *rq)
 		e->ops->elevator_put_req_fn(q, rq);
 }
 
-int elv_may_queue(request_queue_t *q, int rw)
+int elv_may_queue(request_queue_t *q, int rw, struct bio *bio)
 {
 	elevator_t *e = q->elevator;
 
 	if (e->ops->elevator_may_queue_fn)
-		return e->ops->elevator_may_queue_fn(q, rw);
+		return e->ops->elevator_may_queue_fn(q, rw, bio);
 
 	return ELV_MQUEUE_MAY;
 }
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 60e64091de1b..234fdcfbdf01 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -276,6 +276,7 @@ static inline void rq_init(request_queue_t *q, struct request *rq)
 	rq->errors = 0;
 	rq->rq_status = RQ_ACTIVE;
 	rq->bio = rq->biotail = NULL;
+	rq->ioprio = 0;
 	rq->buffer = NULL;
 	rq->ref_count = 1;
 	rq->q = q;
@@ -1442,11 +1443,7 @@ void __generic_unplug_device(request_queue_t *q)
 	if (!blk_remove_plug(q))
 		return;
 
-	/*
-	 * was plugged, fire request_fn if queue has stuff to do
-	 */
-	if (elv_next_request(q))
-		q->request_fn(q);
+	q->request_fn(q);
 }
 EXPORT_SYMBOL(__generic_unplug_device);
 
@@ -1776,8 +1773,8 @@ static inline void blk_free_request(request_queue_t *q, struct request *rq)
 	mempool_free(rq, q->rq.rq_pool);
 }
 
-static inline struct request *blk_alloc_request(request_queue_t *q, int rw,
-						int gfp_mask)
+static inline struct request *
+blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask)
 {
 	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
 
@@ -1790,7 +1787,7 @@ static inline struct request *blk_alloc_request(request_queue_t *q, int rw,
 	 */
 	rq->flags = rw;
 
-	if (!elv_set_request(q, rq, gfp_mask))
+	if (!elv_set_request(q, rq, bio, gfp_mask))
 		return rq;
 
 	mempool_free(rq, q->rq.rq_pool);
@@ -1872,7 +1869,8 @@ static void freed_request(request_queue_t *q, int rw)
 /*
  * Get a free request, queue_lock must not be held
  */
-static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
+static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
+				   int gfp_mask)
 {
 	struct request *rq = NULL;
 	struct request_list *rl = &q->rq;
@@ -1895,7 +1893,7 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
 		}
 	}
 
-	switch (elv_may_queue(q, rw)) {
+	switch (elv_may_queue(q, rw, bio)) {
 		case ELV_MQUEUE_NO:
 			goto rq_starved;
 		case ELV_MQUEUE_MAY:
@@ -1920,7 +1918,7 @@ get_rq:
 		set_queue_congested(q, rw);
 	spin_unlock_irq(q->queue_lock);
 
-	rq = blk_alloc_request(q, rw, gfp_mask);
+	rq = blk_alloc_request(q, rw, bio, gfp_mask);
 	if (!rq) {
 		/*
 		 * Allocation failed presumably due to memory. Undo anything
@@ -1961,7 +1959,8 @@ out:
  * No available requests for this queue, unplug the device and wait for some
  * requests to become available.
  */
-static struct request *get_request_wait(request_queue_t *q, int rw)
+static struct request *get_request_wait(request_queue_t *q, int rw,
+					struct bio *bio)
 {
 	DEFINE_WAIT(wait);
 	struct request *rq;
@@ -1972,7 +1971,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw)
 		prepare_to_wait_exclusive(&rl->wait[rw], &wait,
 				TASK_UNINTERRUPTIBLE);
 
-		rq = get_request(q, rw, GFP_NOIO);
+		rq = get_request(q, rw, bio, GFP_NOIO);
 
 		if (!rq) {
 			struct io_context *ioc;
@@ -2003,9 +2002,9 @@ struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask)
 	BUG_ON(rw != READ && rw != WRITE);
 
 	if (gfp_mask & __GFP_WAIT)
-		rq = get_request_wait(q, rw);
+		rq = get_request_wait(q, rw, NULL);
 	else
-		rq = get_request(q, rw, gfp_mask);
+		rq = get_request(q, rw, NULL, gfp_mask);
 
 	return rq;
 }
@@ -2333,7 +2332,6 @@ static void __blk_put_request(request_queue_t *q, struct request *req)
 		return;
 
 	req->rq_status = RQ_INACTIVE;
-	req->q = NULL;
 	req->rl = NULL;
 
 	/*
@@ -2462,6 +2460,8 @@ static int attempt_merge(request_queue_t *q, struct request *req,
 		req->rq_disk->in_flight--;
 	}
 
+	req->ioprio = ioprio_best(req->ioprio, next->ioprio);
+
 	__blk_put_request(q, next);
 	return 1;
 }
@@ -2514,11 +2514,13 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 {
 	struct request *req, *freereq = NULL;
 	int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync;
+	unsigned short prio;
 	sector_t sector;
 
 	sector = bio->bi_sector;
 	nr_sectors = bio_sectors(bio);
 	cur_nr_sectors = bio_cur_sectors(bio);
+	prio = bio_prio(bio);
 
 	rw = bio_data_dir(bio);
 	sync = bio_sync(bio);
@@ -2559,6 +2561,7 @@ again:
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
 			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
+			req->ioprio = ioprio_best(req->ioprio, prio);
 			drive_stat_acct(req, nr_sectors, 0);
 			if (!attempt_back_merge(q, req))
 				elv_merged_request(q, req);
@@ -2583,6 +2586,7 @@ again:
 			req->hard_cur_sectors = cur_nr_sectors;
 			req->sector = req->hard_sector = sector;
 			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
+			req->ioprio = ioprio_best(req->ioprio, prio);
 			drive_stat_acct(req, nr_sectors, 0);
 			if (!attempt_front_merge(q, req))
 				elv_merged_request(q, req);
@@ -2610,7 +2614,7 @@ get_rq:
 		freereq = NULL;
 	} else {
 		spin_unlock_irq(q->queue_lock);
-		if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) {
+		if ((freereq = get_request(q, rw, bio, GFP_ATOMIC)) == NULL) {
 			/*
 			 * READA bit set
 			 */
@@ -2618,7 +2622,7 @@ get_rq:
 			if (bio_rw_ahead(bio))
 				goto end_io;
 	
-			freereq = get_request_wait(q, rw);
+			freereq = get_request_wait(q, rw, bio);
 		}
 		goto again;
 	}
@@ -2646,6 +2650,7 @@ get_rq:
 	req->buffer = bio_data(bio);	/* see ->buffer comment above */
 	req->waiting = NULL;
 	req->bio = req->biotail = bio;
+	req->ioprio = prio;
 	req->rq_disk = bio->bi_bdev->bd_disk;
 	req->start_time = jiffies;
 
@@ -2674,7 +2679,7 @@ static inline void blk_partition_remap(struct bio *bio)
 	if (bdev != bdev->bd_contains) {
 		struct hd_struct *p = bdev->bd_part;
 
-		switch (bio->bi_rw) {
+		switch (bio_data_dir(bio)) {
 		case READ:
 			p->read_sectors += bio_sectors(bio);
 			p->reads++;
@@ -2693,6 +2698,7 @@ void blk_finish_queue_drain(request_queue_t *q)
 {
 	struct request_list *rl = &q->rq;
 	struct request *rq;
+	int requeued = 0;
 
 	spin_lock_irq(q->queue_lock);
 	clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
@@ -2701,9 +2707,13 @@ void blk_finish_queue_drain(request_queue_t *q)
 		rq = list_entry_rq(q->drain_list.next);
 
 		list_del_init(&rq->queuelist);
-		__elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1);
+		elv_requeue_request(q, rq);
+		requeued++;
 	}
 
+	if (requeued)
+		q->request_fn(q);
+
 	spin_unlock_irq(q->queue_lock);
 
 	wake_up(&rl->wait[0]);
@@ -2900,7 +2910,7 @@ void submit_bio(int rw, struct bio *bio)
 
 	BIO_BUG_ON(!bio->bi_size);
 	BIO_BUG_ON(!bio->bi_io_vec);
-	bio->bi_rw = rw;
+	bio->bi_rw |= rw;
 	if (rw & WRITE)
 		mod_page_state(pgpgout, count);
 	else
@@ -3257,8 +3267,11 @@ void exit_io_context(void)
 	struct io_context *ioc;
 
 	local_irq_save(flags);
+	task_lock(current);
 	ioc = current->io_context;
 	current->io_context = NULL;
+	ioc->task = NULL;
+	task_unlock(current);
 	local_irq_restore(flags);
 
 	if (ioc->aic && ioc->aic->exit)
@@ -3293,12 +3306,12 @@ struct io_context *get_io_context(int gfp_flags)
 	ret = kmem_cache_alloc(iocontext_cachep, gfp_flags);
 	if (ret) {
 		atomic_set(&ret->refcount, 1);
-		ret->pid = tsk->pid;
+		ret->task = current;
+		ret->set_ioprio = NULL;
 		ret->last_waited = jiffies; /* doesn't matter... */
 		ret->nr_batch_requests = 0; /* because this is 0 */
 		ret->aic = NULL;
 		ret->cic = NULL;
-		spin_lock_init(&ret->lock);
 
 		local_irq_save(flags);
 
diff --git a/fs/Makefile b/fs/Makefile
index fc92e59e9faf..20edcf28bfd2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -10,6 +10,7 @@ obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
 		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
+		ioprio.o
 
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_COMPAT)		+= compat.o
diff --git a/fs/ioprio.c b/fs/ioprio.c
new file mode 100644
index 000000000000..663e420636d6
--- /dev/null
+++ b/fs/ioprio.c
@@ -0,0 +1,172 @@
+/*
+ * fs/ioprio.c
+ *
+ * Copyright (C) 2004 Jens Axboe <axboe@suse.de>
+ *
+ * Helper functions for setting/querying io priorities of processes. The
+ * system calls closely mimic getpriority/setpriority, see the man page for
+ * those. The prio argument is a composite of prio class and prio data, where
+ * the data argument has meaning within that class. The standard scheduling
+ * classes have 8 distinct prio levels, with 0 being the highest prio and 7
+ * being the lowest.
+ *
+ * IOW, setting BE scheduling class with prio 2 is done ala:
+ *
+ * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
+ *
+ * ioprio_set(PRIO_PROCESS, pid, prio);
+ *
+ * See also Documentation/block/ioprio.txt
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/ioprio.h>
+#include <linux/blkdev.h>
+
+static int set_task_ioprio(struct task_struct *task, int ioprio)
+{
+	struct io_context *ioc;
+
+	if (task->uid != current->euid &&
+	    task->uid != current->uid && !capable(CAP_SYS_NICE))
+		return -EPERM;
+
+	task_lock(task);
+
+	task->ioprio = ioprio;
+
+	ioc = task->io_context;
+	if (ioc && ioc->set_ioprio)
+		ioc->set_ioprio(ioc, ioprio);
+
+	task_unlock(task);
+	return 0;
+}
+
+/*
+ * Set the io priority of a task, process group or user ('which'), where
+ * who == 0 means the caller. Returns 0, -EINVAL, -EPERM or -ESRCH.
+ */
+asmlinkage int sys_ioprio_set(int which, int who, int ioprio)
+{
+	int class = IOPRIO_PRIO_CLASS(ioprio);
+	int data = IOPRIO_PRIO_DATA(ioprio);
+	struct task_struct *p, *g;
+	struct user_struct *user;
+	int ret;
+
+	switch (class) {
+		case IOPRIO_CLASS_RT:
+			if (!capable(CAP_SYS_ADMIN))
+				return -EPERM;
+			/* fall through, rt has prio field too */
+		case IOPRIO_CLASS_BE:
+			if (data >= IOPRIO_BE_NR || data < 0)
+				return -EINVAL;
+			break;
+		case IOPRIO_CLASS_IDLE:
+			break;
+		default:
+			return -EINVAL;
+	}
+
+	ret = -ESRCH;
+	read_lock_irq(&tasklist_lock);
+	switch (which) {
+		case IOPRIO_WHO_PROCESS:
+			if (!who)
+				p = current;
+			else
+				p = find_task_by_pid(who);
+			if (p)
+				ret = set_task_ioprio(p, ioprio);
+			break;
+		case IOPRIO_WHO_PGRP:
+			if (!who)
+				who = process_group(current);
+			do_each_task_pid(who, PIDTYPE_PGID, p) {
+				ret = set_task_ioprio(p, ioprio);
+				if (ret)
+					break;
+			} while_each_task_pid(who, PIDTYPE_PGID, p);
+			break;
+		case IOPRIO_WHO_USER:
+			user = who ? find_user(who) : current->user;
+			if (!user)
+				break;
+
+			/* who == 0 selects current->user: match on user->uid */
+			do_each_thread(g, p) {
+				if (p->uid != user->uid)
+					continue;
+				ret = set_task_ioprio(p, ioprio);
+				if (ret)
+					break;
+			} while_each_thread(g, p);
+
+			if (who)
+				free_uid(user);
+			break;
+		default:
+			ret = -EINVAL;
+	}
+
+	read_unlock_irq(&tasklist_lock);
+	return ret;
+}
+
+asmlinkage int sys_ioprio_get(int which, int who)
+{
+	struct task_struct *g, *p;
+	struct user_struct *user;
+	int ret = -ESRCH;
+
+	read_lock_irq(&tasklist_lock);
+	switch (which) {
+		case IOPRIO_WHO_PROCESS:
+			if (!who)
+				p = current;
+			else
+				p = find_task_by_pid(who);
+			if (p)
+				ret = p->ioprio;
+			break;
+		case IOPRIO_WHO_PGRP:
+			if (!who)
+				who = process_group(current);
+			do_each_task_pid(who, PIDTYPE_PGID, p) {
+				if (ret == -ESRCH)
+					ret = p->ioprio;
+				else
+					ret = ioprio_best(ret, p->ioprio);
+			} while_each_task_pid(who, PIDTYPE_PGID, p);
+			break;
+		case IOPRIO_WHO_USER:
+			if (!who)
+				user = current->user;
+			else
+				user = find_user(who);
+
+			if (!user)
+				break;
+
+			do_each_thread(g, p) {
+				if (p->uid != user->uid)
+					continue;
+				if (ret == -ESRCH)
+					ret = p->ioprio;
+				else
+					ret = ioprio_best(ret, p->ioprio);
+			} while_each_thread(g, p);
+
+			if (who)
+				free_uid(user);
+			break;
+		default:
+			ret = -EINVAL;
+	}
+
+	read_unlock_irq(&tasklist_lock);
+	return ret;
+}
+
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 7b87707acc36..d1bcf0da6728 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -645,18 +645,22 @@ struct buffer_chunk {
 
 static void write_chunk(struct buffer_chunk *chunk) {
     int i;
+    get_fs_excl();
     for (i = 0; i < chunk->nr ; i++) {
 	submit_logged_buffer(chunk->bh[i]) ;
     }
     chunk->nr = 0;
+    put_fs_excl();
 }
 
 static void write_ordered_chunk(struct buffer_chunk *chunk) {
     int i;
+    get_fs_excl();
     for (i = 0; i < chunk->nr ; i++) {
 	submit_ordered_buffer(chunk->bh[i]) ;
     }
     chunk->nr = 0;
+    put_fs_excl();
 }
 
 static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
@@ -918,6 +922,8 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list
     return 0 ;
   }
 
+  get_fs_excl();
+
   /* before we can put our commit blocks on disk, we have to make sure everyone older than
   ** us is on disk too
   */
@@ -1055,6 +1061,7 @@ put_jl:
 
   if (retval)
     reiserfs_abort (s, retval, "Journal write error in %s", __FUNCTION__);
+  put_fs_excl();
   return retval;
 }
 
@@ -1251,6 +1258,8 @@ static int flush_journal_list(struct super_block *s,
     return 0 ;
   }
 
+  get_fs_excl();
+
   /* if all the work is already done, get out of here */
   if (atomic_read(&(jl->j_nonzerolen)) <= 0 && 
       atomic_read(&(jl->j_commit_left)) <= 0) {
@@ -1450,6 +1459,7 @@ flush_older_and_return:
   put_journal_list(s, jl);
   if (flushall)
     up(&journal->j_flush_sem);
+  put_fs_excl();
   return err ;
 } 
 
@@ -2719,6 +2729,7 @@ relock:
   th->t_trans_id = journal->j_trans_id ;
   unlock_journal(p_s_sb) ;
   INIT_LIST_HEAD (&th->t_list);
+  get_fs_excl();
   return 0 ;
 
 out_fail:
@@ -3526,6 +3537,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   BUG_ON (th->t_refcount > 1);
   BUG_ON (!th->t_trans_id);
 
+  put_fs_excl();
   current->journal_info = th->t_handle_save;
   reiserfs_check_lock_depth(p_s_sb, "journal end");
   if (journal->j_len == 0) {
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 176413fb9ae3..e25e4c71a879 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -294,8 +294,10 @@
 #define __NR_add_key		286
 #define __NR_request_key	287
 #define __NR_keyctl		288
+#define __NR_ioprio_set		289
+#define __NR_ioprio_get		290
 
-#define NR_syscalls 289
+#define NR_syscalls 291
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index f7f43ec2483a..517f1649ee64 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -263,6 +263,8 @@
 #define __NR_add_key			1271
 #define __NR_request_key		1272
 #define __NR_keyctl			1273
+#define __NR_ioprio_set			1274
+#define __NR_ioprio_get			1275
 #define __NR_set_zone_reclaim		1276
 
 #ifdef __KERNEL__
diff --git a/include/asm-ppc/unistd.h b/include/asm-ppc/unistd.h
index cc51e5c9acc2..e8b79220b29c 100644
--- a/include/asm-ppc/unistd.h
+++ b/include/asm-ppc/unistd.h
@@ -277,8 +277,10 @@
 #define __NR_request_key	270
 #define __NR_keyctl		271
 #define __NR_waitid		272
+#define __NR_ioprio_set		273
+#define __NR_ioprio_get		274
 
-#define __NR_syscalls		273
+#define __NR_syscalls		275
 
 #define __NR(n)	#n
 
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index d767adcbf0ff..6560439a83e4 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -561,8 +561,12 @@ __SYSCALL(__NR_add_key, sys_add_key)
 __SYSCALL(__NR_request_key, sys_request_key)
 #define __NR_keyctl		250
 __SYSCALL(__NR_keyctl, sys_keyctl)
+#define __NR_ioprio_set		251
+__SYSCALL(__NR_ioprio_set, sys_ioprio_set)
+#define __NR_ioprio_get		252
+__SYSCALL(__NR_ioprio_get, sys_ioprio_get)
 
-#define __NR_syscall_max __NR_keyctl
+#define __NR_syscall_max __NR_ioprio_get
 #ifndef __NO_STUBS
 
 /* user-visible error numbers are in the range -1 - -4095 */
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 038022763f09..36ef29fa0d8b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -22,6 +22,7 @@
 
 #include <linux/highmem.h>
 #include <linux/mempool.h>
+#include <linux/ioprio.h>
 
 /* Platforms may set this to teach the BIO layer about IOMMU hardware. */
 #include <asm/io.h>
@@ -149,6 +150,19 @@ struct bio {
 #define BIO_RW_FAILFAST	3
 #define BIO_RW_SYNC	4
 
+/*
+ * upper 16 bits of bi_rw define the io priority of this bio
+ */
+#define BIO_PRIO_SHIFT	(8 * sizeof(unsigned long) - IOPRIO_BITS)
+#define bio_prio(bio)	((bio)->bi_rw >> BIO_PRIO_SHIFT)
+#define bio_prio_valid(bio)	ioprio_valid(bio_prio(bio))
+
+#define bio_set_prio(bio, prio)		do {			\
+	WARN_ON((prio) >= (1 << IOPRIO_BITS));			\
+	(bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1);		\
+	(bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT);	\
+} while (0)
+
 /*
  * various member access, note that bio_data should of course not be used
  * on highmem page vectors
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b54a0348a890..21a8674cd149 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -54,16 +54,23 @@ struct as_io_context {
 
 struct cfq_queue;
 struct cfq_io_context {
-	void (*dtor)(struct cfq_io_context *);
-	void (*exit)(struct cfq_io_context *);
-
-	struct io_context *ioc;
-
 	/*
 	 * circular list of cfq_io_contexts belonging to a process io context
 	 */
 	struct list_head list;
 	struct cfq_queue *cfqq;
+	void *key;
+
+	struct io_context *ioc;
+
+	unsigned long last_end_request;
+	unsigned long last_queue;
+	unsigned long ttime_total;
+	unsigned long ttime_samples;
+	unsigned long ttime_mean;
+
+	void (*dtor)(struct cfq_io_context *);
+	void (*exit)(struct cfq_io_context *);
 };
 
 /*
@@ -73,7 +80,9 @@ struct cfq_io_context {
  */
 struct io_context {
 	atomic_t refcount;
-	pid_t pid;
+	struct task_struct *task;
+
+	int (*set_ioprio)(struct io_context *, unsigned int);
 
 	/*
 	 * For request batching
@@ -81,8 +90,6 @@ struct io_context {
 	unsigned long last_waited; /* Time last woken after wait for request */
 	int nr_batch_requests;     /* Number of requests left in the batch */
 
-	spinlock_t lock;
-
 	struct as_io_context *aic;
 	struct cfq_io_context *cic;
 };
@@ -134,6 +141,8 @@ struct request {
 
 	void *elevator_private;
 
+	unsigned short ioprio;
+
 	int rq_status;	/* should split this into a few status bits */
 	struct gendisk *rq_disk;
 	int errors;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index ee54f81faad5..ea6bbc2d7407 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -16,9 +16,9 @@ typedef void (elevator_remove_req_fn) (request_queue_t *, struct request *);
 typedef void (elevator_requeue_req_fn) (request_queue_t *, struct request *);
 typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *);
 typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *);
-typedef int (elevator_may_queue_fn) (request_queue_t *, int);
+typedef int (elevator_may_queue_fn) (request_queue_t *, int, struct bio *);
 
-typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, int);
+typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, struct bio *, int);
 typedef void (elevator_put_req_fn) (request_queue_t *, struct request *);
 typedef void (elevator_deactivate_req_fn) (request_queue_t *, struct request *);
 
@@ -96,9 +96,9 @@ extern struct request *elv_former_request(request_queue_t *, struct request *);
 extern struct request *elv_latter_request(request_queue_t *, struct request *);
 extern int elv_register_queue(request_queue_t *q);
 extern void elv_unregister_queue(request_queue_t *q);
-extern int elv_may_queue(request_queue_t *, int);
+extern int elv_may_queue(request_queue_t *, int, struct bio *);
 extern void elv_completed_request(request_queue_t *, struct request *);
-extern int elv_set_request(request_queue_t *, struct request *, int);
+extern int elv_set_request(request_queue_t *, struct request *, struct bio *, int);
 extern void elv_put_request(request_queue_t *, struct request *);
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3ae8e37bdfc8..047bde30836a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -213,6 +213,7 @@ extern int dir_notify_enable;
 #include <linux/radix-tree.h>
 #include <linux/prio_tree.h>
 #include <linux/init.h>
+#include <linux/sched.h>
 
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
@@ -822,16 +823,34 @@ enum {
 #define vfs_check_frozen(sb, level) \
 	wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
 
+static inline void get_fs_excl(void)
+{
+	atomic_inc(&current->fs_excl);
+}
+
+static inline void put_fs_excl(void)
+{
+	atomic_dec(&current->fs_excl);
+}
+
+static inline int has_fs_excl(void)
+{
+	return atomic_read(&current->fs_excl);
+}
+
+
 /*
  * Superblock locking.
  */
 static inline void lock_super(struct super_block * sb)
 {
+	get_fs_excl();
 	down(&sb->s_lock);
 }
 
 static inline void unlock_super(struct super_block * sb)
 {
+	put_fs_excl();
 	up(&sb->s_lock);
 }
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 03206a425d7a..c727c195a91a 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -81,6 +81,7 @@ extern struct group_info init_groups;
 	.mm		= NULL,						\
 	.active_mm	= &init_mm,					\
 	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
+	.ioprio		= 0,						\
 	.time_slice	= HZ,						\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
@@ -110,6 +111,7 @@ extern struct group_info init_groups;
 	.proc_lock	= SPIN_LOCK_UNLOCKED,				\
 	.journal_info	= NULL,						\
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
+	.fs_excl	= ATOMIC_INIT(0),				\
 }
 
 
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
new file mode 100644
index 000000000000..7811300d88ee
--- /dev/null
+++ b/include/linux/ioprio.h
@@ -0,0 +1,87 @@
+#ifndef IOPRIO_H
+#define IOPRIO_H
+
+#include <linux/sched.h>
+
+/*
+ * Gives us 8 prio classes with 13-bits of data for each class
+ */
+#define IOPRIO_BITS		(16)
+#define IOPRIO_CLASS_SHIFT	(13)
+#define IOPRIO_PRIO_MASK	((1UL << IOPRIO_CLASS_SHIFT) - 1)
+
+#define IOPRIO_PRIO_CLASS(mask)	((mask) >> IOPRIO_CLASS_SHIFT)
+#define IOPRIO_PRIO_DATA(mask)	((mask) & IOPRIO_PRIO_MASK)
+
+#define ioprio_valid(mask)	(IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE)
+
+/*
+ * These are the io priority groups as implemented by CFQ. RT is the realtime
+ * class, it always gets premium service. BE is the best-effort scheduling
+ * class, the default for any process. IDLE is the idle scheduling class, it
+ * is only served when no one else is using the disk.
+ */
+enum {
+	IOPRIO_CLASS_NONE,
+	IOPRIO_CLASS_RT,
+	IOPRIO_CLASS_BE,
+	IOPRIO_CLASS_IDLE,
+};
+
+/*
+ * 8 best effort priority levels are supported
+ */
+#define IOPRIO_BE_NR	(8)
+
+asmlinkage int sys_ioprio_set(int, int, int);
+asmlinkage int sys_ioprio_get(int, int);
+
+enum {
+	IOPRIO_WHO_PROCESS = 1,
+	IOPRIO_WHO_PGRP,
+	IOPRIO_WHO_USER,
+};
+
+/*
+ * if process has set io priority explicitly, use that. if not, convert
+ * the cpu scheduler nice value to an io priority
+ */
+#define IOPRIO_NORM	(4)
+static inline int task_ioprio(struct task_struct *task)
+{
+	WARN_ON(!ioprio_valid(task->ioprio));
+	return IOPRIO_PRIO_DATA(task->ioprio);
+}
+
+static inline int task_nice_ioprio(struct task_struct *task)
+{
+	return (task_nice(task) + 20) / 5;
+}
+
+/*
+ * For inheritance, return the higher of the two given priorities
+ */
+static inline int ioprio_best(unsigned short aprio, unsigned short bprio)
+{
+	unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
+	unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);
+
+	if (!ioprio_valid(aprio))
+		return bprio;
+	if (!ioprio_valid(bprio))
+		return aprio;
+
+	if (aclass == IOPRIO_CLASS_NONE)
+		aclass = IOPRIO_CLASS_BE;
+	if (bclass == IOPRIO_CLASS_NONE)
+		bclass = IOPRIO_CLASS_BE;
+
+	if (aclass == bclass)
+		return min(aprio, bprio);
+	if (aclass > bclass)
+		return bprio;
+	else
+		return aprio;
+}
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9530b1903160..ff48815bd3a2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -608,6 +608,8 @@ struct task_struct {
 	struct list_head run_list;
 	prio_array_t *array;
 
+	unsigned short ioprio;
+
 	unsigned long sleep_avg;
 	unsigned long long timestamp, last_ran;
 	unsigned long long sched_time; /* sched_clock time spent running */
@@ -763,6 +765,7 @@ struct task_struct {
 	nodemask_t mems_allowed;
 	int cpuset_mems_generation;
 #endif
+	atomic_t fs_excl;	/* holding fs exclusive resources */
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
@@ -1112,7 +1115,8 @@ extern void unhash_process(struct task_struct *p);
 
 /*
  * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring
- * subscriptions and synchronises with wait4().  Also used in procfs.
+ * subscriptions and synchronises with wait4().  Also used in procfs.  Also
+ * pins the final release of task.io_context.
  *
  * Nests both inside and outside of read_lock(&tasklist_lock).
  * It must not be nested with write_lock_irq(&tasklist_lock),
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 1262cb43c3ab..d5c3fe1bf33d 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -14,11 +14,13 @@ extern struct list_head inode_unused;
  * Yes, writeback.h requires sched.h
  * No, sched.h is not included from here.
  */
-static inline int current_is_pdflush(void)
+static inline int task_is_pdflush(struct task_struct *task)
 {
-	return current->flags & PF_FLUSHER;
+	return task->flags & PF_FLUSHER;
 }
 
+#define current_is_pdflush()	task_is_pdflush(current)
+
 /*
  * fs/fs-writeback.c
  */
diff --git a/kernel/exit.c b/kernel/exit.c
index 3ebcd60a19c6..9d1b10ed0135 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -784,6 +784,8 @@ fastcall NORET_TYPE void do_exit(long code)
 
 	profile_task_exit(tsk);
 
+	WARN_ON(atomic_read(&tsk->fs_excl));
+
 	if (unlikely(in_interrupt()))
 		panic("Aiee, killing interrupt handler!");
 	if (unlikely(!tsk->pid))
diff --git a/kernel/fork.c b/kernel/fork.c
index 2c7806873bfd..cdef6cea8900 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1090,6 +1090,11 @@ static task_t *copy_process(unsigned long clone_flags,
 		spin_unlock(&current->sighand->siglock);
 	}
 
+	/*
+	 * inherit ioprio
+	 */
+	p->ioprio = current->ioprio;
+
 	SET_LINKS(p);
 	if (unlikely(p->ptrace & PT_PTRACED))
 		__ptrace_link(p, current->parent);
diff --git a/kernel/sched.c b/kernel/sched.c
index a07cff90d849..e2b0d3e4dd06 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3448,15 +3448,7 @@ int task_nice(const task_t *p)
 {
 	return TASK_NICE(p);
 }
-
-/*
- * The only users of task_nice are binfmt_elf and binfmt_elf32.
- * binfmt_elf is no longer modular, but binfmt_elf32 still is.
- * Therefore, task_nice is needed if there is a compat_mode.
- */
-#ifdef CONFIG_COMPAT
 EXPORT_SYMBOL_GPL(task_nice);
-#endif
 
 /**
  * idle_cpu - is a given cpu idle currently?
-- 
cgit v1.2.3-59-g8ed1b


From 3b18152c327707ae6a2eeba4cfb66457143753bc Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Mon, 27 Jun 2005 10:56:24 +0200
Subject: [PATCH] CFQ io scheduler updates

- Adjust slice values

- Instead of one async queue, one is defined per priority level. This
  prevents kernel threads (such as reiserfs/x and others) that run at
  higher io priority from conflicting with others. Previously, it was a
  coin toss what io prio the async queue got, it was defined by who
  first set up the queue.

- Let a time slice begin only when the previous slice is completely
  done. Previously we could be somewhat unfair to a new sync slice, if
  the previous slice was async and had several ios queued. This might
  need a little tweaking if throughput suffers a little due to this,
  allowing perhaps an overlap of a single request or so.

- Optimize the calling of kblockd_schedule_work() by doing it only when
  it is strictly necessary (no requests in driver and work left to do).

- Correct sync vs async logic. A 'normal' process can be purely async as
  well, and a flusher can be purely sync as well. Sync or async is now a
  property of the class defined and requests pending. Previously writers
  could be considered sync, when they were really async.

- Get rid of the bit fields in cfqq and crq, use flags instead.

- Various other cleanups and fixes

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/cfq-iosched.c | 460 ++++++++++++++++++++++++++++----------------
 include/linux/ioprio.h      |   1 +
 2 files changed, 300 insertions(+), 161 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c
index baa3e268250a..1ecb179b8604 100644
--- a/drivers/block/cfq-iosched.c
+++ b/drivers/block/cfq-iosched.c
@@ -34,14 +34,15 @@ static int cfq_back_max = 16 * 1024;	/* maximum backwards seek, in KiB */
 static int cfq_back_penalty = 2;	/* penalty of a backwards seek */
 
 static int cfq_slice_sync = HZ / 10;
-static int cfq_slice_async = HZ / 50;
+static int cfq_slice_async = HZ / 25;
 static int cfq_slice_async_rq = 2;
-static int cfq_slice_idle = HZ / 50;
+static int cfq_slice_idle = HZ / 100;
 
 #define CFQ_IDLE_GRACE		(HZ / 10)
 #define CFQ_SLICE_SCALE		(5)
 
 #define CFQ_KEY_ASYNC		(0)
+#define CFQ_KEY_ANY		(0xffff)
 
 /*
  * disable queueing at the driver/hardware level
@@ -96,7 +97,16 @@ static kmem_cache_t *cfq_ioc_pool;
 #define cfq_class_be(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_BE)
 #define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
 
-#define cfq_cfqq_sync(cfqq)	((cfqq)->key != CFQ_KEY_ASYNC)
+#define ASYNC			(0)
+#define SYNC			(1)
+
+#define cfq_cfqq_dispatched(cfqq)	\
+	((cfqq)->on_dispatch[ASYNC] + (cfqq)->on_dispatch[SYNC])
+
+#define cfq_cfqq_class_sync(cfqq)	((cfqq)->key != CFQ_KEY_ASYNC)
+
+#define cfq_cfqq_sync(cfqq)		\
+	(cfq_cfqq_class_sync(cfqq) || (cfqq)->on_dispatch[SYNC])
 
 /*
  * Per block device queue structure
@@ -200,28 +210,15 @@ struct cfq_queue {
 	unsigned long slice_left;
 	unsigned long service_last;
 
-	/* number of requests that have been handed to the driver */
-	int in_flight;
+	/* number of requests that are on the dispatch list */
+	int on_dispatch[2];
 
 	/* io prio of this group */
 	unsigned short ioprio, org_ioprio;
 	unsigned short ioprio_class, org_ioprio_class;
 
-	/* whether queue is on rr (or empty) list */
-	unsigned on_rr : 1;
-	/* idle slice, waiting for new request submission */
-	unsigned wait_request : 1;
-	/* set when wait_request gets set, reset on first rq alloc */
-	unsigned must_alloc : 1;
-	/* only gets one must_alloc per slice */
-	unsigned must_alloc_slice : 1;
-	/* idle slice, request added, now waiting to dispatch it */
-	unsigned must_dispatch : 1;
-	/* fifo expire per-slice */
-	unsigned fifo_expire : 1;
-
-	unsigned idle_window : 1;
-	unsigned prio_changed : 1;
+	/* various state flags, see below */
+	unsigned int flags;
 };
 
 struct cfq_rq {
@@ -233,15 +230,77 @@ struct cfq_rq {
 	struct cfq_queue *cfq_queue;
 	struct cfq_io_context *io_context;
 
-	unsigned in_flight : 1;
-	unsigned accounted : 1;
-	unsigned is_sync   : 1;
-	unsigned requeued  : 1;
+	unsigned int crq_flags;
 };
 
-static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int);
+enum cfqq_state_flags {
+	CFQ_CFQQ_FLAG_on_rr = 0,
+	CFQ_CFQQ_FLAG_wait_request,
+	CFQ_CFQQ_FLAG_must_alloc,
+	CFQ_CFQQ_FLAG_must_alloc_slice,
+	CFQ_CFQQ_FLAG_must_dispatch,
+	CFQ_CFQQ_FLAG_fifo_expire,
+	CFQ_CFQQ_FLAG_idle_window,
+	CFQ_CFQQ_FLAG_prio_changed,
+	CFQ_CFQQ_FLAG_expired,
+};
+
+#define CFQ_CFQQ_FNS(name)						\
+static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
+{									\
+	cfqq->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
+}									\
+static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
+{									\
+	cfqq->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
+}									\
+static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
+{									\
+	return (cfqq->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
+}
+
+CFQ_CFQQ_FNS(on_rr);
+CFQ_CFQQ_FNS(wait_request);
+CFQ_CFQQ_FNS(must_alloc);
+CFQ_CFQQ_FNS(must_alloc_slice);
+CFQ_CFQQ_FNS(must_dispatch);
+CFQ_CFQQ_FNS(fifo_expire);
+CFQ_CFQQ_FNS(idle_window);
+CFQ_CFQQ_FNS(prio_changed);
+CFQ_CFQQ_FNS(expired);
+#undef CFQ_CFQQ_FNS
+
+enum cfq_rq_state_flags {
+	CFQ_CRQ_FLAG_in_flight = 0,
+	CFQ_CRQ_FLAG_in_driver,
+	CFQ_CRQ_FLAG_is_sync,
+	CFQ_CRQ_FLAG_requeued,
+};
+
+#define CFQ_CRQ_FNS(name)						\
+static inline void cfq_mark_crq_##name(struct cfq_rq *crq)		\
+{									\
+	crq->crq_flags |= (1 << CFQ_CRQ_FLAG_##name);			\
+}									\
+static inline void cfq_clear_crq_##name(struct cfq_rq *crq)		\
+{									\
+	crq->crq_flags &= ~(1 << CFQ_CRQ_FLAG_##name);			\
+}									\
+static inline int cfq_crq_##name(const struct cfq_rq *crq)		\
+{									\
+	return (crq->crq_flags & (1 << CFQ_CRQ_FLAG_##name)) != 0;	\
+}
+
+CFQ_CRQ_FNS(in_flight);
+CFQ_CRQ_FNS(in_driver);
+CFQ_CRQ_FNS(is_sync);
+CFQ_CRQ_FNS(requeued);
+#undef CFQ_CRQ_FNS
+
+static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *, unsigned int, unsigned short);
 static void cfq_dispatch_sort(request_queue_t *, struct cfq_rq *);
 static void cfq_put_cfqd(struct cfq_data *cfqd);
+static inline int cfq_pending_requests(struct cfq_data *cfqd);
 
 #define process_sync(tsk)	((tsk)->flags & PF_SYNCWRITE)
 
@@ -305,9 +364,9 @@ cfq_choose_req(struct cfq_data *cfqd, struct cfq_rq *crq1, struct cfq_rq *crq2)
 		return crq2;
 	if (crq2 == NULL)
 		return crq1;
-	if (crq1->requeued)
+	if (cfq_crq_requeued(crq1))
 		return crq1;
-	if (crq2->requeued)
+	if (cfq_crq_requeued(crq2))
 		return crq2;
 
 	s1 = crq1->request->sector;
@@ -407,7 +466,7 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
 	struct cfq_data *cfqd = cfqq->cfqd;
 	struct list_head *list, *entry;
 
-	BUG_ON(!cfqq->on_rr);
+	BUG_ON(!cfq_cfqq_on_rr(cfqq));
 
 	list_del(&cfqq->cfq_list);
 
@@ -423,7 +482,7 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
 		 * has lots of io pending vs one that only generates one
 		 * sporadically or synchronously
 		 */
-		if (cfqq->in_flight)
+		if (cfq_cfqq_dispatched(cfqq))
 			list = &cfqd->busy_rr;
 		else
 			list = &cfqd->rr_list[cfqq->ioprio];
@@ -461,8 +520,8 @@ static void cfq_resort_rr_list(struct cfq_queue *cfqq, int preempted)
 static inline void
 cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq, int requeue)
 {
-	BUG_ON(cfqq->on_rr);
-	cfqq->on_rr = 1;
+	BUG_ON(cfq_cfqq_on_rr(cfqq));
+	cfq_mark_cfqq_on_rr(cfqq);
 	cfqd->busy_queues++;
 
 	cfq_resort_rr_list(cfqq, requeue);
@@ -471,8 +530,8 @@ cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq, int requeue)
 static inline void
 cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
-	BUG_ON(!cfqq->on_rr);
-	cfqq->on_rr = 0;
+	BUG_ON(!cfq_cfqq_on_rr(cfqq));
+	cfq_clear_cfqq_on_rr(cfqq);
 	list_move(&cfqq->cfq_list, &cfqd->empty_list);
 
 	BUG_ON(!cfqd->busy_queues);
@@ -488,7 +547,7 @@ static inline void cfq_del_crq_rb(struct cfq_rq *crq)
 
 	if (ON_RB(&crq->rb_node)) {
 		struct cfq_data *cfqd = cfqq->cfqd;
-		const int sync = crq->is_sync;
+		const int sync = cfq_crq_is_sync(crq);
 
 		BUG_ON(!cfqq->queued[sync]);
 		cfqq->queued[sync]--;
@@ -498,7 +557,7 @@ static inline void cfq_del_crq_rb(struct cfq_rq *crq)
 		rb_erase(&crq->rb_node, &cfqq->sort_list);
 		RB_CLEAR_COLOR(&crq->rb_node);
 
-		if (cfqq->on_rr && RB_EMPTY(&cfqq->sort_list))
+		if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY(&cfqq->sort_list))
 			cfq_del_cfqq_rr(cfqd, cfqq);
 	}
 }
@@ -534,7 +593,7 @@ static void cfq_add_crq_rb(struct cfq_rq *crq)
 	struct cfq_rq *__alias;
 
 	crq->rb_key = rq_rb_key(rq);
-	cfqq->queued[crq->is_sync]++;
+	cfqq->queued[cfq_crq_is_sync(crq)]++;
 
 	/*
 	 * looks a little odd, but the first insert might return an alias.
@@ -545,8 +604,8 @@ static void cfq_add_crq_rb(struct cfq_rq *crq)
 
 	rb_insert_color(&crq->rb_node, &cfqq->sort_list);
 
-	if (!cfqq->on_rr)
-		cfq_add_cfqq_rr(cfqd, cfqq, crq->requeued);
+	if (!cfq_cfqq_on_rr(cfqq))
+		cfq_add_cfqq_rr(cfqd, cfqq, cfq_crq_requeued(crq));
 
 	/*
 	 * check if this request is a better next-serve candidate
@@ -559,7 +618,7 @@ cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
 {
 	if (ON_RB(&crq->rb_node)) {
 		rb_erase(&crq->rb_node, &cfqq->sort_list);
-		cfqq->queued[crq->is_sync]--;
+		cfqq->queued[cfq_crq_is_sync(crq)]--;
 	}
 
 	cfq_add_crq_rb(crq);
@@ -568,7 +627,7 @@ cfq_reposition_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
 static struct request *cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
 
 {
-	struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid);
+	struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->pid, CFQ_KEY_ANY);
 	struct rb_node *n;
 
 	if (!cfqq)
@@ -598,17 +657,19 @@ static void cfq_deactivate_request(request_queue_t *q, struct request *rq)
 	if (crq) {
 		struct cfq_queue *cfqq = crq->cfq_queue;
 
-		if (crq->accounted) {
-			crq->accounted = 0;
+		if (cfq_crq_in_driver(crq)) {
+			cfq_clear_crq_in_driver(crq);
 			WARN_ON(!cfqd->rq_in_driver);
 			cfqd->rq_in_driver--;
 		}
-		if (crq->in_flight) {
-			crq->in_flight = 0;
-			WARN_ON(!cfqq->in_flight);
-			cfqq->in_flight--;
+		if (cfq_crq_in_flight(crq)) {
+			const int sync = cfq_crq_is_sync(crq);
+
+			cfq_clear_crq_in_flight(crq);
+			WARN_ON(!cfqq->on_dispatch[sync]);
+			cfqq->on_dispatch[sync]--;
 		}
-		crq->requeued = 1;
+		cfq_mark_crq_requeued(crq);
 	}
 }
 
@@ -712,8 +773,9 @@ __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 		cfqq->slice_start = jiffies;
 		cfqq->slice_end = 0;
 		cfqq->slice_left = 0;
-		cfqq->must_alloc_slice = 0;
-		cfqq->fifo_expire = 0;
+		cfq_clear_cfqq_must_alloc_slice(cfqq);
+		cfq_clear_cfqq_fifo_expire(cfqq);
+		cfq_clear_cfqq_expired(cfqq);
 	}
 
 	cfqd->active_queue = cfqq;
@@ -776,9 +838,18 @@ static int cfq_get_next_prio_level(struct cfq_data *cfqd)
 	return prio;
 }
 
-static void cfq_set_active_queue(struct cfq_data *cfqd)
+static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd)
 {
-	struct cfq_queue *cfqq = NULL;
+	struct cfq_queue *cfqq;
+
+	/*
+	 * if current queue is expired but not done with its requests yet,
+	 * wait for that to happen
+	 */
+	if ((cfqq = cfqd->active_queue) != NULL) {
+		if (cfq_cfqq_expired(cfqq) && cfq_cfqq_dispatched(cfqq))
+			return NULL;
+	}
 
 	/*
 	 * if current list is non-empty, grab first entry. if it is empty,
@@ -802,50 +873,66 @@ static void cfq_set_active_queue(struct cfq_data *cfqd)
 	}
 
 	__cfq_set_active_queue(cfqd, cfqq);
+	return cfqq;
 }
 
 /*
  * current cfqq expired its slice (or was too idle), select new one
  */
-static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted)
+static void
+__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
+		    int preempted)
 {
-	struct cfq_queue *cfqq = cfqd->active_queue;
-
-	if (cfqq) {
-		unsigned long now = jiffies;
+	unsigned long now = jiffies;
 
-		if (cfqq->wait_request)
-			del_timer(&cfqd->idle_slice_timer);
+	if (cfq_cfqq_wait_request(cfqq))
+		del_timer(&cfqd->idle_slice_timer);
 
-		if (!preempted && !cfqq->in_flight)
-			cfqq->service_last = now;
+	if (!preempted && !cfq_cfqq_dispatched(cfqq))
+		cfqq->service_last = now;
 
-		cfqq->must_dispatch = 0;
-		cfqq->wait_request = 0;
+	cfq_clear_cfqq_must_dispatch(cfqq);
+	cfq_clear_cfqq_wait_request(cfqq);
 
-		/*
-		 * store what was left of this slice, if the queue idled out
-		 * or was preempted
-		 */
-		if (time_after(now, cfqq->slice_end))
-			cfqq->slice_left = now - cfqq->slice_end;
-		else
-			cfqq->slice_left = 0;
+	/*
+	 * store what was left of this slice, if the queue idled out
+	 * or was preempted
+	 */
+	if (time_after(now, cfqq->slice_end))
+		cfqq->slice_left = now - cfqq->slice_end;
+	else
+		cfqq->slice_left = 0;
 
-		if (cfqq->on_rr)
-			cfq_resort_rr_list(cfqq, preempted);
+	if (cfq_cfqq_on_rr(cfqq))
+		cfq_resort_rr_list(cfqq, preempted);
 
+	if (cfqq == cfqd->active_queue)
 		cfqd->active_queue = NULL;
 
-		if (cfqd->active_cic) {
-			put_io_context(cfqd->active_cic->ioc);
-			cfqd->active_cic = NULL;
-		}
+	if (cfqd->active_cic) {
+		put_io_context(cfqd->active_cic->ioc);
+		cfqd->active_cic = NULL;
 	}
 
 	cfqd->dispatch_slice = 0;
 }
 
+static inline void cfq_slice_expired(struct cfq_data *cfqd, int preempted)
+{
+	struct cfq_queue *cfqq = cfqd->active_queue;
+
+	if (cfqq) {
+		/*
+		 * use deferred expiry if there are requests in progress, so as
+		 * not to disturb the slice of the next queue
+		 */
+		if (cfq_cfqq_dispatched(cfqq))
+			cfq_mark_cfqq_expired(cfqq);
+		else
+			__cfq_slice_expired(cfqd, cfqq, preempted);
+	}
+}
+
 static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 
 {
@@ -857,7 +944,7 @@ static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	 */
 	if (!cfqd->cfq_slice_idle)
 		return 0;
-	if (!cfqq->idle_window)
+	if (!cfq_cfqq_idle_window(cfqq))
 		return 0;
 	/*
 	 * task has exited, don't wait
@@ -865,13 +952,13 @@ static int cfq_arm_slice_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	if (cfqd->active_cic && !cfqd->active_cic->ioc->task)
 		return 0;
 
-	cfqq->wait_request = 1;
-	cfqq->must_alloc = 1;
+	cfq_mark_cfqq_must_dispatch(cfqq);
+	cfq_mark_cfqq_wait_request(cfqq);
 
 	if (!timer_pending(&cfqd->idle_slice_timer)) {
-		unsigned long slice_left = cfqq->slice_end - 1;
+		unsigned long slice_left = min(cfqq->slice_end - 1, (unsigned long) cfqd->cfq_slice_idle);
 
-		cfqd->idle_slice_timer.expires = min(jiffies + cfqd->cfq_slice_idle, slice_left);
+		cfqd->idle_slice_timer.expires = jiffies + slice_left;
 		add_timer(&cfqd->idle_slice_timer);
 	}
 
@@ -901,7 +988,7 @@ static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq)
 			break;
 		if (!blk_fs_request(__rq))
 			break;
-		if (__crq->requeued)
+		if (cfq_crq_requeued(__crq))
 			break;
 
 		if (__rq->sector <= crq->request->sector)
@@ -920,9 +1007,10 @@ static void cfq_dispatch_sort(request_queue_t *q, struct cfq_rq *crq)
 	cfq_del_crq_rb(crq);
 	cfq_remove_merge_hints(q, crq);
 
-	crq->in_flight = 1;
-	crq->requeued = 0;
-	cfqq->in_flight++;
+	cfq_mark_crq_in_flight(crq);
+	cfq_clear_crq_requeued(crq);
+
+	cfqq->on_dispatch[cfq_crq_is_sync(crq)]++;
 	list_add_tail(&crq->request->queuelist, entry);
 }
 
@@ -935,16 +1023,16 @@ static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
 	struct request *rq;
 	struct cfq_rq *crq;
 
-	if (cfqq->fifo_expire)
+	if (cfq_cfqq_fifo_expire(cfqq))
 		return NULL;
 
 	if (!list_empty(&cfqq->fifo)) {
-		int fifo = cfq_cfqq_sync(cfqq);
+		int fifo = cfq_cfqq_class_sync(cfqq);
 
 		crq = RQ_DATA(list_entry_fifo(cfqq->fifo.next));
 		rq = crq->request;
 		if (time_after(jiffies, rq->start_time + cfqd->cfq_fifo_expire[fifo])) {
-			cfqq->fifo_expire = 1;
+			cfq_mark_cfqq_fifo_expire(cfqq);
 			return crq;
 		}
 	}
@@ -953,7 +1041,9 @@ static inline struct cfq_rq *cfq_check_fifo(struct cfq_queue *cfqq)
 }
 
 /*
- * Scale schedule slice based on io priority
+ * Scale schedule slice based on io priority. Use the sync time slice only
+ * if a queue is marked sync and has sync io queued. A sync queue with async
+ * io only, should not get full sync slice length.
  */
 static inline int
 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
@@ -981,6 +1071,16 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
 }
 
+/*
+ * schedule a run of the queue, if there are requests pending and none in the
+ * driver that will restart queueing
+ */
+static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
+{
+	if (!cfqd->rq_in_driver && cfq_pending_requests(cfqd))
+		kblockd_schedule_work(&cfqd->unplug_work);
+}
+
 /*
  * get next queue for service
  */
@@ -993,11 +1093,14 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd, int force)
 	if (!cfqq)
 		goto new_queue;
 
+	if (cfq_cfqq_expired(cfqq))
+		goto new_queue;
+
 	/*
 	 * slice has expired
 	 */
-	if (!cfqq->must_dispatch && time_after(jiffies, cfqq->slice_end))
-		goto new_queue;
+	if (!cfq_cfqq_must_dispatch(cfqq) && time_after(now, cfqq->slice_end))
+		goto expire;
 
 	/*
 	 * if queue has requests, dispatch one. if not, check if
@@ -1005,17 +1108,18 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd, int force)
 	 */
 	if (!RB_EMPTY(&cfqq->sort_list))
 		goto keep_queue;
-	else if (!force && cfq_cfqq_sync(cfqq) &&
+	else if (!force && cfq_cfqq_class_sync(cfqq) &&
 		 time_before(now, cfqq->slice_end)) {
 		if (cfq_arm_slice_timer(cfqd, cfqq))
 			return NULL;
 	}
 
-new_queue:
+expire:
 	cfq_slice_expired(cfqd, 0);
-	cfq_set_active_queue(cfqd);
+new_queue:
+	cfqq = cfq_set_active_queue(cfqd);
 keep_queue:
-	return cfqd->active_queue;
+	return cfqq;
 }
 
 static int
@@ -1083,8 +1187,8 @@ cfq_dispatch_requests(request_queue_t *q, int max_dispatch, int force)
 
 	cfqq = cfq_select_queue(cfqd, force);
 	if (cfqq) {
-		cfqq->wait_request = 0;
-		cfqq->must_dispatch = 0;
+		cfq_clear_cfqq_must_dispatch(cfqq);
+		cfq_clear_cfqq_wait_request(cfqq);
 		del_timer(&cfqd->idle_slice_timer);
 
 		if (cfq_class_idle(cfqq))
@@ -1108,10 +1212,10 @@ static inline void cfq_account_dispatch(struct cfq_rq *crq)
 	 * accounted bit is necessary since some drivers will call
 	 * elv_next_request() many times for the same request (eg ide)
 	 */
-	if (crq->accounted)
+	if (cfq_crq_in_driver(crq))
 		return;
 
-	crq->accounted = 1;
+	cfq_mark_crq_in_driver(crq);
 	cfqd->rq_in_driver++;
 }
 
@@ -1121,7 +1225,7 @@ cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq)
 	struct cfq_data *cfqd = cfqq->cfqd;
 	unsigned long now;
 
-	if (!crq->accounted)
+	if (!cfq_crq_in_driver(crq))
 		return;
 
 	now = jiffies;
@@ -1132,12 +1236,18 @@ cfq_account_completion(struct cfq_queue *cfqq, struct cfq_rq *crq)
 	if (!cfq_class_idle(cfqq))
 		cfqd->last_end_request = now;
 
-	if (!cfqq->in_flight && cfqq->on_rr) {
-		cfqq->service_last = now;
-		cfq_resort_rr_list(cfqq, 0);
+	if (!cfq_cfqq_dispatched(cfqq)) {
+		if (cfq_cfqq_on_rr(cfqq)) {
+			cfqq->service_last = now;
+			cfq_resort_rr_list(cfqq, 0);
+		}
+		if (cfq_cfqq_expired(cfqq)) {
+			__cfq_slice_expired(cfqd, cfqq, 0);
+			cfq_schedule_dispatch(cfqd);
+		}
 	}
 
-	if (crq->is_sync)
+	if (cfq_crq_is_sync(crq))
 		crq->io_context->last_end_request = now;
 }
 
@@ -1153,10 +1263,13 @@ dispatch:
 
 		crq = RQ_DATA(rq);
 		if (crq) {
+			struct cfq_queue *cfqq = crq->cfq_queue;
+
 			/*
 			 * if idle window is disabled, allow queue buildup
 			 */
-			if (!crq->in_flight && !crq->cfq_queue->idle_window &&
+			if (!cfq_crq_in_driver(crq) &&
+			    !cfq_cfqq_idle_window(cfqq) &&
 			    cfqd->rq_in_driver >= cfqd->cfq_max_depth)
 				return NULL;
 
@@ -1190,11 +1303,11 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 
 	BUG_ON(rb_first(&cfqq->sort_list));
 	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
-	BUG_ON(cfqq->on_rr);
+	BUG_ON(cfq_cfqq_on_rr(cfqq));
 
 	if (unlikely(cfqd->active_queue == cfqq)) {
-		cfq_slice_expired(cfqd, 0);
-		kblockd_schedule_work(&cfqd->unplug_work);
+		__cfq_slice_expired(cfqd, cfqq, 0);
+		cfq_schedule_dispatch(cfqd);
 	}
 
 	cfq_put_cfqd(cfqq->cfqd);
@@ -1208,15 +1321,17 @@ static void cfq_put_queue(struct cfq_queue *cfqq)
 }
 
 static inline struct cfq_queue *
-__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, const int hashval)
+__cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned int prio,
+		    const int hashval)
 {
 	struct hlist_head *hash_list = &cfqd->cfq_hash[hashval];
 	struct hlist_node *entry, *next;
 
 	hlist_for_each_safe(entry, next, hash_list) {
 		struct cfq_queue *__cfqq = list_entry_qhash(entry);
+		const unsigned short __p = IOPRIO_PRIO_VALUE(__cfqq->ioprio_class, __cfqq->ioprio);
 
-		if (__cfqq->key == key)
+		if (__cfqq->key == key && (__p == prio || prio == CFQ_KEY_ANY))
 			return __cfqq;
 	}
 
@@ -1224,9 +1339,9 @@ __cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, const int hashval)
 }
 
 static struct cfq_queue *
-cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key)
+cfq_find_cfq_hash(struct cfq_data *cfqd, unsigned int key, unsigned short prio)
 {
-	return __cfq_find_cfq_hash(cfqd, key, hash_long(key, CFQ_QHASH_SHIFT));
+	return __cfq_find_cfq_hash(cfqd, key, prio, hash_long(key, CFQ_QHASH_SHIFT));
 }
 
 static void cfq_free_io_context(struct cfq_io_context *cic)
@@ -1255,8 +1370,8 @@ static void cfq_exit_single_io_context(struct cfq_io_context *cic)
 	spin_lock(q->queue_lock);
 
 	if (unlikely(cic->cfqq == cfqd->active_queue)) {
-		cfq_slice_expired(cfqd, 0);
-		kblockd_schedule_work(&cfqd->unplug_work);
+		__cfq_slice_expired(cfqd, cic->cfqq, 0);
+		cfq_schedule_dispatch(cfqd);
 	}
 
 	cfq_put_queue(cic->cfqq);
@@ -1313,7 +1428,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq)
 	struct task_struct *tsk = current;
 	int ioprio_class;
 
-	if (!cfqq->prio_changed)
+	if (!cfq_cfqq_prio_changed(cfqq))
 		return;
 
 	ioprio_class = IOPRIO_PRIO_CLASS(tsk->ioprio);
@@ -1338,7 +1453,7 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq)
 		case IOPRIO_CLASS_IDLE:
 			cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
 			cfqq->ioprio = 7;
-			cfqq->idle_window = 0;
+			cfq_clear_cfqq_idle_window(cfqq);
 			break;
 	}
 
@@ -1349,10 +1464,10 @@ static void cfq_init_prio_data(struct cfq_queue *cfqq)
 	cfqq->org_ioprio = cfqq->ioprio;
 	cfqq->org_ioprio_class = cfqq->ioprio_class;
 
-	if (cfqq->on_rr)
+	if (cfq_cfqq_on_rr(cfqq))
 		cfq_resort_rr_list(cfqq, 0);
 
-	cfqq->prio_changed = 0;
+	cfq_clear_cfqq_prio_changed(cfqq);
 }
 
 static inline void changed_ioprio(struct cfq_queue *cfqq)
@@ -1361,7 +1476,7 @@ static inline void changed_ioprio(struct cfq_queue *cfqq)
 		struct cfq_data *cfqd = cfqq->cfqd;
 
 		spin_lock(cfqd->queue->queue_lock);
-		cfqq->prio_changed = 1;
+		cfq_mark_cfqq_prio_changed(cfqq);
 		cfq_init_prio_data(cfqq);
 		spin_unlock(cfqd->queue->queue_lock);
 	}
@@ -1383,13 +1498,14 @@ static int cfq_ioc_set_ioprio(struct io_context *ioc, unsigned int ioprio)
 }
 
 static struct cfq_queue *
-cfq_get_queue(struct cfq_data *cfqd, unsigned int key, int gfp_mask)
+cfq_get_queue(struct cfq_data *cfqd, unsigned int key, unsigned short ioprio,
+	      int gfp_mask)
 {
 	const int hashval = hash_long(key, CFQ_QHASH_SHIFT);
 	struct cfq_queue *cfqq, *new_cfqq = NULL;
 
 retry:
-	cfqq = __cfq_find_cfq_hash(cfqd, key, hashval);
+	cfqq = __cfq_find_cfq_hash(cfqd, key, ioprio, hashval);
 
 	if (!cfqq) {
 		if (new_cfqq) {
@@ -1423,10 +1539,9 @@ retry:
 		 * set ->slice_left to allow preemption for a new process
 		 */
 		cfqq->slice_left = 2 * cfqd->cfq_slice_idle;
-		cfqq->idle_window = 1;
-		cfqq->ioprio = -1;
-		cfqq->ioprio_class = -1;
-		cfqq->prio_changed = 1;
+		cfq_mark_cfqq_idle_window(cfqq);
+		cfq_mark_cfqq_prio_changed(cfqq);
+		cfq_init_prio_data(cfqq);
 	}
 
 	if (new_cfqq)
@@ -1553,7 +1668,7 @@ static void
 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		       struct cfq_io_context *cic)
 {
-	int enable_idle = cfqq->idle_window;
+	int enable_idle = cfq_cfqq_idle_window(cfqq);
 
 	if (!cic->ioc->task || !cfqd->cfq_slice_idle)
 		enable_idle = 0;
@@ -1564,7 +1679,10 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			enable_idle = 1;
 	}
 
-	cfqq->idle_window = enable_idle;
+	if (enable_idle)
+		cfq_mark_cfqq_idle_window(cfqq);
+	else
+		cfq_clear_cfqq_idle_window(cfqq);
 }
 
 
@@ -1586,14 +1704,14 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 
 	if (cfq_class_idle(cfqq))
 		return 1;
-	if (!new_cfqq->wait_request)
+	if (!cfq_cfqq_wait_request(new_cfqq))
 		return 0;
 	/*
 	 * if it doesn't have slice left, forget it
 	 */
 	if (new_cfqq->slice_left < cfqd->cfq_slice_idle)
 		return 0;
-	if (crq->is_sync && !cfq_cfqq_sync(cfqq))
+	if (cfq_crq_is_sync(crq) && !cfq_cfqq_sync(cfqq))
 		return 1;
 
 	return 0;
@@ -1614,7 +1732,7 @@ static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 		cfqq->slice_left = cfq_prio_to_slice(cfqd, cfqq) / 2;
 
 	cfqq->slice_end = cfqq->slice_left + jiffies;
-	cfq_slice_expired(cfqd, 1);
+	__cfq_slice_expired(cfqd, cfqq, 1);
 	__cfq_set_active_queue(cfqd, cfqq);
 }
 
@@ -1639,7 +1757,7 @@ static void
 cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		 struct cfq_rq *crq)
 {
-	const int sync = crq->is_sync;
+	const int sync = cfq_crq_is_sync(crq);
 
 	cfqq->next_crq = cfq_choose_req(cfqd, cfqq->next_crq, crq);
 
@@ -1658,8 +1776,8 @@ cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		 * immediately and flag that we must not expire this queue
 		 * just now
 		 */
-		if (cfqq->wait_request) {
-			cfqq->must_dispatch = 1;
+		if (cfq_cfqq_wait_request(cfqq)) {
+			cfq_mark_cfqq_must_dispatch(cfqq);
 			del_timer(&cfqd->idle_slice_timer);
 			cfq_start_queueing(cfqd, cfqq);
 		}
@@ -1670,7 +1788,7 @@ cfq_crq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		 * has some old slice time left and is of higher priority
 		 */
 		cfq_preempt_queue(cfqd, cfqq);
-		cfqq->must_dispatch = 1;
+		cfq_mark_cfqq_must_dispatch(cfqq);
 		cfq_start_queueing(cfqd, cfqq);
 	}
 }
@@ -1713,7 +1831,7 @@ cfq_insert_request(request_queue_t *q, struct request *rq, int where)
 			 * be kicked by __make_request() afterward.
 			 * Kick it here.
 			 */
-			kblockd_schedule_work(&cfqd->unplug_work);
+			cfq_schedule_dispatch(cfqd);
 			break;
 		case ELEVATOR_INSERT_FRONT:
 			list_add(&rq->queuelist, &q->queue_head);
@@ -1750,9 +1868,11 @@ static void cfq_completed_request(request_queue_t *q, struct request *rq)
 
 	cfqq = crq->cfq_queue;
 
-	if (crq->in_flight) {
-		WARN_ON(!cfqq->in_flight);
-		cfqq->in_flight--;
+	if (cfq_crq_in_flight(crq)) {
+		const int sync = cfq_crq_is_sync(crq);
+
+		WARN_ON(!cfqq->on_dispatch[sync]);
+		cfqq->on_dispatch[sync]--;
 	}
 
 	cfq_account_completion(cfqq, crq);
@@ -1814,7 +1934,7 @@ static void cfq_prio_boost(struct cfq_queue *cfqq)
 	 * refile between round-robin lists if we moved the priority class
 	 */
 	if ((ioprio_class != cfqq->ioprio_class || ioprio != cfqq->ioprio) &&
-	    cfqq->on_rr)
+	    cfq_cfqq_on_rr(cfqq))
 		cfq_resort_rr_list(cfqq, 0);
 }
 
@@ -1830,23 +1950,27 @@ static inline int
 __cfq_may_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		struct task_struct *task, int rw)
 {
-	if (cfqq->wait_request && cfqq->must_alloc)
+#if 1
+	if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
+	    !cfq_cfqq_must_alloc_slice) {
+		cfq_mark_cfqq_must_alloc_slice(cfqq);
 		return ELV_MQUEUE_MUST;
+	}
 
 	return ELV_MQUEUE_MAY;
-#if 0
+#else
 	if (!cfqq || task->flags & PF_MEMALLOC)
 		return ELV_MQUEUE_MAY;
-	if (!cfqq->allocated[rw] || cfqq->must_alloc) {
-		if (cfqq->wait_request)
+	if (!cfqq->allocated[rw] || cfq_cfqq_must_alloc(cfqq)) {
+		if (cfq_cfqq_wait_request(cfqq))
 			return ELV_MQUEUE_MUST;
 
 		/*
 		 * only allow 1 ELV_MQUEUE_MUST per slice, otherwise we
 		 * can quickly flood the queue with writes from a single task
 		 */
-		if (rw == READ || !cfqq->must_alloc_slice) {
-			cfqq->must_alloc_slice = 1;
+		if (rw == READ || !cfq_cfqq_must_alloc_slice) {
+			cfq_mark_cfqq_must_alloc_slice(cfqq);
 			return ELV_MQUEUE_MUST;
 		}
 
@@ -1881,7 +2005,7 @@ static int cfq_may_queue(request_queue_t *q, int rw, struct bio *bio)
 	 * so just lookup a possibly existing queue, or return 'may queue'
 	 * if that fails
 	 */
-	cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw));
+	cfqq = cfq_find_cfq_hash(cfqd, cfq_queue_pid(tsk, rw), tsk->ioprio);
 	if (cfqq) {
 		cfq_init_prio_data(cfqq);
 		cfq_prio_boost(cfqq);
@@ -1943,15 +2067,17 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 		int gfp_mask)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
+	struct task_struct *tsk = current;
 	struct cfq_io_context *cic;
 	const int rw = rq_data_dir(rq);
+	pid_t key = cfq_queue_pid(tsk, rw);
 	struct cfq_queue *cfqq;
 	struct cfq_rq *crq;
 	unsigned long flags;
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
-	cic = cfq_get_io_context(cfqd, cfq_queue_pid(current, rw), gfp_mask);
+	cic = cfq_get_io_context(cfqd, key, gfp_mask);
 
 	spin_lock_irqsave(q->queue_lock, flags);
 
@@ -1959,7 +2085,7 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 		goto queue_fail;
 
 	if (!cic->cfqq) {
-		cfqq = cfq_get_queue(cfqd, current->pid, gfp_mask);
+		cfqq = cfq_get_queue(cfqd, key, tsk->ioprio, gfp_mask);
 		if (!cfqq)
 			goto queue_fail;
 
@@ -1968,7 +2094,7 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 		cfqq = cic->cfqq;
 
 	cfqq->allocated[rw]++;
-	cfqq->must_alloc = 0;
+	cfq_clear_cfqq_must_alloc(cfqq);
 	cfqd->rq_starved = 0;
 	atomic_inc(&cfqq->ref);
 	spin_unlock_irqrestore(q->queue_lock, flags);
@@ -1981,9 +2107,15 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 		INIT_HLIST_NODE(&crq->hash);
 		crq->cfq_queue = cfqq;
 		crq->io_context = cic;
-		crq->in_flight = crq->accounted = 0;
-		crq->is_sync = (rw == READ || process_sync(current));
-		crq->requeued = 0;
+		cfq_clear_crq_in_flight(crq);
+		cfq_clear_crq_in_driver(crq);
+		cfq_clear_crq_requeued(crq);
+
+		if (rw == READ || process_sync(tsk))
+			cfq_mark_crq_is_sync(crq);
+		else
+			cfq_clear_crq_is_sync(crq);
+
 		rq->elevator_private = crq;
 		return 0;
 	}
@@ -1991,7 +2123,7 @@ cfq_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
 	spin_lock_irqsave(q->queue_lock, flags);
 	cfqq->allocated[rw]--;
 	if (!(cfqq->allocated[0] + cfqq->allocated[1]))
-		cfqq->must_alloc = 1;
+		cfq_mark_cfqq_must_alloc(cfqq);
 	cfq_put_queue(cfqq);
 queue_fail:
 	if (cic)
@@ -2002,7 +2134,7 @@ queue_fail:
 	 * that would be an extremely rare OOM situation
 	 */
 	cfqd->rq_starved = 1;
-	kblockd_schedule_work(&cfqd->unplug_work);
+	cfq_schedule_dispatch(cfqd);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 	return 1;
 }
@@ -2068,15 +2200,14 @@ static void cfq_idle_slice_timer(unsigned long data)
 		 * not expired and it has a request pending, let it dispatch
 		 */
 		if (!RB_EMPTY(&cfqq->sort_list)) {
-			cfqq->must_dispatch = 1;
+			cfq_mark_cfqq_must_dispatch(cfqq);
 			goto out_kick;
 		}
 	}
 expire:
 	cfq_slice_expired(cfqd, 0);
 out_kick:
-	if (cfq_pending_requests(cfqd))
-		kblockd_schedule_work(&cfqd->unplug_work);
+	cfq_schedule_dispatch(cfqd);
 out_cont:
 	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
 }
@@ -2099,11 +2230,17 @@ static void cfq_idle_class_timer(unsigned long data)
 		cfqd->idle_class_timer.expires = end;
 		add_timer(&cfqd->idle_class_timer);
 	} else
-		kblockd_schedule_work(&cfqd->unplug_work);
+		cfq_schedule_dispatch(cfqd);
 
 	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
 }
 
+static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
+{
+	del_timer_sync(&cfqd->idle_slice_timer);
+	del_timer_sync(&cfqd->idle_class_timer);
+	blk_sync_queue(cfqd->queue);
+}
 
 static void cfq_put_cfqd(struct cfq_data *cfqd)
 {
@@ -2112,7 +2249,7 @@ static void cfq_put_cfqd(struct cfq_data *cfqd)
 	if (!atomic_dec_and_test(&cfqd->ref))
 		return;
 
-	blk_sync_queue(q);
+	cfq_shutdown_timer_wq(cfqd);
 
 	blk_put_queue(q);
 
@@ -2126,8 +2263,7 @@ static void cfq_exit_queue(elevator_t *e)
 {
 	struct cfq_data *cfqd = e->elevator_data;
 
-	del_timer_sync(&cfqd->idle_slice_timer);
-	del_timer_sync(&cfqd->idle_class_timer);
+	cfq_shutdown_timer_wq(cfqd);
 	cfq_put_cfqd(cfqd);
 }
 
@@ -2198,6 +2334,7 @@ static int cfq_init_queue(request_queue_t *q, elevator_t *e)
 	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
 	cfqd->cfq_slice_idle = cfq_slice_idle;
 	cfqd->cfq_max_depth = cfq_max_depth;
+
 	return 0;
 out_crqpool:
 	kfree(cfqd->cfq_hash);
@@ -2369,6 +2506,7 @@ static struct cfq_fs_entry cfq_max_depth_entry = {
 	.show = cfq_max_depth_show,
 	.store = cfq_max_depth_store,
 };
+
 static struct attribute *default_attrs[] = {
 	&cfq_quantum_entry.attr,
 	&cfq_queued_entry.attr,
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 7811300d88ee..8a453a0b5e4b 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -12,6 +12,7 @@
 
 #define IOPRIO_PRIO_CLASS(mask)	((mask) >> IOPRIO_CLASS_SHIFT)
 #define IOPRIO_PRIO_DATA(mask)	((mask) & IOPRIO_PRIO_MASK)
+#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | data)
 
 #define ioprio_valid(mask)	(IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE)
 
-- 
cgit v1.2.3-59-g8ed1b


From 4808a1c0261176f9c7e28e7f108d41a381a7d0fc Mon Sep 17 00:00:00 2001
From: Olav Kongas <ok@artecdesign.ee>
Date: Sat, 9 Apr 2005 22:57:39 +0300
Subject: [PATCH] USB: Add isp116x-hcd USB host controller driver

This patch provides an "isp116x-hcd" driver for Philips'
ISP1160/ISP1161 USB host controllers.

The driver:
 - is relatively small, meant for use on embedded platforms.
 - runs usbtests 1-14 without problems for days.
 - has been in use by 6-7 different people on ARM and PPC platforms,
   running a range of devices including USB hubs.
 - supports suspend/resume of both the platform device and the root hub;
   supports remote wakeup of the root hub (but NOT the platform device)
   by USB devices.
 - does NOT support ISO transfers (nobody has asked for them).
 - is PIO-only.

Signed-off-by: Olav Kongas <ok@artecdesign.ee>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/Makefile           |    1 +
 drivers/usb/host/Kconfig       |   13 +
 drivers/usb/host/Makefile      |    1 +
 drivers/usb/host/isp116x-hcd.c | 1882 ++++++++++++++++++++++++++++++++++++++++
 drivers/usb/host/isp116x.h     |  583 +++++++++++++
 include/linux/usb_isp116x.h    |   47 +
 6 files changed, 2527 insertions(+)
 create mode 100644 drivers/usb/host/isp116x-hcd.c
 create mode 100644 drivers/usb/host/isp116x.h
 create mode 100644 include/linux/usb_isp116x.h

(limited to 'include/linux')

diff --git a/drivers/usb/Makefile b/drivers/usb/Makefile
index a61d4433a989..c149c06388be 100644
--- a/drivers/usb/Makefile
+++ b/drivers/usb/Makefile
@@ -9,6 +9,7 @@ obj-$(CONFIG_USB)		+= core/
 obj-$(CONFIG_USB_MON)		+= mon/
 
 obj-$(CONFIG_USB_EHCI_HCD)	+= host/
+obj-$(CONFIG_USB_ISP116X_HCD)	+= host/
 obj-$(CONFIG_USB_OHCI_HCD)	+= host/
 obj-$(CONFIG_USB_UHCI_HCD)	+= host/
 obj-$(CONFIG_USB_SL811_HCD)	+= host/
diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig
index 19e598c9641f..ed1899d307db 100644
--- a/drivers/usb/host/Kconfig
+++ b/drivers/usb/host/Kconfig
@@ -49,6 +49,19 @@ config USB_EHCI_ROOT_HUB_TT
 
 	  This supports the EHCI implementation from TransDimension Inc.
 
+config USB_ISP116X_HCD
+	tristate "ISP116X HCD support"
+	depends on USB
+	default N
+	---help---
+	  The ISP1160 and ISP1161 chips are USB host controllers. Enable this
+	  option if your board has this chip. If unsure, say N.
+
+	  This driver does not support isochronous transfers.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called isp116x-hcd.
+
 config USB_OHCI_HCD
 	tristate "OHCI HCD support"
 	depends on USB && USB_ARCH_HAS_OHCI
diff --git a/drivers/usb/host/Makefile b/drivers/usb/host/Makefile
index 5dbd3e7a27c7..350d14fc1cc9 100644
--- a/drivers/usb/host/Makefile
+++ b/drivers/usb/host/Makefile
@@ -4,6 +4,7 @@
 #
 
 obj-$(CONFIG_USB_EHCI_HCD)	+= ehci-hcd.o
+obj-$(CONFIG_USB_ISP116X_HCD)	+= isp116x-hcd.o
 obj-$(CONFIG_USB_OHCI_HCD)	+= ohci-hcd.o
 obj-$(CONFIG_USB_UHCI_HCD)	+= uhci-hcd.o
 obj-$(CONFIG_USB_SL811_HCD)	+= sl811-hcd.o
diff --git a/drivers/usb/host/isp116x-hcd.c b/drivers/usb/host/isp116x-hcd.c
new file mode 100644
index 000000000000..69e7433d9ce8
--- /dev/null
+++ b/drivers/usb/host/isp116x-hcd.c
@@ -0,0 +1,1882 @@
+/*
+ * ISP116x HCD (Host Controller Driver) for USB.
+ *
+ * Derived from the SL811 HCD, rewritten for ISP116x.
+ * Copyright (C) 2005 Olav Kongas <ok@artecdesign.ee>
+ *
+ * Portions:
+ * Copyright (C) 2004 Psion Teklogix (for NetBook PRO)
+ * Copyright (C) 2004 David Brownell
+ *
+ * Periodic scheduling is based on Roman's OHCI code
+ * Copyright (C) 1999 Roman Weissgaerber
+ *
+ */
+
+/*
+ * The driver basically works. A number of people have used it with a range
+ * of devices.
+ *
+ * The driver passes all usbtests 1-14.
+ *
+ * Suspending/resuming of root hub via sysfs works. Remote wakeup works too.
+ * And suspending/resuming of platform device works too. Suspend/resume
+ * via HCD operations vector is not implemented.
+ *
+ * Iso transfer support is not implemented. Adding this would include
+ * implementing recovery from the failure to service the processed ITL
+ * fifo ram in time, which will involve chip reset.
+ *
+ * TODO:
+ * + More testing of suspend/resume.
+ */
+
+/*
+  ISP116x chips require certain delays between accesses to its
+  registers. The following timing options exist.
+
+  1. Configure your memory controller (the best)
+  2. Implement platform-specific delay function possibly
+  combined with configuring the memory controller; see
+  include/linux/usb_isp116x.h for more info. Some broken
+  memory controllers like LH7A400 SMC need this. For that
+  to work, also uncomment the following
+  USE_PLATFORM_DELAY macro.
+  3. Use ndelay (easiest, poorest). For that, uncomment
+  the following USE_NDELAY macro.
+*/
+#define USE_PLATFORM_DELAY
+//#define USE_NDELAY
+
+//#define DEBUG
+//#define VERBOSE
+/* Transfer descriptors. See dump_ptd() for printout format  */
+//#define PTD_TRACE
+/* enqueuing/finishing log of urbs */
+//#define URB_TRACE
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/ioport.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/smp_lock.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/interrupt.h>
+#include <linux/usb.h>
+#include <linux/usb_isp116x.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/system.h>
+#include <asm/byteorder.h>
+
+#ifndef DEBUG
+#	define	STUB_DEBUG_FILE
+#endif
+
+#include "../core/hcd.h"
+#include "isp116x.h"
+
+#define DRIVER_VERSION	"08 Apr 2005"
+#define DRIVER_DESC	"ISP116x USB Host Controller Driver"
+
+MODULE_DESCRIPTION(DRIVER_DESC);
+MODULE_LICENSE("GPL");
+
+static const char hcd_name[] = "isp116x-hcd";
+
+/*-----------------------------------------------------------------*/
+
+/*
+  Push len bytes from buf into the fifo as 16-bit words and pad
+  the written amount up to a 32-bit boundary.
+
+  A buffer that is not 16-bit aligned is assembled byte by byte
+  (low byte first); an aligned buffer is written word by word.
+  A trailing odd byte goes out as the low byte of one more word.
+ */
+static void write_ptddata_to_fifo(struct isp116x *isp116x, void *buf, int len)
+{
+	u8 *bytes = (u8 *) buf;
+	u16 *words = (u16 *) buf;
+	int tail = len % 4;
+	u16 val;
+
+	if (!((unsigned long)words & 1)) {
+		/* 16-bit aligned source */
+		while (len > 1) {
+			isp116x_raw_write_data16(isp116x, *words++);
+			len -= 2;
+		}
+		if (len)
+			isp116x_write_data16(isp116x, 0xff & *((u8 *) words));
+	} else {
+		/* odd address: build each word from two bytes */
+		while (len > 1) {
+			val = *bytes++;
+			val |= *bytes++ << 8;
+			isp116x_raw_write_data16(isp116x, val);
+			len -= 2;
+		}
+		if (len)
+			isp116x_write_data16(isp116x, (u16) * bytes);
+	}
+	/* 1 or 2 bytes past a 32-bit boundary: one more 16-bit word of padding */
+	if (tail == 1 || tail == 2)
+		isp116x_raw_write_data16(isp116x, 0);
+}
+
+/*
+  Pull len bytes out of the fifo into buf, then keep reading
+  until a 32-bit boundary has been consumed.
+
+  A buffer that is not 16-bit aligned is filled byte by byte
+  (low byte first); an aligned buffer is filled word by word.
+  For an odd len only the low byte of the last word is stored.
+ */
+static void read_ptddata_from_fifo(struct isp116x *isp116x, void *buf, int len)
+{
+	u8 *bytes = (u8 *) buf;
+	u16 *words = (u16 *) buf;
+	int tail = len % 4;
+	u16 val;
+
+	if (!((unsigned long)words & 1)) {
+		/* 16-bit aligned destination */
+		while (len > 1) {
+			*words++ = isp116x_raw_read_data16(isp116x);
+			len -= 2;
+		}
+		if (len)
+			*(u8 *) words = 0xff & isp116x_read_data16(isp116x);
+	} else {
+		/* odd address: split each word into two bytes */
+		while (len > 1) {
+			val = isp116x_raw_read_data16(isp116x);
+			*bytes++ = val & 0xff;
+			*bytes++ = (val >> 8) & 0xff;
+			len -= 2;
+		}
+		if (len)
+			*bytes = 0xff & isp116x_read_data16(isp116x);
+	}
+	/* 1 or 2 bytes past a 32-bit boundary: discard one padding word */
+	if (tail == 1 || tail == 2)
+		isp116x_raw_read_data16(isp116x);
+}
+
+/*
+  Write ptd's and data for scheduled transfers into
+  the fifo ram. Fifo must be empty and ready.
+
+  The transfer counter is programmed with atl_bufshrt when the
+  last PTD is an IN transfer (its data area is not written),
+  otherwise with atl_buflen. The final BUG_ON() checks that exactly
+  that many bytes have been accounted for.
+*/
+static void pack_fifo(struct isp116x *isp116x)
+{
+	struct isp116x_ep *ep;
+	struct ptd *ptd;
+	int buflen = isp116x->atl_last_dir == PTD_DIR_IN
+	    ? isp116x->atl_bufshrt : isp116x->atl_buflen;
+
+	isp116x_write_reg16(isp116x, HCuPINT, HCuPINT_AIIEOT);
+	isp116x_write_reg16(isp116x, HCXFERCTR, buflen);
+	isp116x_write_addr(isp116x, HCATLPORT | ISP116x_WRITE_OFFSET);
+	for (ep = isp116x->atl_active; ep; ep = ep->active) {
+		ptd = &ep->ptd;
+		dump_ptd(ptd);
+		dump_ptd_out_data(ptd, ep->data);
+		/* The four 16-bit PTD header words, in fifo order */
+		isp116x_write_data16(isp116x, ptd->count);
+		isp116x_write_data16(isp116x, ptd->mps);
+		isp116x_write_data16(isp116x, ptd->len);
+		isp116x_write_data16(isp116x, ptd->faddr);
+		buflen -= sizeof(struct ptd);
+		/* Skip writing data for last IN PTD */
+		if (ep->active || (isp116x->atl_last_dir != PTD_DIR_IN)) {
+			write_ptddata_to_fifo(isp116x, ep->data, ep->length);
+			buflen -= ALIGN(ep->length, 4);
+		}
+	}
+	BUG_ON(buflen);
+}
+
+/*
+  Read the processed ptd's and data back from the fifo ram into
+  the URBs' buffers. Fifo must be full and done.
+
+  The data of the last PTD is read back only when that PTD was an
+  IN transfer, which is why the transfer length is atl_buflen in
+  that case and atl_bufshrt otherwise.
+*/
+static void unpack_fifo(struct isp116x *isp116x)
+{
+	struct isp116x_ep *cur;
+	struct ptd *desc;
+	int remaining;
+
+	if (isp116x->atl_last_dir == PTD_DIR_IN)
+		remaining = isp116x->atl_buflen;
+	else
+		remaining = isp116x->atl_bufshrt;
+
+	isp116x_write_reg16(isp116x, HCuPINT, HCuPINT_AIIEOT);
+	isp116x_write_reg16(isp116x, HCXFERCTR, remaining);
+	isp116x_write_addr(isp116x, HCATLPORT);
+
+	cur = isp116x->atl_active;
+	while (cur) {
+		desc = &cur->ptd;
+		/* The four 16-bit PTD header words, in fifo order */
+		desc->count = isp116x_read_data16(isp116x);
+		desc->mps = isp116x_read_data16(isp116x);
+		desc->len = isp116x_read_data16(isp116x);
+		desc->faddr = isp116x_read_data16(isp116x);
+		remaining -= sizeof(struct ptd);
+		/* No data follows the last PTD if it was Setup or Out */
+		if (cur->active || (isp116x->atl_last_dir == PTD_DIR_IN)) {
+			read_ptddata_from_fifo(isp116x, cur->data, cur->length);
+			remaining -= ALIGN(cur->length, 4);
+		}
+		dump_ptd(desc);
+		dump_ptd_in_data(desc, cur->data);
+		cur = cur->active;
+	}
+	BUG_ON(remaining);
+}
+
+/*---------------------------------------------------------------*/
+
+/*
+  Set up PTD's.
+
+  For every endpoint on the active ATL chain, build its PTD from the
+  URB at the head of the endpoint's queue and from ep->nextpid, and
+  accumulate the fifo transfer sizes: after the loop atl_buflen is the
+  size of all PTDs plus their 32-bit aligned data, while atl_bufshrt is
+  that size without the data of the last PTD. The direction of the
+  last PTD is remembered in atl_last_dir.
+*/
+static void preproc_atl_queue(struct isp116x *isp116x)
+{
+	struct isp116x_ep *ep;
+	struct urb *urb;
+	struct ptd *ptd;
+	u16 toggle, dir, len;
+
+	for (ep = isp116x->atl_active; ep; ep = ep->active) {
+		BUG_ON(list_empty(&ep->hep->urb_list));
+		urb = container_of(ep->hep->urb_list.next,
+				   struct urb, urb_list);
+		ptd = &ep->ptd;
+		len = ep->length;
+		spin_lock(&urb->lock);
+		ep->data = (unsigned char *)urb->transfer_buffer
+		    + urb->actual_length;	/* continue where the URB left off */
+
+		switch (ep->nextpid) {
+		case USB_PID_IN:
+			toggle = usb_gettoggle(urb->dev, ep->epnum, 0);
+			dir = PTD_DIR_IN;
+			break;
+		case USB_PID_OUT:
+			toggle = usb_gettoggle(urb->dev, ep->epnum, 1);
+			dir = PTD_DIR_OUT;
+			break;
+		case USB_PID_SETUP:
+			toggle = 0;
+			dir = PTD_DIR_SETUP;
+			len = sizeof(struct usb_ctrlrequest);
+			ep->data = urb->setup_packet;	/* not the data buffer */
+			break;
+		case USB_PID_ACK:	/* zero-length status stage */
+			toggle = 1;
+			len = 0;
+			dir = (urb->transfer_buffer_length
+			       && usb_pipein(urb->pipe))
+			    ? PTD_DIR_OUT : PTD_DIR_IN;
+			break;
+		default:
+			/* To please gcc */
+			toggle = dir = 0;
+			ERR("%s %d: ep->nextpid %d\n", __func__, __LINE__,
+			    ep->nextpid);
+			BUG_ON(1);
+		}
+
+		ptd->count = PTD_CC_MSK | PTD_ACTIVE_MSK | PTD_TOGGLE(toggle);
+		ptd->mps = PTD_MPS(ep->maxpacket)
+		    | PTD_SPD(urb->dev->speed == USB_SPEED_LOW)
+		    | PTD_EP(ep->epnum);
+		ptd->len = PTD_LEN(len) | PTD_DIR(dir);
+		ptd->faddr = PTD_FA(usb_pipedevice(urb->pipe));
+		spin_unlock(&urb->lock);
+		if (!ep->active) {	/* tail of the chain */
+			ptd->mps |= PTD_LAST_MSK;
+			isp116x->atl_last_dir = dir;
+		}
+		isp116x->atl_bufshrt = sizeof(struct ptd) + isp116x->atl_buflen;
+		isp116x->atl_buflen = isp116x->atl_bufshrt + ALIGN(len, 4);
+	}
+}
+
+/*
+  Analyze transfer results, handle partial transfers and errors.
+
+  For each endpoint on the active ATL chain, look at the completion
+  code of its PTD, update the data toggle and urb->actual_length,
+  advance ep->nextpid through the stages of a control transfer and
+  set urb->status once the URB has finished or failed. URBs are not
+  given back here; a status other than -EINPROGRESS marks them for
+  the caller.
+*/
+static void postproc_atl_queue(struct isp116x *isp116x)
+{
+	struct isp116x_ep *ep;
+	struct urb *urb;
+	struct usb_device *udev;
+	struct ptd *ptd;
+	int short_not_ok;
+	u8 cc;
+
+	for (ep = isp116x->atl_active; ep; ep = ep->active) {
+		BUG_ON(list_empty(&ep->hep->urb_list));
+		urb =
+		    container_of(ep->hep->urb_list.next, struct urb, urb_list);
+		udev = urb->dev;
+		ptd = &ep->ptd;
+		cc = PTD_GET_CC(ptd);
+
+		spin_lock(&urb->lock);
+		short_not_ok = 1;
+
+		/* Data underrun is special. For allowed underrun
+		   we clear the error and continue as normal. For
+		   forbidden underrun we finish the DATA stage
+		   immediately while for control transfer,
+		   we do a STATUS stage. */
+		if (cc == TD_DATAUNDERRUN) {
+			if (!(urb->transfer_flags & URB_SHORT_NOT_OK)) {
+				DBG("Allowed data underrun\n");
+				cc = TD_CC_NOERROR;
+				short_not_ok = 0;
+			} else {
+				ep->error_count = 1;
+				if (usb_pipecontrol(urb->pipe))
+					ep->nextpid = USB_PID_ACK;
+				else
+					usb_settoggle(udev, ep->epnum,
+						      ep->nextpid ==
+						      USB_PID_OUT,
+						      PTD_GET_TOGGLE(ptd) ^ 1);
+				urb->status = cc_to_error[TD_DATAUNDERRUN];
+				spin_unlock(&urb->lock);
+				continue;
+			}
+		}
+		/* Keep underrun error through the STATUS stage */
+		if (urb->status == cc_to_error[TD_DATAUNDERRUN])
+			cc = TD_DATAUNDERRUN;
+
+		/* Give up on stall, on data overrun, or on the third
+		   counted error in a row; otherwise the transfer is
+		   retried */
+		if (cc != TD_CC_NOERROR && cc != TD_NOTACCESSED
+		    && (++ep->error_count >= 3 || cc == TD_CC_STALL
+			|| cc == TD_DATAOVERRUN)) {
+			if (urb->status == -EINPROGRESS)
+				urb->status = cc_to_error[cc];
+			if (ep->nextpid == USB_PID_ACK)
+				ep->nextpid = 0;
+			spin_unlock(&urb->lock);
+			continue;
+		}
+		/* According to usb spec, zero-length Int transfer signals
+		   finishing of the urb. Hey, does this apply only
+		   for IN endpoints? */
+		if (usb_pipeint(urb->pipe) && !PTD_GET_LEN(ptd)) {
+			if (urb->status == -EINPROGRESS)
+				urb->status = 0;
+			spin_unlock(&urb->lock);
+			continue;
+		}
+
+		/* Relax after previously failed, but later succeeded
+		   or correctly NAK'ed retransmission attempt */
+		if (ep->error_count
+		    && (cc == TD_CC_NOERROR || cc == TD_NOTACCESSED))
+			ep->error_count = 0;
+
+		/* Take into account idiosyncrasies of the isp116x chip
+		   regarding toggle bit for failed transfers */
+		if (ep->nextpid == USB_PID_OUT)
+			usb_settoggle(udev, ep->epnum, 1, PTD_GET_TOGGLE(ptd)
+				      ^ (ep->error_count > 0));
+		else if (ep->nextpid == USB_PID_IN)
+			usb_settoggle(udev, ep->epnum, 0, PTD_GET_TOGGLE(ptd)
+				      ^ (ep->error_count > 0));
+
+		switch (ep->nextpid) {
+		case USB_PID_IN:
+		case USB_PID_OUT:
+			urb->actual_length += PTD_GET_COUNT(ptd);
+			if (PTD_GET_ACTIVE(ptd)
+			    || (cc != TD_CC_NOERROR && cc < 0x0E))
+				break;
+			if (urb->transfer_buffer_length != urb->actual_length) {
+				if (short_not_ok)
+					break;	/* more data still to move */
+			} else {
+				if (urb->transfer_flags & URB_ZERO_PACKET
+				    && ep->nextpid == USB_PID_OUT
+				    && !(PTD_GET_COUNT(ptd) % ep->maxpacket)) {
+					DBG("Zero packet requested\n");
+					break;
+				}
+			}
+			/* All data for this URB is transferred, let's finish */
+			if (usb_pipecontrol(urb->pipe))
+				ep->nextpid = USB_PID_ACK;
+			else if (urb->status == -EINPROGRESS)
+				urb->status = 0;
+			break;
+		case USB_PID_SETUP:
+			if (PTD_GET_ACTIVE(ptd)
+			    || (cc != TD_CC_NOERROR && cc < 0x0E))
+				break;
+			if (urb->transfer_buffer_length == urb->actual_length)
+				ep->nextpid = USB_PID_ACK;	/* no data stage */
+			else if (usb_pipeout(urb->pipe)) {
+				usb_settoggle(udev, 0, 1, 1);
+				ep->nextpid = USB_PID_OUT;
+			} else {
+				usb_settoggle(udev, 0, 0, 1);
+				ep->nextpid = USB_PID_IN;
+			}
+			break;
+		case USB_PID_ACK:
+			if (PTD_GET_ACTIVE(ptd)
+			    || (cc != TD_CC_NOERROR && cc < 0x0E))
+				break;
+			if (urb->status == -EINPROGRESS)
+				urb->status = 0;
+			ep->nextpid = 0;
+			break;
+		default:
+			BUG_ON(1);
+		}
+		spin_unlock(&urb->lock);
+	}
+}
+
+/*
+  Take done or failed requests out of schedule. Give back
+  processed urbs.
+
+  isp116x->lock is dropped around usb_hcd_giveback_urb() and retaken
+  afterwards. If the endpoint still has URBs queued after the
+  giveback, it stays scheduled. Otherwise it is removed from the
+  async list or, for a periodic endpoint, unlinked from every
+  periodic branch it occupies, and its load and bandwidth are
+  released. regs is only passed through to the giveback.
+*/
+static void finish_request(struct isp116x *isp116x, struct isp116x_ep *ep,
+			   struct urb *urb, struct pt_regs *regs)
+__releases(isp116x->lock) __acquires(isp116x->lock)
+{
+	unsigned i;
+
+	urb->hcpriv = NULL;
+	ep->error_count = 0;
+
+	if (usb_pipecontrol(urb->pipe))
+		ep->nextpid = USB_PID_SETUP;	/* next control URB starts over */
+
+	urb_dbg(urb, "Finish");
+
+	spin_unlock(&isp116x->lock);
+	usb_hcd_giveback_urb(isp116x_to_hcd(isp116x), urb, regs);
+	spin_lock(&isp116x->lock);
+
+	/* take idle endpoints out of the schedule */
+	if (!list_empty(&ep->hep->urb_list))
+		return;
+
+	/* async deschedule */
+	if (!list_empty(&ep->schedule)) {
+		list_del_init(&ep->schedule);
+		return;
+	}
+
+	/* periodic deschedule */
+	DBG("deschedule qh%d/%p branch %d\n", ep->period, ep, ep->branch);
+	for (i = ep->branch; i < PERIODIC_SIZE; i += ep->period) {
+		struct isp116x_ep *temp;
+		struct isp116x_ep **prev = &isp116x->periodic[i];
+
+		while (*prev && ((temp = *prev) != ep))
+			prev = &temp->next;
+		if (*prev)
+			*prev = ep->next;
+		isp116x->load[i] -= ep->load;
+	}
+	ep->branch = PERIODIC_SIZE;	/* marks the endpoint as unscheduled */
+	isp116x_to_hcd(isp116x)->self.bandwidth_allocated -=
+	    ep->load / ep->period;
+
+	/* switch irq type? */
+	if (!--isp116x->periodic_count) {
+		isp116x->irqenb &= ~HCuPINT_SOF;
+		isp116x->irqenb |= HCuPINT_ATL;
+	}
+}
+
+/*
+  Scan transfer lists, schedule transfers, send data off
+  to chip.
+
+  Does nothing while finish_atl_transfers() is in progress, while the
+  HC is not running, or while the ATL fifo is still full. Otherwise
+  builds the atl_active chain: first the interrupt endpoints of the
+  next periodic frame slot, then control/bulk endpoints from the async
+  list for as long as the accumulated bus load stays within
+  MAX_LOAD_LIMIT. If anything got scheduled, the PTDs are prepared and
+  written to the fifo.
+ */
+static void start_atl_transfers(struct isp116x *isp116x)
+{
+	struct isp116x_ep *last_ep = NULL, *ep;
+	struct urb *urb;
+	u16 load = 0;
+	int len, index, speed, byte_time;
+
+	if (atomic_read(&isp116x->atl_finishing))
+		return;
+
+	if (!HC_IS_RUNNING(isp116x_to_hcd(isp116x)->state))
+		return;
+
+	/* FIFO not empty? */
+	if (isp116x_read_reg16(isp116x, HCBUFSTAT) & HCBUFSTAT_ATL_FULL)
+		return;
+
+	isp116x->atl_active = NULL;
+	isp116x->atl_buflen = isp116x->atl_bufshrt = 0;
+
+	/* Schedule int transfers */
+	if (isp116x->periodic_count) {
+		isp116x->fmindex = index =
+		    (isp116x->fmindex + 1) & (PERIODIC_SIZE - 1);
+		if ((load = isp116x->load[index])) {
+			/* Bring all int transfers for this frame
+			   into the active queue */
+			isp116x->atl_active = last_ep =
+			    isp116x->periodic[index];
+			while (last_ep->next)
+				last_ep = (last_ep->active = last_ep->next);
+			last_ep->active = NULL;
+		}
+	}
+
+	/* Schedule control/bulk transfers */
+	list_for_each_entry(ep, &isp116x->async, schedule) {
+		urb = container_of(ep->hep->urb_list.next,
+				   struct urb, urb_list);
+		speed = urb->dev->speed;
+		byte_time = speed == USB_SPEED_LOW
+		    ? BYTE_TIME_LOWSPEED : BYTE_TIME_FULLSPEED;
+
+		if (ep->nextpid == USB_PID_SETUP) {
+			len = sizeof(struct usb_ctrlrequest);
+		} else if (ep->nextpid == USB_PID_ACK) {
+			len = 0;
+		} else {
+			/* Find current free length ... */
+			len = (MAX_LOAD_LIMIT - load) / byte_time;
+
+			/* ... then limit it to configured max size ... */
+			len = min(len, speed == USB_SPEED_LOW ?
+				  MAX_TRANSFER_SIZE_LOWSPEED :
+				  MAX_TRANSFER_SIZE_FULLSPEED);
+
+			/* ... and finally cut to the multiple of MaxPacketSize,
+			   or to the real length if there's enough room. */
+			if (len <
+			    (urb->transfer_buffer_length -
+			     urb->actual_length)) {
+				len -= len % ep->maxpacket;
+				if (!len)
+					continue;	/* not even one packet fits */
+			} else
+				len = urb->transfer_buffer_length -
+				    urb->actual_length;
+			BUG_ON(len < 0);
+		}
+
+		load += len * byte_time;
+		if (load > MAX_LOAD_LIMIT)
+			break;
+
+		/* Append the endpoint to the tail of the active chain */
+		ep->active = NULL;
+		ep->length = len;
+		if (last_ep)
+			last_ep->active = ep;
+		else
+			isp116x->atl_active = ep;
+		last_ep = ep;
+	}
+
+	/* Avoid starving of endpoints: when the async list has more than
+	   one entry, move the list head past its first entry so that
+	   the next scan starts with a different endpoint */
+	if ((&isp116x->async)->next != (&isp116x->async)->prev)
+		list_move(&isp116x->async, (&isp116x->async)->next);
+
+	if (isp116x->atl_active) {
+		preproc_atl_queue(isp116x);
+		pack_fifo(isp116x);
+	}
+}
+
+/*
+  Finish the processed transfers: read the results back from the
+  fifo, post-process them and give back every URB that is done.
+  atl_finishing is raised for the duration so that
+  start_atl_transfers() does not rebuild the active chain meanwhile.
+*/
+static void finish_atl_transfers(struct isp116x *isp116x, struct pt_regs *regs)
+{
+	struct isp116x_ep *ep;
+	struct urb *urb;
+	u16 bufstat;
+
+	if (!isp116x->atl_active)
+		return;
+
+	/* Fifo not ready? */
+	bufstat = isp116x_read_reg16(isp116x, HCBUFSTAT);
+	if (!(bufstat & HCBUFSTAT_ATL_DONE))
+		return;
+
+	atomic_inc(&isp116x->atl_finishing);
+	unpack_fifo(isp116x);
+	postproc_atl_queue(isp116x);
+	for (ep = isp116x->atl_active; ep; ep = ep->active) {
+		urb =
+		    container_of(ep->hep->urb_list.next, struct urb, urb_list);
+		if (!urb || urb->status == -EINPROGRESS)
+			continue;
+		/* A control transfer for which TD_DATAUNDERRUN
+		   occurred while URB_SHORT_NOT_OK was set still
+		   has its status stage (USB_PID_ACK) to go through;
+		   do not finish it yet */
+		if (ep->nextpid == USB_PID_ACK)
+			continue;
+		finish_request(isp116x, ep, urb, regs);
+	}
+	atomic_dec(&isp116x->atl_finishing);
+}
+
+/*
+  Interrupt handler. With the chip's interrupts masked, reads and
+  acknowledges the pending sources, finishes completed ATL transfers,
+  services "operational" (OHCI-style) interrupts, queues new ATL
+  transfers and finally restores the interrupt enable mask.
+  Returns IRQ_HANDLED if any ATL, SOF or OPR source was pending.
+*/
+static irqreturn_t isp116x_irq(struct usb_hcd *hcd, struct pt_regs *regs)
+{
+	struct isp116x *isp116x = hcd_to_isp116x(hcd);
+	irqreturn_t handled = IRQ_NONE;
+	u16 pending;
+
+	spin_lock(&isp116x->lock);
+
+	/* Mask everything, then fetch and ack what is pending */
+	isp116x_write_reg16(isp116x, HCuPINTENB, 0);
+	pending = isp116x_read_reg16(isp116x, HCuPINT);
+	isp116x_write_reg16(isp116x, HCuPINT, pending);
+
+	if (pending & (HCuPINT_ATL | HCuPINT_SOF)) {
+		handled = IRQ_HANDLED;
+		finish_atl_transfers(isp116x, regs);
+	}
+
+	if (pending & HCuPINT_OPR) {
+		u32 opr = isp116x_read_reg32(isp116x, HCINTSTAT);
+
+		isp116x_write_reg32(isp116x, HCINTSTAT, opr);
+		if (opr & HCINT_UE) {
+			ERR("Unrecoverable error\n");
+			/* What should we do here? Reset?  */
+		}
+		if (opr & HCINT_RHSC) {
+			/* Root hub status change: refresh the cached copies */
+			isp116x->rhstatus =
+			    isp116x_read_reg32(isp116x, HCRHSTATUS);
+			isp116x->rhport[0] =
+			    isp116x_read_reg32(isp116x, HCRHPORT1);
+			isp116x->rhport[1] =
+			    isp116x_read_reg32(isp116x, HCRHPORT2);
+		}
+		if (opr & HCINT_RD) {
+			DBG("---- remote wakeup\n");
+			schedule_work(&isp116x->rh_resume);
+		}
+		pending &= ~HCuPINT_OPR;
+		handled = IRQ_HANDLED;
+	}
+
+	if (pending & (HCuPINT_ATL | HCuPINT_SOF))
+		start_atl_transfers(isp116x);
+
+	isp116x_write_reg16(isp116x, HCuPINTENB, isp116x->irqenb);
+	spin_unlock(&isp116x->lock);
+	return handled;
+}
+
+/*-----------------------------------------------------------------*/
+
+/* usb 1.1 says max 90% of a frame is available for periodic transfers.
+ * this driver doesn't promise that much since it's got to handle an
+ * IRQ per packet; irq handling latencies also use up that time.
+ */
+
+/* out of 1000 us */
+#define	MAX_PERIODIC_LOAD	600
+/*
+  Pick a periodic schedule branch for an endpoint with the given
+  period and load. Among the first 'period' branches, returns the
+  first least loaded one for which every slot it would occupy
+  (branch, branch + period, ...) stays within MAX_PERIODIC_LOAD after
+  adding 'load'. Returns -ENOSPC if there is no such branch.
+*/
+static int balance(struct isp116x *isp116x, u16 period, u16 load)
+{
+	int best = -ENOSPC;
+	int start, slot;
+
+	for (start = 0; start < period; start++) {
+		/* Only a strictly lighter branch can replace the candidate */
+		if (best >= 0
+		    && isp116x->load[best] <= isp116x->load[start])
+			continue;
+
+		/* Walk the slots this branch would use until one overflows */
+		slot = start;
+		while (slot < PERIODIC_SIZE
+		       && (isp116x->load[slot] + load) <= MAX_PERIODIC_LOAD)
+			slot += period;
+
+		if (slot >= PERIODIC_SIZE)
+			best = start;
+	}
+	return best;
+}
+
+/* NB! ALL the code above this point runs with isp116x->lock
+   held, irqs off
+*/
+
+/*-----------------------------------------------------------------*/
+
+/*
+  Queue an URB for transfer.
+
+  On the first URB for an endpoint, allocates and initializes the
+  per-endpoint state and stores it in hep->hcpriv. Control and bulk
+  endpoints are put on the async list; interrupt endpoints are placed
+  into the periodic schedule by balance(). Finally the ATL transfers
+  are (re)started.
+
+  Returns 0 on success, -ENXIO for isochronous pipes (unsupported),
+  -ENOMEM if the endpoint state cannot be allocated, -ENODEV if the
+  HC is not running, or the negative result of balance() if the
+  periodic schedule has no room.
+*/
+static int isp116x_urb_enqueue(struct usb_hcd *hcd,
+			       struct usb_host_endpoint *hep, struct urb *urb,
+			       int mem_flags)
+{
+	struct isp116x *isp116x = hcd_to_isp116x(hcd);
+	struct usb_device *udev = urb->dev;
+	unsigned int pipe = urb->pipe;
+	int is_out = !usb_pipein(pipe);
+	int type = usb_pipetype(pipe);
+	int epnum = usb_pipeendpoint(pipe);
+	struct isp116x_ep *ep = NULL;
+	unsigned long flags;
+	int i;
+	int ret = 0;
+
+	urb_dbg(urb, "Enqueue");
+
+	if (type == PIPE_ISOCHRONOUS) {
+		ERR("Isochronous transfers not supported\n");
+		urb_dbg(urb, "Refused to enqueue");
+		return -ENXIO;
+	}
+	/* avoid all allocations within spinlocks: request or endpoint */
+	if (!hep->hcpriv) {
+		ep = kcalloc(1, sizeof *ep, (__force unsigned)mem_flags);
+		if (!ep)
+			return -ENOMEM;
+	}
+
+	spin_lock_irqsave(&isp116x->lock, flags);
+	if (!HC_IS_RUNNING(hcd->state)) {
+		/* ep is either NULL or the still private allocation made
+		   above; free it here, nothing else references it yet */
+		kfree(ep);
+		ret = -ENODEV;
+		goto fail;
+	}
+
+	if (hep->hcpriv)
+		ep = hep->hcpriv;
+	else {
+		INIT_LIST_HEAD(&ep->schedule);
+		ep->udev = usb_get_dev(udev);
+		ep->epnum = epnum;
+		ep->maxpacket = usb_maxpacket(udev, urb->pipe, is_out);
+		usb_settoggle(udev, epnum, is_out, 0);
+
+		if (type == PIPE_CONTROL) {
+			ep->nextpid = USB_PID_SETUP;
+		} else if (is_out) {
+			ep->nextpid = USB_PID_OUT;
+		} else {
+			ep->nextpid = USB_PID_IN;
+		}
+
+		if (urb->interval) {
+			/*
+			   With INT URBs submitted, the driver works with SOF
+			   interrupt enabled and ATL interrupt disabled. After
+			   the PTDs are written to fifo ram, the chip starts
+			   fifo processing and usb transfers after the next
+			   SOF and continues until the transfers are finished
+			   (succeeded or failed) or the frame ends. Therefore,
+			   the transfers occur only in every second frame,
+			   while fifo reading/writing and data processing
+			   occur in every other second frame. */
+			if (urb->interval < 2)
+				urb->interval = 2;
+			if (urb->interval > 2 * PERIODIC_SIZE)
+				urb->interval = 2 * PERIODIC_SIZE;
+			ep->period = urb->interval >> 1;
+			/* PERIODIC_SIZE means "not in the schedule yet" */
+			ep->branch = PERIODIC_SIZE;
+			ep->load = usb_calc_bus_time(udev->speed,
+						     !is_out,
+						     (type == PIPE_ISOCHRONOUS),
+						     usb_maxpacket(udev, pipe,
+								   is_out)) /
+			    1000;
+		}
+		hep->hcpriv = ep;
+		ep->hep = hep;
+	}
+
+	/* maybe put endpoint into schedule */
+	switch (type) {
+	case PIPE_CONTROL:
+	case PIPE_BULK:
+		if (list_empty(&ep->schedule))
+			list_add_tail(&ep->schedule, &isp116x->async);
+		break;
+	case PIPE_INTERRUPT:
+		urb->interval = ep->period;
+		ep->length = min((int)ep->maxpacket,
+				 urb->transfer_buffer_length);
+
+		/* urb submitted for already existing endpoint */
+		if (ep->branch < PERIODIC_SIZE)
+			break;
+
+		ret = ep->branch = balance(isp116x, ep->period, ep->load);
+		if (ret < 0)
+			goto fail;
+		ret = 0;
+
+		urb->start_frame = (isp116x->fmindex & (PERIODIC_SIZE - 1))
+		    + ep->branch;
+
+		/* sort each schedule branch by period (slow before fast)
+		   to share the faster parts of the tree without needing
+		   dummy/placeholder nodes */
+		DBG("schedule qh%d/%p branch %d\n", ep->period, ep, ep->branch);
+		for (i = ep->branch; i < PERIODIC_SIZE; i += ep->period) {
+			struct isp116x_ep **prev = &isp116x->periodic[i];
+			struct isp116x_ep *here = *prev;
+
+			while (here && ep != here) {
+				if (ep->period > here->period)
+					break;
+				prev = &here->next;
+				here = *prev;
+			}
+			if (ep != here) {
+				ep->next = here;
+				*prev = ep;
+			}
+			isp116x->load[i] += ep->load;
+		}
+		hcd->self.bandwidth_allocated += ep->load / ep->period;
+
+		/* switch over to SOFint */
+		if (!isp116x->periodic_count++) {
+			isp116x->irqenb &= ~HCuPINT_ATL;
+			isp116x->irqenb |= HCuPINT_SOF;
+			isp116x_write_reg16(isp116x, HCuPINTENB,
+					    isp116x->irqenb);
+		}
+	}
+
+	/* in case of unlink-during-submit */
+	spin_lock(&urb->lock);
+	if (urb->status != -EINPROGRESS) {
+		spin_unlock(&urb->lock);
+		finish_request(isp116x, ep, urb, NULL);
+		ret = 0;
+		goto fail;
+	}
+	urb->hcpriv = hep;
+	spin_unlock(&urb->lock);
+	start_atl_transfers(isp116x);
+
+      fail:
+	spin_unlock_irqrestore(&isp116x->lock, flags);
+	return ret;
+}
+
+/*
+   Dequeue URBs.
+
+   An URB that is at the head of its endpoint's queue while the
+   endpoint is on the active ATL chain is left alone; the interrupt
+   handler will complete it. Any other linked URB is finished right
+   away. Always returns 0.
+*/
+static int isp116x_urb_dequeue(struct usb_hcd *hcd, struct urb *urb)
+{
+	struct isp116x *isp116x = hcd_to_isp116x(hcd);
+	struct usb_host_endpoint *hep;
+	struct isp116x_ep *ep, *ep_act;
+	unsigned long flags;
+	int in_flight = 0;
+
+	spin_lock_irqsave(&isp116x->lock, flags);
+	hep = urb->hcpriv;
+	/* URB already unlinked (or never linked)? */
+	if (!hep)
+		goto out;
+
+	ep = hep->hcpriv;
+	WARN_ON(hep != ep->hep);
+
+	/* In front of queue? Then check whether the endpoint is active */
+	if (ep->hep->urb_list.next == &urb->urb_list) {
+		ep_act = isp116x->atl_active;
+		while (ep_act && ep_act != ep)
+			ep_act = ep_act->active;
+		if (ep_act) {
+			VDBG("dequeue, urb %p active; wait for irq\n",
+			     urb);
+			in_flight = 1;
+		}
+	}
+
+	if (!in_flight)
+		finish_request(isp116x, ep, urb, NULL);
+
+      out:
+	spin_unlock_irqrestore(&isp116x->lock, flags);
+	return 0;
+}
+
+/*
+  Release the per-endpoint state hanging off hep->hcpriv. Polls up
+  to 100 times, sleeping 3 ms each time, for the endpoint's URB queue
+  to drain and warns if it is still not empty; the state is freed
+  either way.
+*/
+static void isp116x_endpoint_disable(struct usb_hcd *hcd,
+				     struct usb_host_endpoint *hep)
+{
+	struct isp116x_ep *ep = hep->hcpriv;
+	int tries = 100;
+
+	if (!ep)
+		return;
+
+	/* assume we'd just wait for the irq */
+	while (tries-- > 0 && !list_empty(&hep->urb_list))
+		msleep(3);
+	if (!list_empty(&hep->urb_list))
+		WARN("ep %p not empty?\n", ep);
+
+	usb_put_dev(ep->udev);
+	kfree(ep);
+	hep->hcpriv = NULL;
+}
+
+/*
+  Return the contents of the HCFMNUM register, read under
+  isp116x->lock.
+*/
+static int isp116x_get_frame(struct usb_hcd *hcd)
+{
+	struct isp116x *isp116x = hcd_to_isp116x(hcd);
+	unsigned long flags;
+	u32 frame;
+
+	spin_lock_irqsave(&isp116x->lock, flags);
+	frame = isp116x_read_reg32(isp116x, HCFMNUM);
+	spin_unlock_irqrestore(&isp116x->lock, flags);
+
+	return (int)frame;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+  Adapted from ohci-hub.c. Currently we don't support autosuspend.
+
+  Builds the root hub status-change bitmap in buf[0] from the cached
+  root hub registers: bit 0 for a hub-level change, bit (n + 1) for a
+  change on port n. Returns 1 if anything changed, 0 if not, and
+  -ESHUTDOWN if the HC is not running.
+*/
+static int isp116x_hub_status_data(struct usb_hcd *hcd, char *buf)
+{
+	struct isp116x *isp116x = hcd_to_isp116x(hcd);
+	int nports, port, changed;
+
+	if (!HC_IS_RUNNING(hcd->state))
+		return -ESHUTDOWN;
+
+	nports = isp116x->rhdesca & RH_A_NDP;
+
+	/* init status */
+	changed = (isp116x->rhstatus & (RH_HS_LPSC | RH_HS_OCIC)) ? 1 : 0;
+	buf[0] = changed;
+
+	for (port = 0; port < nports; port++) {
+		u32 status = isp116x->rhport[port];
+
+		if (!(status & (RH_PS_CSC | RH_PS_PESC | RH_PS_PSSC
+				| RH_PS_OCIC | RH_PS_PRSC)))
+			continue;
+		changed = 1;
+		buf[0] |= 1 << (port + 1);
+	}
+	return changed;
+}
+
+/*
+  Fill in the root hub descriptor from the cached copy of the
+  chip's rhdesca value.
+*/
+static void isp116x_hub_descriptor(struct isp116x *isp116x,
+				   struct usb_hub_descriptor *desc)
+{
+	u32 rha = isp116x->rhdesca;
+	u8 nports = (u8) (rha & 0x3);
+
+	desc->bDescLength = 9;
+	desc->bDescriptorType = 0x29;
+	desc->bNbrPorts = nports;
+	/* Power switching, device type, overcurrent. */
+	desc->wHubCharacteristics =
+	    (__force __u16) cpu_to_le16((u16) ((rha >> 8) & 0x1f));
+	desc->bPwrOn2PwrGood = (u8) ((rha >> 24) & 0xff);
+	desc->bHubContrCurrent = 0;
+
+	/* two bitmaps:  ports removable, and legacy PortPwrCtrlMask */
+	if (nports == 1)
+		desc->bitmap[0] = 1 << 1;
+	else
+		desc->bitmap[0] = 3 << 1;
+	desc->bitmap[1] = ~0;
+}
+
+/* Perform reset of a given port.
+   It would be great to just start the reset and let the
+   USB core to clear the reset in due time. However,
+   root hub ports should be reset for at least 50 ms, while
+   our chip stays in reset for about 10 ms. I.e., we must
+   repeatedly reset it ourself here.
+
+   port is the 0-based port index: 0 selects HCRHPORT1, anything
+   else HCRHPORT2. The function sleeps between reset pulses and
+   takes isp116x->lock itself, so the caller must not hold it.
+   The loop ends early if the port turns out to be disconnected.
+*/
+static inline void root_port_reset(struct isp116x *isp116x, unsigned port)
+{
+	u32 tmp;
+	unsigned long flags, t;
+
+	/* Root hub reset should be 50 ms, but some devices
+	   want it even longer. */
+	t = jiffies + msecs_to_jiffies(100);
+
+	while (time_before(jiffies, t)) {
+		spin_lock_irqsave(&isp116x->lock, flags);
+		/* spin until any current reset finishes */
+		for (;;) {
+			tmp = isp116x_read_reg32(isp116x, port ?
+						 HCRHPORT2 : HCRHPORT1);
+			if (!(tmp & RH_PS_PRS))
+				break;
+			udelay(500);
+		}
+		/* Don't reset a disconnected port */
+		if (!(tmp & RH_PS_CCS)) {
+			spin_unlock_irqrestore(&isp116x->lock, flags);
+			break;
+		}
+		/* Reset lasts 10ms (claims datasheet) */
+		isp116x_write_reg32(isp116x, port ? HCRHPORT2 :
+				    HCRHPORT1, (RH_PS_PRS));
+		spin_unlock_irqrestore(&isp116x->lock, flags);
+		msleep(10);	/* sleep out one reset pulse, lock released */
+	}
+}
+
+/* Adapted from ohci-hub.c.
+
+   Handle control requests addressed to the root hub. For port
+   requests wIndex is the 1-based port number and is validated
+   against the port count taken from rhdesca. Returns 0 on success
+   and -EPIPE ("protocol stall") for an unknown request, an
+   unsupported feature selector or a bad port number.
+*/
+static int isp116x_hub_control(struct usb_hcd *hcd,
+			       u16 typeReq,
+			       u16 wValue, u16 wIndex, char *buf, u16 wLength)
+{
+	struct isp116x *isp116x = hcd_to_isp116x(hcd);
+	int ret = 0;
+	unsigned long flags;
+	int ports = isp116x->rhdesca & RH_A_NDP;
+	u32 tmp = 0;
+
+	switch (typeReq) {
+	case ClearHubFeature:
+		DBG("ClearHubFeature: ");
+		switch (wValue) {
+		case C_HUB_OVER_CURRENT:
+			DBG("C_HUB_OVER_CURRENT\n");
+			spin_lock_irqsave(&isp116x->lock, flags);
+			isp116x_write_reg32(isp116x, HCRHSTATUS, RH_HS_OCIC);
+			spin_unlock_irqrestore(&isp116x->lock, flags);	/* fall through */
+		case C_HUB_LOCAL_POWER:
+			DBG("C_HUB_LOCAL_POWER\n");
+			break;
+		default:
+			goto error;
+		}
+		break;
+	case SetHubFeature:
+		DBG("SetHubFeature: ");
+		switch (wValue) {
+		case C_HUB_OVER_CURRENT:
+		case C_HUB_LOCAL_POWER:
+			DBG("C_HUB_OVER_CURRENT or C_HUB_LOCAL_POWER\n");
+			break;
+		default:
+			goto error;
+		}
+		break;
+	case GetHubDescriptor:
+		DBG("GetHubDescriptor\n");
+		isp116x_hub_descriptor(isp116x,
+				       (struct usb_hub_descriptor *)buf);
+		break;
+	case GetHubStatus:
+		DBG("GetHubStatus\n");
+		*(__le32 *) buf = cpu_to_le32(0);	/* always reports all-zero status */
+		break;
+	case GetPortStatus:
+		DBG("GetPortStatus\n");
+		if (!wIndex || wIndex > ports)
+			goto error;
+		tmp = isp116x->rhport[--wIndex];	/* cached value; wIndex now 0-based */
+		*(__le32 *) buf = cpu_to_le32(tmp);
+		DBG("GetPortStatus: port[%d]  %08x\n", wIndex + 1, tmp);
+		break;
+	case ClearPortFeature:
+		DBG("ClearPortFeature: ");
+		if (!wIndex || wIndex > ports)
+			goto error;
+		wIndex--;
+
+		/* Select the port register bit to write for this feature */
+		switch (wValue) {
+		case USB_PORT_FEAT_ENABLE:
+			DBG("USB_PORT_FEAT_ENABLE\n");
+			tmp = RH_PS_CCS;
+			break;
+		case USB_PORT_FEAT_C_ENABLE:
+			DBG("USB_PORT_FEAT_C_ENABLE\n");
+			tmp = RH_PS_PESC;
+			break;
+		case USB_PORT_FEAT_SUSPEND:
+			DBG("USB_PORT_FEAT_SUSPEND\n");
+			tmp = RH_PS_POCI;
+			break;
+		case USB_PORT_FEAT_C_SUSPEND:
+			DBG("USB_PORT_FEAT_C_SUSPEND\n");
+			tmp = RH_PS_PSSC;
+			break;
+		case USB_PORT_FEAT_POWER:
+			DBG("USB_PORT_FEAT_POWER\n");
+			tmp = RH_PS_LSDA;
+			break;
+		case USB_PORT_FEAT_C_CONNECTION:
+			DBG("USB_PORT_FEAT_C_CONNECTION\n");
+			tmp = RH_PS_CSC;
+			break;
+		case USB_PORT_FEAT_C_OVER_CURRENT:
+			DBG("USB_PORT_FEAT_C_OVER_CURRENT\n");
+			tmp = RH_PS_OCIC;
+			break;
+		case USB_PORT_FEAT_C_RESET:
+			DBG("USB_PORT_FEAT_C_RESET\n");
+			tmp = RH_PS_PRSC;
+			break;
+		default:
+			goto error;
+		}
+		spin_lock_irqsave(&isp116x->lock, flags);
+		isp116x_write_reg32(isp116x, wIndex
+				    ? HCRHPORT2 : HCRHPORT1, tmp);
+		isp116x->rhport[wIndex] =
+		    isp116x_read_reg32(isp116x, wIndex ? HCRHPORT2 : HCRHPORT1);	/* refresh cache */
+		spin_unlock_irqrestore(&isp116x->lock, flags);
+		break;
+	case SetPortFeature:
+		DBG("SetPortFeature: ");
+		if (!wIndex || wIndex > ports)
+			goto error;
+		wIndex--;
+		/* Every accepted case below leaves the switch with
+		   isp116x->lock held; it is released after the cached
+		   port status has been refreshed */
+		switch (wValue) {
+		case USB_PORT_FEAT_SUSPEND:
+			DBG("USB_PORT_FEAT_SUSPEND\n");
+			spin_lock_irqsave(&isp116x->lock, flags);
+			isp116x_write_reg32(isp116x, wIndex
+					    ? HCRHPORT2 : HCRHPORT1, RH_PS_PSS);
+			break;
+		case USB_PORT_FEAT_POWER:
+			DBG("USB_PORT_FEAT_POWER\n");
+			spin_lock_irqsave(&isp116x->lock, flags);
+			isp116x_write_reg32(isp116x, wIndex
+					    ? HCRHPORT2 : HCRHPORT1, RH_PS_PPS);
+			break;
+		case USB_PORT_FEAT_RESET:
+			DBG("USB_PORT_FEAT_RESET\n");
+			root_port_reset(isp116x, wIndex);	/* sleeps; called without the lock */
+			spin_lock_irqsave(&isp116x->lock, flags);
+			break;
+		default:
+			goto error;
+		}
+		isp116x->rhport[wIndex] =
+		    isp116x_read_reg32(isp116x, wIndex ? HCRHPORT2 : HCRHPORT1);
+		spin_unlock_irqrestore(&isp116x->lock, flags);
+		break;
+
+	default:
+	      error:
+		/* "protocol stall" on error */
+		DBG("PROTOCOL STALL\n");
+		ret = -EPIPE;
+	}
+	return ret;
+}
+
+#ifdef	CONFIG_PM
+
+/*
+  Suspend the root hub. What happens depends on the functional
+  state found in HCCONTROL:
+    operational - switch the HC to suspend (keeping remote wakeup
+                  enabled if hcd->remote_wakeup is set), return 0;
+    suspended   - nothing to do, return 0;
+    resume      - force the HC into reset, return -EBUSY;
+    reset       - return -EBUSY;
+    otherwise   - return -EINVAL.
+*/
+static int isp116x_hub_suspend(struct usb_hcd *hcd)
+{
+	struct isp116x *isp116x = hcd_to_isp116x(hcd);
+	unsigned long flags;
+	u32 ctl;
+	int ret;
+
+	spin_lock_irqsave(&isp116x->lock, flags);
+
+	ctl = isp116x_read_reg32(isp116x, HCCONTROL);
+	switch (ctl & HCCONTROL_HCFS) {
+	case HCCONTROL_USB_SUSPEND:
+		ret = 0;
+		break;
+	case HCCONTROL_USB_OPER:
+		hcd->state = HC_STATE_QUIESCING;
+		ctl &= (~HCCONTROL_HCFS & ~HCCONTROL_RWE);
+		ctl |= HCCONTROL_USB_SUSPEND;
+		if (hcd->remote_wakeup)
+			ctl |= HCCONTROL_RWE;
+		/* Wait for usb transfers to finish */
+		mdelay(2);
+		isp116x_write_reg32(isp116x, HCCONTROL, ctl);
+		hcd->state = HC_STATE_SUSPENDED;
+		/* Wait for devices to suspend */
+		mdelay(5);
+		ret = 0;
+		break;
+	case HCCONTROL_USB_RESUME:
+		isp116x_write_reg32(isp116x, HCCONTROL,
+				    (ctl & ~HCCONTROL_HCFS) |
+				    HCCONTROL_USB_RESET);
+		ret = -EBUSY;
+		break;
+	case HCCONTROL_USB_RESET:
+		ret = -EBUSY;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	spin_unlock_irqrestore(&isp116x->lock, flags);
+	return ret;
+}
+
+/*
+ * Root hub resume callback (hc_driver.hub_resume).
+ *
+ * Depending on the functional state found in HCCONTROL:
+ *  - suspended: request the resume state and continue below
+ *  - resume:    continue below
+ *  - operational: mark the root hub as powered on and return 0
+ *  - anything else: return -EBUSY
+ *
+ * "Continue below" means: clear the suspend status of every suspended
+ * root port, wait 20 ms, then switch the controller to the
+ * operational state and return 0.
+ */
+static int isp116x_hub_resume(struct usb_hcd *hcd)
+{
+	struct isp116x *isp116x = hcd_to_isp116x(hcd);
+	u32 val;
+	int ret = -EINPROGRESS;
+
+	msleep(5);
+	spin_lock_irq(&isp116x->lock);
+
+	val = isp116x_read_reg32(isp116x, HCCONTROL);
+	switch (val & HCCONTROL_HCFS) {
+	case HCCONTROL_USB_SUSPEND:
+		val &= ~HCCONTROL_HCFS;
+		val |= HCCONTROL_USB_RESUME;
+		isp116x_write_reg32(isp116x, HCCONTROL, val);
+		/* fall through */
+	case HCCONTROL_USB_RESUME:
+		break;
+	case HCCONTROL_USB_OPER:
+		/* Without setting power_state here the
+		   SUSPENDED state won't be removed from
+		   sysfs/usbN/power.state as a response to remote
+		   wakeup. Maybe in the future. */
+		hcd->self.root_hub->dev.power.power_state = PMSG_ON;
+		ret = 0;
+		break;
+	default:
+		ret = -EBUSY;
+	}
+
+	/* ret is still -EINPROGRESS only for the suspend/resume states */
+	if (ret != -EINPROGRESS) {
+		spin_unlock_irq(&isp116x->lock);
+		return ret;
+	}
+
+	/* walk the ports from the highest index down to 0 */
+	val = isp116x->rhdesca & RH_A_NDP;
+	while (val--) {
+		u32 stat =
+		    isp116x_read_reg32(isp116x, val ? HCRHPORT2 : HCRHPORT1);
+		/* force global, not selective, resume */
+		if (!(stat & RH_PS_PSS))
+			continue;
+		DBG("%s: Resuming port %d\n", __func__, val);
+		/* isp116x_write_reg32() takes (register, value): write the
+		   POCI bit to the port's status register.  Per the OHCI
+		   port status layout this requests "clear suspend
+		   status" - confirm against the ISP116x datasheet. */
+		isp116x_write_reg32(isp116x, val
+				    ? HCRHPORT2 : HCRHPORT1, RH_PS_POCI);
+	}
+	spin_unlock_irq(&isp116x->lock);
+
+	hcd->state = HC_STATE_RESUMING;
+	mdelay(20);
+
+	/* Go operational */
+	spin_lock_irq(&isp116x->lock);
+	val = isp116x_read_reg32(isp116x, HCCONTROL);
+	isp116x_write_reg32(isp116x, HCCONTROL,
+			    (val & ~HCCONTROL_HCFS) | HCCONTROL_USB_OPER);
+	spin_unlock_irq(&isp116x->lock);
+	/* see analogous comment above */
+	hcd->self.root_hub->dev.power.power_state = PMSG_ON;
+	hcd->state = HC_STATE_RUNNING;
+
+	return 0;
+}
+
+/*
+ * Work handler for isp116x->rh_resume (set up with INIT_WORK in the
+ * probe routine); _hcd is the usb_hcd whose root hub is resumed.
+ */
+static void isp116x_rh_resume(void *_hcd)
+{
+	struct usb_hcd *hcd = _hcd;
+
+	usb_resume_device(hcd->self.root_hub);
+}
+
+#else
+
+#define	isp116x_hub_suspend	NULL
+#define	isp116x_hub_resume	NULL
+
+/* Without CONFIG_PM the work handler has nothing to do. */
+static void isp116x_rh_resume(void *_hcd)
+{
+}
+
+#endif
+
+/*-----------------------------------------------------------------*/
+
+#ifdef STUB_DEBUG_FILE
+
+/* STUB_DEBUG_FILE variant: no proc entry is created. */
+static inline void create_debug_file(struct isp116x *isp116x)
+{
+}
+
+/* STUB_DEBUG_FILE variant: there is no proc entry to remove. */
+static inline void remove_debug_file(struct isp116x *isp116x)
+{
+}
+
+#else
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+/*
+ * Print a 16-bit uP interrupt mask as "<label> <hex>" followed by one
+ * tag per set HCuPINT_* bit.
+ */
+static void dump_irq(struct seq_file *s, char *label, u16 mask)
+{
+	/* each flag contributes either its tag or an empty string */
+	const char *clkrdy = (mask & HCuPINT_CLKRDY) ? " clkrdy" : "";
+	const char *susp = (mask & HCuPINT_SUSP) ? " susp" : "";
+	const char *opr = (mask & HCuPINT_OPR) ? " opr" : "";
+	const char *eot = (mask & HCuPINT_AIIEOT) ? " eot" : "";
+	const char *atl = (mask & HCuPINT_ATL) ? " atl" : "";
+	const char *sof = (mask & HCuPINT_SOF) ? " sof" : "";
+
+	seq_printf(s, "%s %04x%s%s%s%s%s%s\n", label, mask,
+		   clkrdy, susp, opr, eot, atl, sof);
+}
+
+/*
+ * Print a 32-bit HCINT_* mask as "<label> <hex>" followed by one tag
+ * per set bit.
+ */
+static void dump_int(struct seq_file *s, char *label, u32 mask)
+{
+	/* each flag contributes either its tag or an empty string */
+	const char *mie = (mask & HCINT_MIE) ? " MIE" : "";
+	const char *rhsc = (mask & HCINT_RHSC) ? " rhsc" : "";
+	const char *fno = (mask & HCINT_FNO) ? " fno" : "";
+	const char *ue = (mask & HCINT_UE) ? " ue" : "";
+	const char *rd = (mask & HCINT_RD) ? " rd" : "";
+	const char *sof = (mask & HCINT_SF) ? " sof" : "";
+	const char *so = (mask & HCINT_SO) ? " so" : "";
+
+	seq_printf(s, "%s %08x%s%s%s%s%s%s%s\n", label, mask,
+		   mie, rhsc, fno, ue, rd, sof, so);
+}
+
+/*
+ * seq_file show routine behind the proc entry.
+ *
+ * Always prints the product description and driver version.  If the
+ * HCD is suspended or not running it says so and stops.  Otherwise,
+ * under the driver lock, it prints the interrupt enable/status
+ * registers, every endpoint on the async schedule with its queued
+ * URBs, and the periodic schedule.  Always returns 0.
+ */
+static int proc_isp116x_show(struct seq_file *s, void *unused)
+{
+	struct isp116x *isp116x = s->private;
+	struct isp116x_ep *ep;
+	struct urb *urb;
+	unsigned i;
+	char *str;
+
+	seq_printf(s, "%s\n%s version %s\n",
+		   isp116x_to_hcd(isp116x)->product_desc, hcd_name,
+		   DRIVER_VERSION);
+
+	if (HC_IS_SUSPENDED(isp116x_to_hcd(isp116x)->state)) {
+		seq_printf(s, "HCD is suspended\n");
+		return 0;
+	}
+	if (!HC_IS_RUNNING(isp116x_to_hcd(isp116x)->state)) {
+		seq_printf(s, "HCD not running\n");
+		return 0;
+	}
+
+	spin_lock_irq(&isp116x->lock);
+
+	dump_irq(s, "hc_irq_enable", isp116x_read_reg16(isp116x, HCuPINTENB));
+	dump_irq(s, "hc_irq_status", isp116x_read_reg16(isp116x, HCuPINT));
+	dump_int(s, "hc_int_enable", isp116x_read_reg32(isp116x, HCINTENB));
+	dump_int(s, "hc_int_status", isp116x_read_reg32(isp116x, HCINTSTAT));
+
+	/* async schedule: one header line per endpoint, then its URBs */
+	list_for_each_entry(ep, &isp116x->async, schedule) {
+
+		/* name the endpoint after the PID it will send next */
+		switch (ep->nextpid) {
+		case USB_PID_IN:
+			str = "in";
+			break;
+		case USB_PID_OUT:
+			str = "out";
+			break;
+		case USB_PID_SETUP:
+			str = "setup";
+			break;
+		case USB_PID_ACK:
+			str = "status";
+			break;
+		default:
+			str = "?";
+			break;
+		};
+		seq_printf(s, "%p, ep%d%s, maxpacket %d:\n", ep,
+			   ep->epnum, str, ep->maxpacket);
+		list_for_each_entry(urb, &ep->hep->urb_list, urb_list) {
+			seq_printf(s, "  urb%p, %d/%d\n", urb,
+				   urb->actual_length,
+				   urb->transfer_buffer_length);
+		}
+	}
+	if (!list_empty(&isp116x->async))
+		seq_printf(s, "\n");
+
+	seq_printf(s, "periodic size= %d\n", PERIODIC_SIZE);
+
+	/* periodic schedule: skip empty slots, follow each slot's chain */
+	for (i = 0; i < PERIODIC_SIZE; i++) {
+		ep = isp116x->periodic[i];
+		if (!ep)
+			continue;
+		seq_printf(s, "%2d [%3d]:\n", i, isp116x->load[i]);
+
+		/* DUMB: prints shared entries multiple times */
+		do {
+			seq_printf(s, "   %d/%p (%sdev%d ep%d%s max %d)\n",
+				   ep->period, ep,
+				   (ep->udev->speed ==
+				    USB_SPEED_FULL) ? "" : "ls ",
+				   ep->udev->devnum, ep->epnum,
+				   (ep->epnum ==
+				    0) ? "" : ((ep->nextpid ==
+						USB_PID_IN) ? "in" : "out"),
+				   ep->maxpacket);
+			ep = ep->next;
+		} while (ep);
+	}
+	spin_unlock_irq(&isp116x->lock);
+	seq_printf(s, "\n");
+
+	return 0;
+}
+
+/*
+ * open() for the proc entry: bind proc_isp116x_show to the file and
+ * pass it the proc entry's data pointer (the struct isp116x stored by
+ * create_debug_file).
+ */
+static int proc_isp116x_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, proc_isp116x_show, PDE(inode)->data);
+}
+
+/* File operations of the proc entry: single-record seq_file. */
+static struct file_operations proc_ops = {
+	.open = proc_isp116x_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+/* expect just one isp116x per system */
+static const char proc_filename[] = "driver/isp116x";
+
+/*
+ * Create the proc entry named by proc_filename and attach this
+ * controller to it.  If the entry cannot be created the function
+ * returns without setting isp116x->pde.
+ */
+static void create_debug_file(struct isp116x *isp116x)
+{
+	struct proc_dir_entry *pde;
+
+	pde = create_proc_entry(proc_filename, 0, NULL);
+	if (pde == NULL)
+		return;
+
+	pde->proc_fops = &proc_ops;
+	pde->data = isp116x;
+	/* remembered so remove_debug_file() knows an entry exists */
+	isp116x->pde = pde;
+}
+
+/* Remove the proc entry, but only if create_debug_file() recorded one. */
+static void remove_debug_file(struct isp116x *isp116x)
+{
+	if (isp116x->pde)
+		remove_proc_entry(proc_filename, NULL);
+}
+
+#endif
+
+/*-----------------------------------------------------------------*/
+
+/*
+  Software reset - can be called from any context.
+
+  Writes the magic value to HCSWRES, sets the HCR bit in HCCMDSTAT and
+  then polls (at most 14 times, 1 ms apart) until HCR reads back as
+  zero.  Returns 0 on success or -ETIME if HCR never cleared.
+*/
+static int isp116x_sw_reset(struct isp116x *isp116x)
+{
+	int retries = 15;
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&isp116x->lock, flags);
+	isp116x_write_reg16(isp116x, HCSWRES, HCSWRES_MAGIC);
+	isp116x_write_reg32(isp116x, HCCMDSTAT, HCCMDSTAT_HCR);
+	while (--retries) {
+		/* It usually resets within 1 ms */
+		mdelay(1);
+		if (!(isp116x_read_reg32(isp116x, HCCMDSTAT) & HCCMDSTAT_HCR))
+			break;
+	}
+	/* retries reaches 0 only if the loop ran out without a break */
+	if (!retries) {
+		ERR("Software reset timeout\n");
+		ret = -ETIME;
+	}
+	spin_unlock_irqrestore(&isp116x->lock, flags);
+	return ret;
+}
+
+/*
+  Reset. Tries to perform platform-specific hardware
+  reset first; falls back to software reset.
+
+  After the reset the function polls HCuPINT for the CLKRDY bit until
+  the timeout below has elapsed.  Returns 0 on success, the error from
+  the software reset if that failed, or -ENODEV if the clock never
+  became ready.
+*/
+static int isp116x_reset(struct usb_hcd *hcd)
+{
+	struct isp116x *isp116x = hcd_to_isp116x(hcd);
+	unsigned long t;
+	u16 clkrdy = 0;
+	int ret = 0, timeout = 15 /* ms */ ;
+
+	if (isp116x->board && isp116x->board->reset) {
+		/* Hardware reset */
+		isp116x->board->reset(hcd->self.controller, 1);
+		msleep(10);
+		if (isp116x->board->clock)
+			isp116x->board->clock(hcd->self.controller, 1);
+		msleep(1);
+		isp116x->board->reset(hcd->self.controller, 0);
+	} else
+		ret = isp116x_sw_reset(isp116x);
+
+	if (ret)
+		return ret;
+
+	/* wait for the chip to report a running clock */
+	t = jiffies + msecs_to_jiffies(timeout);
+	while (time_before_eq(jiffies, t)) {
+		msleep(4);
+		spin_lock_irq(&isp116x->lock);
+		clkrdy = isp116x_read_reg16(isp116x, HCuPINT) & HCuPINT_CLKRDY;
+		spin_unlock_irq(&isp116x->lock);
+		if (clkrdy)
+			break;
+	}
+	if (!clkrdy) {
+		/* report the timeout that was actually used */
+		ERR("Clock not ready after %dms\n", timeout);
+		ret = -ENODEV;
+	}
+	return ret;
+}
+
+/*
+ * hc_driver.stop: disable the uP interrupts, switch off port power,
+ * reset the chip (board hook if present, software reset otherwise)
+ * and finally call the board's clock hook with 0, if there is one.
+ */
+static void isp116x_stop(struct usb_hcd *hcd)
+{
+	struct isp116x *isp116x = hcd_to_isp116x(hcd);
+	unsigned long flags;
+	u32 val;
+
+	spin_lock_irqsave(&isp116x->lock, flags);
+	isp116x_write_reg16(isp116x, HCuPINTENB, 0);
+
+	/* Switch off ports' power, some devices don't come up
+	   after next 'insmod' without this */
+	val = isp116x_read_reg32(isp116x, HCRHDESCA);
+	val &= ~(RH_A_NPS | RH_A_PSM);
+	isp116x_write_reg32(isp116x, HCRHDESCA, val);
+	isp116x_write_reg32(isp116x, HCRHSTATUS, RH_HS_LPS);
+	spin_unlock_irqrestore(&isp116x->lock, flags);
+
+	/* Put the chip into reset state */
+	/* NOTE(review): isp116x_reset() calls board->reset with 1 first
+	   and 0 last; only 0 is passed here - confirm that this leaves
+	   the chip held in reset as the comment above intends. */
+	if (isp116x->board && isp116x->board->reset)
+		isp116x->board->reset(hcd->self.controller, 0);
+	else
+		isp116x_sw_reset(isp116x);
+
+	/* Stop the clock */
+	if (isp116x->board && isp116x->board->clock)
+		isp116x->board->clock(hcd->self.controller, 0);
+}
+
+/*
+  Configure the chip. The chip must be successfully reset by now.
+
+  Steps: verify the chip ID, program buffer sizes, hardware
+  configuration and root hub descriptors from the platform data,
+  allocate and register the root hub device, enable interrupts and
+  switch the controller to the operational state.
+
+  Returns 0 on success, -ENODEV on a chip ID mismatch or if the root
+  hub cannot be registered, -ENOMEM if the root hub device cannot be
+  allocated.
+*/
+static int isp116x_start(struct usb_hcd *hcd)
+{
+	struct isp116x *isp116x = hcd_to_isp116x(hcd);
+	struct isp116x_platform_data *board = isp116x->board;
+	struct usb_device *udev;
+	u32 val;
+	unsigned long flags;
+
+	spin_lock_irqsave(&isp116x->lock, flags);
+
+	/* clear interrupt status and disable all interrupt sources */
+	isp116x_write_reg16(isp116x, HCuPINT, 0xff);
+	isp116x_write_reg16(isp116x, HCuPINTENB, 0);
+
+	val = isp116x_read_reg16(isp116x, HCCHIPID);
+	if ((val & HCCHIPID_MASK) != HCCHIPID_MAGIC) {
+		ERR("Invalid chip ID %04x\n", val);
+		spin_unlock_irqrestore(&isp116x->lock, flags);
+		return -ENODEV;
+	}
+
+	isp116x_write_reg16(isp116x, HCITLBUFLEN, ISP116x_ITL_BUFSIZE);
+	isp116x_write_reg16(isp116x, HCATLBUFLEN, ISP116x_ATL_BUFSIZE);
+
+	/* ----- HW conf */
+	val = HCHWCFG_INT_ENABLE | HCHWCFG_DBWIDTH(1);
+	if (board->sel15Kres)
+		val |= HCHWCFG_15KRSEL;
+	/* Remote wakeup won't work without working clock */
+	if (board->clknotstop || board->remote_wakeup_enable)
+		val |= HCHWCFG_CLKNOTSTOP;
+	if (board->oc_enable)
+		val |= HCHWCFG_ANALOG_OC;
+	if (board->int_act_high)
+		val |= HCHWCFG_INT_POL;
+	if (board->int_edge_triggered)
+		val |= HCHWCFG_INT_TRIGGER;
+	isp116x_write_reg16(isp116x, HCHWCFG, val);
+
+	/* ----- Root hub conf */
+	val = 0;
+	/* AN10003_1.pdf recommends NPS to be always 1 */
+	if (board->no_power_switching)
+		val |= RH_A_NPS;
+	if (board->power_switching_mode)
+		val |= RH_A_PSM;
+	/* power-on-to-power-good time: board value, or 25 by default */
+	if (board->potpg)
+		val |= (board->potpg << 24) & RH_A_POTPGT;
+	else
+		val |= (25 << 24) & RH_A_POTPGT;
+	isp116x_write_reg32(isp116x, HCRHDESCA, val);
+	/* cache what the chip actually reports back */
+	isp116x->rhdesca = isp116x_read_reg32(isp116x, HCRHDESCA);
+
+	val = RH_B_PPCM;
+	isp116x_write_reg32(isp116x, HCRHDESCB, val);
+	isp116x->rhdescb = isp116x_read_reg32(isp116x, HCRHDESCB);
+
+	val = 0;
+	if (board->remote_wakeup_enable) {
+		hcd->can_wakeup = 1;
+		val |= RH_HS_DRWE;
+	}
+	isp116x_write_reg32(isp116x, HCRHSTATUS, val);
+	isp116x->rhstatus = isp116x_read_reg32(isp116x, HCRHSTATUS);
+
+	isp116x_write_reg32(isp116x, HCFMINTVL, 0x27782edf);
+	spin_unlock_irqrestore(&isp116x->lock, flags);
+
+	/* the lock is dropped while the root hub device is set up */
+	udev = usb_alloc_dev(NULL, &hcd->self, 0);
+	if (!udev) {
+		isp116x_stop(hcd);
+		return -ENOMEM;
+	}
+
+	udev->speed = USB_SPEED_FULL;
+	hcd->state = HC_STATE_RUNNING;
+
+	if (usb_hcd_register_root_hub(udev, hcd) != 0) {
+		isp116x_stop(hcd);
+		usb_put_dev(udev);
+		return -ENODEV;
+	}
+
+	spin_lock_irqsave(&isp116x->lock, flags);
+	/* Set up interrupts */
+	isp116x->intenb = HCINT_MIE | HCINT_RHSC | HCINT_UE;
+	if (board->remote_wakeup_enable)
+		isp116x->intenb |= HCINT_RD;
+	isp116x->irqenb = HCuPINT_ATL | HCuPINT_OPR;	/* | HCuPINT_SUSP; */
+	isp116x_write_reg32(isp116x, HCINTENB, isp116x->intenb);
+	isp116x_write_reg16(isp116x, HCuPINTENB, isp116x->irqenb);
+
+	/* Go operational */
+	val = HCCONTROL_USB_OPER;
+	/* Remote wakeup connected - NOT SUPPORTED */
+	/*  if (board->remote_wakeup_connected)
+	   val |= HCCONTROL_RWC;  */
+	if (board->remote_wakeup_enable)
+		val |= HCCONTROL_RWE;
+	isp116x_write_reg32(isp116x, HCCONTROL, val);
+
+	/* Disable ports to avoid race in device enumeration */
+	isp116x_write_reg32(isp116x, HCRHPORT1, RH_PS_CCS);
+	isp116x_write_reg32(isp116x, HCRHPORT2, RH_PS_CCS);
+
+	isp116x_show_regs(isp116x);
+	spin_unlock_irqrestore(&isp116x->lock, flags);
+	return 0;
+}
+
+/*-----------------------------------------------------------------*/
+
+/*
+ * Host controller driver operations handed to usb_create_hcd() in the
+ * probe routine.  hub_suspend/hub_resume are NULL when CONFIG_PM is
+ * not set (see the #define fallbacks above).
+ */
+static struct hc_driver isp116x_hc_driver = {
+	.description = hcd_name,
+	.product_desc = "ISP116x Host Controller",
+	/* per-controller private area: one struct isp116x */
+	.hcd_priv_size = sizeof(struct isp116x),
+
+	.irq = isp116x_irq,
+	.flags = HCD_USB11,
+
+	.reset = isp116x_reset,
+	.start = isp116x_start,
+	.stop = isp116x_stop,
+
+	.urb_enqueue = isp116x_urb_enqueue,
+	.urb_dequeue = isp116x_urb_dequeue,
+	.endpoint_disable = isp116x_endpoint_disable,
+
+	.get_frame_number = isp116x_get_frame,
+
+	.hub_status_data = isp116x_hub_status_data,
+	.hub_control = isp116x_hub_control,
+	.hub_suspend = isp116x_hub_suspend,
+	.hub_resume = isp116x_hub_resume,
+};
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Driver remove callback: undo everything the probe routine set up -
+ * debug file, registered HCD, both register mappings and memory
+ * regions - and drop the HCD reference.  Always returns 0.
+ */
+static int __init_or_module isp116x_remove(struct device *dev)
+{
+	struct usb_hcd *hcd = dev_get_drvdata(dev);
+	struct isp116x *isp116x = hcd_to_isp116x(hcd);
+	struct platform_device *pdev;
+	struct resource *res;
+
+	pdev = container_of(dev, struct platform_device, dev);
+	remove_debug_file(isp116x);
+	usb_remove_hcd(hcd);
+
+	/* NOTE(review): probe maps data_reg from MEM resource 0 and
+	   addr_reg from MEM resource 1; the unmap/release pairs below
+	   are crossed, although both regions do end up released. */
+	iounmap(isp116x->data_reg);
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+	release_mem_region(res->start, 2);
+	iounmap(isp116x->addr_reg);
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	release_mem_region(res->start, 2);
+
+	usb_put_hcd(hcd);
+	return 0;
+}
+
+#define resource_len(r) (((r)->end - (r)->start) + 1)
+
+/*
+ * Driver probe callback.
+ *
+ * Expects a platform device with at least three resources: MEM 0 is
+ * the data register, MEM 1 the address register, plus an IRQ.
+ * Claims and maps both registers, creates the HCD, checks the
+ * platform data and registers the HCD with the USB core.
+ *
+ * Returns 0 on success; on failure everything acquired so far is
+ * released through the err* labels and a negative errno is returned
+ * (-ENODEV, -EINVAL, -EBUSY, -ENOMEM or the usb_add_hcd() result).
+ */
+static int __init isp116x_probe(struct device *dev)
+{
+	struct usb_hcd *hcd;
+	struct isp116x *isp116x;
+	struct platform_device *pdev;
+	struct resource *addr, *data;
+	void __iomem *addr_reg;
+	void __iomem *data_reg;
+	int irq;
+	int ret = 0;
+
+	pdev = container_of(dev, struct platform_device, dev);
+	if (pdev->num_resources < 3) {
+		ret = -ENODEV;
+		goto err1;
+	}
+
+	data = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	addr = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+	irq = platform_get_irq(pdev, 0);
+	if (!addr || !data || irq < 0) {
+		ret = -ENODEV;
+		goto err1;
+	}
+
+	/* a device that advertises a DMA mask is rejected */
+	if (dev->dma_mask) {
+		DBG("DMA not supported\n");
+		ret = -EINVAL;
+		goto err1;
+	}
+
+	/* NOTE(review): regions are requested with a fixed length of 2
+	   but mapped with resource_len() - confirm this is intended. */
+	if (!request_mem_region(addr->start, 2, hcd_name)) {
+		ret = -EBUSY;
+		goto err1;
+	}
+	addr_reg = ioremap(addr->start, resource_len(addr));
+	if (addr_reg == NULL) {
+		ret = -ENOMEM;
+		goto err2;
+	}
+	if (!request_mem_region(data->start, 2, hcd_name)) {
+		ret = -EBUSY;
+		goto err3;
+	}
+	data_reg = ioremap(data->start, resource_len(data));
+	if (data_reg == NULL) {
+		ret = -ENOMEM;
+		goto err4;
+	}
+
+	/* allocate and initialize hcd */
+	hcd = usb_create_hcd(&isp116x_hc_driver, dev, dev->bus_id);
+	if (!hcd) {
+		ret = -ENOMEM;
+		goto err5;
+	}
+	/* this rsrc_start is bogus */
+	hcd->rsrc_start = addr->start;
+	isp116x = hcd_to_isp116x(hcd);
+	isp116x->data_reg = data_reg;
+	isp116x->addr_reg = addr_reg;
+	spin_lock_init(&isp116x->lock);
+	INIT_LIST_HEAD(&isp116x->async);
+	INIT_WORK(&isp116x->rh_resume, isp116x_rh_resume, hcd);
+	isp116x->board = dev->platform_data;
+
+	if (!isp116x->board) {
+		ERR("Platform data structure not initialized\n");
+		ret = -ENODEV;
+		goto err6;
+	}
+	if (isp116x_check_platform_delay(isp116x)) {
+		ERR("USE_PLATFORM_DELAY defined, but delay function not "
+		    "implemented.\n");
+		ERR("See comments in drivers/usb/host/isp116x-hcd.c\n");
+		ret = -ENODEV;
+		goto err6;
+	}
+
+	ret = usb_add_hcd(hcd, irq, SA_INTERRUPT);
+	if (ret != 0)
+		goto err6;
+
+	create_debug_file(isp116x);
+	return 0;
+
+	/* unwind in reverse order of acquisition */
+      err6:
+	usb_put_hcd(hcd);
+      err5:
+	iounmap(data_reg);
+      err4:
+	release_mem_region(data->start, 2);
+      err3:
+	iounmap(addr_reg);
+      err2:
+	release_mem_region(addr->start, 2);
+      err1:
+	ERR("init error, %d\n", ret);
+	return ret;
+}
+
+#ifdef	CONFIG_PM
+/*
+  Suspend of platform device.
+
+  Acts only in the SUSPEND_DISABLE and SUSPEND_POWER_DOWN phases; any
+  other phase returns 0 without doing anything.  Suspends the root hub
+  and, on success, records the new power state.  Returns the result of
+  usb_suspend_device().
+*/
+static int isp116x_suspend(struct device *dev, pm_message_t state, u32 phase)
+{
+	struct usb_hcd *hcd = dev_get_drvdata(dev);
+	int err;
+
+	VDBG("%s: state %x, phase %x\n", __func__, state, phase);
+
+	if (!(phase == SUSPEND_DISABLE || phase == SUSPEND_POWER_DOWN))
+		return 0;
+
+	err = usb_suspend_device(hcd->self.root_hub, state);
+	if (err) {
+		ERR("%s suspend failed\n", (char *)hcd_name);
+		return err;
+	}
+
+	dev->power.power_state = state;
+	INFO("%s suspended\n", (char *)hcd_name);
+	return 0;
+}
+
+/*
+  Resume platform device.
+
+  Acts only in the RESUME_POWER_ON phase; any other phase returns 0
+  without doing anything.  Resumes the root hub and, on success, marks
+  the device as powered on.  Returns the result of usb_resume_device().
+*/
+static int isp116x_resume(struct device *dev, u32 phase)
+{
+	struct usb_hcd *hcd = dev_get_drvdata(dev);
+	int err;
+
+	VDBG("%s:  state %x, phase %x\n", __func__, dev->power.power_state,
+	     phase);
+	if (phase != RESUME_POWER_ON)
+		return 0;
+
+	err = usb_resume_device(hcd->self.root_hub);
+	if (err)
+		return err;
+
+	dev->power.power_state = PMSG_ON;
+	VDBG("%s resumed\n", (char *)hcd_name);
+	return 0;
+}
+
+#else
+
+#define	isp116x_suspend    NULL
+#define	isp116x_resume     NULL
+
+#endif
+
+/*
+ * Platform-bus driver description registered by isp116x_init().
+ * suspend/resume are NULL when CONFIG_PM is not set (see the #define
+ * fallbacks above).
+ */
+static struct device_driver isp116x_driver = {
+	.name = (char *)hcd_name,
+	.bus = &platform_bus_type,
+	.probe = isp116x_probe,
+	.remove = isp116x_remove,
+	.suspend = isp116x_suspend,
+	.resume = isp116x_resume,
+};
+
+/*-----------------------------------------------------------------*/
+
+/*
+ * Module init: refuse to load with -ENODEV when USB is disabled,
+ * otherwise announce the driver and register it; returns the result
+ * of driver_register().
+ */
+static int __init isp116x_init(void)
+{
+	if (usb_disabled())
+		return -ENODEV;
+
+	INFO("driver %s, %s\n", hcd_name, DRIVER_VERSION);
+	return driver_register(&isp116x_driver);
+}
+
+module_init(isp116x_init);
+
+/* Module exit: unregister the driver registered by isp116x_init(). */
+static void __exit isp116x_cleanup(void)
+{
+	driver_unregister(&isp116x_driver);
+}
+
+module_exit(isp116x_cleanup);
diff --git a/drivers/usb/host/isp116x.h b/drivers/usb/host/isp116x.h
new file mode 100644
index 000000000000..58873470dcf5
--- /dev/null
+++ b/drivers/usb/host/isp116x.h
@@ -0,0 +1,583 @@
+/*
+ * ISP116x register declarations and HCD data structures
+ *
+ * Copyright (C) 2005 Olav Kongas <ok@artecdesign.ee>
+ * Portions:
+ * Copyright (C) 2004 Lothar Wassmann
+ * Copyright (C) 2004 Psion Teklogix
+ * Copyright (C) 2004 David Brownell
+ */
+
+/* us of 1ms frame */
+#define  MAX_LOAD_LIMIT		850
+
+/* Full speed: max # of bytes to transfer for a single urb
+   at a time must be < 1024 && must be multiple of 64.
+   832 allows transferring 4kiB within 5 frames. */
+#define MAX_TRANSFER_SIZE_FULLSPEED	832
+
+/* Low speed: there is no reason to schedule in very big
+   chunks; often the requested long transfers are for
+   string descriptors containing short strings. */
+#define MAX_TRANSFER_SIZE_LOWSPEED	64
+
+/* Bytetime (us), a rough indication of how much time it
+   would take to transfer a byte of useful data over USB */
+#define BYTE_TIME_FULLSPEED	1
+#define BYTE_TIME_LOWSPEED	20
+
+/* Buffer sizes */
+#define ISP116x_BUF_SIZE	4096
+#define ISP116x_ITL_BUFSIZE	0
+#define ISP116x_ATL_BUFSIZE	((ISP116x_BUF_SIZE) - 2*(ISP116x_ITL_BUFSIZE))
+
+#define ISP116x_WRITE_OFFSET	0x80
+
+/*------------ ISP116x registers/bits ------------*/
+#define	HCREVISION	0x00
+#define	HCCONTROL	0x01
+#define		HCCONTROL_HCFS	(3 << 6)	/* host controller
+						   functional state */
+#define		HCCONTROL_USB_RESET	(0 << 6)
+#define		HCCONTROL_USB_RESUME	(1 << 6)
+#define		HCCONTROL_USB_OPER	(2 << 6)
+#define		HCCONTROL_USB_SUSPEND	(3 << 6)
+#define		HCCONTROL_RWC	(1 << 9)	/* remote wakeup connected */
+#define		HCCONTROL_RWE	(1 << 10)	/* remote wakeup enable */
+#define	HCCMDSTAT	0x02
+#define		HCCMDSTAT_HCR	(1 << 0)	/* host controller reset */
+#define		HCCMDSTAT_SOC	(3 << 16)	/* scheduling overrun count */
+/* Interrupt status register; the HCINT_* bits are also used for the
+   interrupt enable register (HCINTENB). */
+#define	HCINTSTAT	0x03
+#define		HCINT_SO	(1 << 0)	/* scheduling overrun */
+#define		HCINT_WDH	(1 << 1)	/* writeback of done_head */
+#define		HCINT_SF	(1 << 2)	/* start frame */
+#define		HCINT_RD	(1 << 3)	/* resume detect */
+#define		HCINT_UE	(1 << 4)	/* unrecoverable error */
+#define		HCINT_FNO	(1 << 5)	/* frame number overflow */
+#define		HCINT_RHSC	(1 << 6)	/* root hub status change */
+#define		HCINT_OC	(1 << 30)	/* ownership change */
+/* bit 31 must be built from an unsigned constant: shifting a signed 1
+   into the sign bit is undefined behaviour */
+#define		HCINT_MIE	(1U << 31)	/* master interrupt enable */
+#define	HCINTENB	0x04
+#define	HCINTDIS	0x05
+#define	HCFMINTVL	0x0d
+#define	HCFMREM		0x0e
+#define	HCFMNUM		0x0f
+#define	HCLSTHRESH	0x11
+/* Root hub descriptor registers A and B.  Masks that reach bit 31 are
+   built from unsigned constants so the shift cannot overflow int. */
+#define	HCRHDESCA	0x12
+#define		RH_A_NDP	(0x3 << 0)	/* # downstream ports */
+#define		RH_A_PSM	(1 << 8)	/* power switching mode */
+#define		RH_A_NPS	(1 << 9)	/* no power switching */
+#define		RH_A_DT		(1 << 10)	/* device type (mbz) */
+#define		RH_A_OCPM	(1 << 11)	/* overcurrent protection
+						   mode */
+#define		RH_A_NOCP	(1 << 12)	/* no overcurrent protection */
+#define		RH_A_POTPGT	(0xffU << 24)	/* power on -> power good
+						   time */
+#define	HCRHDESCB	0x13
+#define		RH_B_DR		(0xffff << 0)	/* device removable flags */
+#define		RH_B_PPCM	(0xffffU << 16)	/* port power control mask */
+/* Root hub status register. */
+#define	HCRHSTATUS	0x14
+#define		RH_HS_LPS	(1 << 0)	/* local power status */
+#define		RH_HS_OCI	(1 << 1)	/* over current indicator */
+#define		RH_HS_DRWE	(1 << 15)	/* device remote wakeup
+						   enable */
+#define		RH_HS_LPSC	(1 << 16)	/* local power status change */
+#define		RH_HS_OCIC	(1 << 17)	/* over current indicator
+						   change */
+/* bit 31 must be built from an unsigned constant: shifting a signed 1
+   into the sign bit is undefined behaviour */
+#define		RH_HS_CRWE	(1U << 31)	/* clear remote wakeup
+						   enable */
+#define	HCRHPORT1	0x15
+#define		RH_PS_CCS	(1 << 0)	/* current connect status */
+#define		RH_PS_PES	(1 << 1)	/* port enable status */
+#define		RH_PS_PSS	(1 << 2)	/* port suspend status */
+#define		RH_PS_POCI	(1 << 3)	/* port over current
+						   indicator */
+#define		RH_PS_PRS	(1 << 4)	/* port reset status */
+#define		RH_PS_PPS	(1 << 8)	/* port power status */
+#define		RH_PS_LSDA	(1 << 9)	/* low speed device attached */
+#define		RH_PS_CSC	(1 << 16)	/* connect status change */
+#define		RH_PS_PESC	(1 << 17)	/* port enable status change */
+#define		RH_PS_PSSC	(1 << 18)	/* port suspend status
+						   change */
+#define		RH_PS_OCIC	(1 << 19)	/* over current indicator
+						   change */
+#define		RH_PS_PRSC	(1 << 20)	/* port reset status change */
+#define		HCRHPORT_CLRMASK	(0x1f << 16)
+#define	HCRHPORT2	0x16
+#define	HCHWCFG		0x20
+#define		HCHWCFG_15KRSEL		(1 << 12)
+#define		HCHWCFG_CLKNOTSTOP	(1 << 11)
+#define		HCHWCFG_ANALOG_OC	(1 << 10)
+#define		HCHWCFG_DACK_MODE	(1 << 8)
+#define		HCHWCFG_EOT_POL		(1 << 7)
+#define		HCHWCFG_DACK_POL	(1 << 6)
+#define		HCHWCFG_DREQ_POL	(1 << 5)
+#define		HCHWCFG_DBWIDTH_MASK	(0x03 << 3)
+#define		HCHWCFG_DBWIDTH(n)	(((n) << 3) & HCHWCFG_DBWIDTH_MASK)
+#define		HCHWCFG_INT_POL		(1 << 2)
+#define		HCHWCFG_INT_TRIGGER	(1 << 1)
+#define		HCHWCFG_INT_ENABLE	(1 << 0)
+#define	HCDMACFG	0x21
+#define		HCDMACFG_BURST_LEN_MASK	(0x03 << 5)
+#define		HCDMACFG_BURST_LEN(n)	(((n) << 5) & HCDMACFG_BURST_LEN_MASK)
+#define		HCDMACFG_BURST_LEN_1	HCDMACFG_BURST_LEN(0)
+#define		HCDMACFG_BURST_LEN_4	HCDMACFG_BURST_LEN(1)
+#define		HCDMACFG_BURST_LEN_8	HCDMACFG_BURST_LEN(2)
+#define		HCDMACFG_DMA_ENABLE	(1 << 4)
+#define		HCDMACFG_BUF_TYPE_MASK	(0x07 << 1)
+#define		HCDMACFG_CTR_SEL	(1 << 2)
+#define		HCDMACFG_ITLATL_SEL	(1 << 1)
+#define		HCDMACFG_DMA_RW_SELECT	(1 << 0)
+#define	HCXFERCTR	0x22
+#define	HCuPINT		0x24
+#define		HCuPINT_SOF		(1 << 0)
+#define		HCuPINT_ATL		(1 << 1)
+#define		HCuPINT_AIIEOT		(1 << 2)
+#define		HCuPINT_OPR		(1 << 4)
+#define		HCuPINT_SUSP		(1 << 5)
+#define		HCuPINT_CLKRDY		(1 << 6)
+#define	HCuPINTENB	0x25
+#define	HCCHIPID	0x27
+#define		HCCHIPID_MASK		0xff00
+#define		HCCHIPID_MAGIC		0x6100
+#define	HCSCRATCH	0x28
+#define	HCSWRES		0x29
+#define		HCSWRES_MAGIC		0x00f6
+#define	HCITLBUFLEN	0x2a
+#define	HCATLBUFLEN	0x2b
+#define	HCBUFSTAT	0x2c
+#define		HCBUFSTAT_ITL0_FULL	(1 << 0)
+#define		HCBUFSTAT_ITL1_FULL	(1 << 1)
+#define		HCBUFSTAT_ATL_FULL	(1 << 2)
+#define		HCBUFSTAT_ITL0_DONE	(1 << 3)
+#define		HCBUFSTAT_ITL1_DONE	(1 << 4)
+#define		HCBUFSTAT_ATL_DONE	(1 << 5)
+#define	HCRDITL0LEN	0x2d
+#define	HCRDITL1LEN	0x2e
+#define	HCITLPORT	0x40
+#define	HCATLPORT	0x41
+
+/* Philips transfer descriptor: four 16-bit words.  The masks that
+   follow each member name the bit fields packed into that word; the
+   PTD_* accessor macros below use them. */
+struct ptd {
+	u16 count;
+#define	PTD_COUNT_MSK	(0x3ff << 0)
+#define	PTD_TOGGLE_MSK	(1 << 10)
+#define	PTD_ACTIVE_MSK	(1 << 11)
+#define	PTD_CC_MSK	(0xf << 12)
+	u16 mps;
+#define	PTD_MPS_MSK	(0x3ff << 0)
+#define	PTD_SPD_MSK	(1 << 10)
+#define	PTD_LAST_MSK	(1 << 11)
+#define	PTD_EP_MSK	(0xf << 12)
+	u16 len;
+#define	PTD_LEN_MSK	(0x3ff << 0)
+#define	PTD_DIR_MSK	(3 << 10)
+	/* values of the direction field */
+#define	PTD_DIR_SETUP	(0)
+#define	PTD_DIR_OUT	(1)
+#define	PTD_DIR_IN	(2)
+#define	PTD_B5_5_MSK	(1 << 13)
+	u16 faddr;
+#define	PTD_FA_MSK	(0x7f << 0)
+#define	PTD_FMT_MSK	(1 << 7)
+} __attribute__ ((packed, aligned(2)));
+
+/* PTD accessor macros. */
+#define PTD_GET_COUNT(p)	(((p)->count & PTD_COUNT_MSK) >> 0)
+#define PTD_COUNT(v)		(((v) << 0) & PTD_COUNT_MSK)
+#define PTD_GET_TOGGLE(p)	(((p)->count & PTD_TOGGLE_MSK) >> 10)
+#define PTD_TOGGLE(v)		(((v) << 10) & PTD_TOGGLE_MSK)
+#define PTD_GET_ACTIVE(p)	(((p)->count & PTD_ACTIVE_MSK) >> 11)
+#define PTD_ACTIVE(v)		(((v) << 11) & PTD_ACTIVE_MSK)
+#define PTD_GET_CC(p)		(((p)->count & PTD_CC_MSK) >> 12)
+#define PTD_CC(v)		(((v) << 12) & PTD_CC_MSK)
+#define PTD_GET_MPS(p)		(((p)->mps & PTD_MPS_MSK) >> 0)
+#define PTD_MPS(v)		(((v) << 0) & PTD_MPS_MSK)
+#define PTD_GET_SPD(p)		(((p)->mps & PTD_SPD_MSK) >> 10)
+#define PTD_SPD(v)		(((v) << 10) & PTD_SPD_MSK)
+#define PTD_GET_LAST(p)		(((p)->mps & PTD_LAST_MSK) >> 11)
+#define PTD_LAST(v)		(((v) << 11) & PTD_LAST_MSK)
+#define PTD_GET_EP(p)		(((p)->mps & PTD_EP_MSK) >> 12)
+#define PTD_EP(v)		(((v) << 12) & PTD_EP_MSK)
+#define PTD_GET_LEN(p)		(((p)->len & PTD_LEN_MSK) >> 0)
+#define PTD_LEN(v)		(((v) << 0) & PTD_LEN_MSK)
+#define PTD_GET_DIR(p)		(((p)->len & PTD_DIR_MSK) >> 10)
+#define PTD_DIR(v)		(((v) << 10) & PTD_DIR_MSK)
+#define PTD_GET_B5_5(p)		(((p)->len & PTD_B5_5_MSK) >> 13)
+#define PTD_B5_5(v)		(((v) << 13) & PTD_B5_5_MSK)
+#define PTD_GET_FA(p)		(((p)->faddr & PTD_FA_MSK) >> 0)
+#define PTD_FA(v)		(((v) << 0) & PTD_FA_MSK)
+#define PTD_GET_FMT(p)		(((p)->faddr & PTD_FMT_MSK) >> 7)
+#define PTD_FMT(v)		(((v) << 7) & PTD_FMT_MSK)
+
+/*  Hardware transfer status codes -- CC from ptd->count */
+#define TD_CC_NOERROR      0x00
+#define TD_CC_CRC          0x01
+#define TD_CC_BITSTUFFING  0x02
+#define TD_CC_DATATOGGLEM  0x03
+#define TD_CC_STALL        0x04
+#define TD_DEVNOTRESP      0x05
+#define TD_PIDCHECKFAIL    0x06
+#define TD_UNEXPECTEDPID   0x07
+#define TD_DATAOVERRUN     0x08
+#define TD_DATAUNDERRUN    0x09
+    /* 0x0A, 0x0B reserved for hardware */
+#define TD_BUFFEROVERRUN   0x0C
+#define TD_BUFFERUNDERRUN  0x0D
+    /* 0x0E, 0x0F reserved for HCD */
+#define TD_NOTACCESSED     0x0F
+
+/* map PTD status codes (CC) to errno values; indexed by the 4-bit
+   completion code, i.e. by the TD_* values above */
+static const int cc_to_error[16] = {
+	/* No  Error  */ 0,
+	/* CRC Error  */ -EILSEQ,
+	/* Bit Stuff  */ -EPROTO,
+	/* Data Togg  */ -EILSEQ,
+	/* Stall      */ -EPIPE,
+	/* DevNotResp */ -ETIMEDOUT,
+	/* PIDCheck   */ -EPROTO,
+	/* UnExpPID   */ -EPROTO,
+	/* DataOver   */ -EOVERFLOW,
+	/* DataUnder  */ -EREMOTEIO,
+	/* (for hw)   */ -EIO,
+	/* (for hw)   */ -EIO,
+	/* BufferOver */ -ECOMM,
+	/* BuffUnder  */ -ENOSR,
+	/* (for HCD)  */ -EALREADY,
+	/* (for HCD)  */ -EALREADY
+};
+
+/*--------------------------------------------------------------*/
+
+#define	LOG2_PERIODIC_SIZE	5	/* arbitrary; this matches OHCI */
+#define	PERIODIC_SIZE		(1 << LOG2_PERIODIC_SIZE)
+
+/* Per-controller driver state; lives in the private area of the
+   usb_hcd (see hcd_to_isp116x() below). */
+struct isp116x {
+	spinlock_t lock;	/* held around register accesses */
+	struct work_struct rh_resume;	/* work item for root hub resume */
+
+	/* the chip is driven through two registers: the register index
+	   is written to addr_reg, data moves through data_reg */
+	void __iomem *addr_reg;
+	void __iomem *data_reg;
+
+	struct isp116x_platform_data *board;
+
+	struct proc_dir_entry *pde;	/* proc debug entry, if created */
+	/* NOTE(review): presumably statistics counters; not used in
+	   this header - confirm in the HCD source */
+	unsigned long stat1, stat2, stat4, stat8, stat16;
+
+	/* HC registers */
+	u32 intenb;		/* "OHCI" interrupts */
+	u16 irqenb;		/* uP interrupts */
+
+	/* Root hub registers */
+	u32 rhdesca;
+	u32 rhdescb;
+	u32 rhstatus;
+	u32 rhport[2];
+
+	/* async schedule: control, bulk */
+	struct list_head async;
+
+	/* periodic schedule: int */
+	u16 load[PERIODIC_SIZE];
+	struct isp116x_ep *periodic[PERIODIC_SIZE];
+	unsigned periodic_count;
+	u16 fmindex;
+
+	/* Schedule for the current frame */
+	struct isp116x_ep *atl_active;
+	int atl_buflen;
+	int atl_bufshrt;
+	int atl_last_dir;
+	atomic_t atl_finishing;
+};
+
+/* Return the driver state stored in the hcd's private area. */
+static inline struct isp116x *hcd_to_isp116x(struct usb_hcd *hcd)
+{
+	return (struct isp116x *)(hcd->hcd_priv);
+}
+
+/* Inverse of hcd_to_isp116x(): recover the usb_hcd whose hcd_priv
+   member holds this driver state. */
+static inline struct usb_hcd *isp116x_to_hcd(struct isp116x *isp116x)
+{
+	return container_of((void *)isp116x, struct usb_hcd, hcd_priv);
+}
+
+/* Driver-side state of one endpoint. */
+struct isp116x_ep {
+	struct usb_host_endpoint *hep;	/* core endpoint; owns urb_list */
+	struct usb_device *udev;
+	struct ptd ptd;		/* transfer descriptor for this endpoint */
+
+	u8 maxpacket;
+	u8 epnum;
+	u8 nextpid;		/* USB_PID_* of the next transaction */
+	u16 error_count;
+	u16 length;		/* of current packet */
+	unsigned char *data;	/* to databuf */
+	/* queue of active EP's (the ones scheduled for the
+	   current frame) */
+	struct isp116x_ep *active;
+
+	/* periodic schedule */
+	u16 period;
+	u16 branch;
+	u16 load;
+	struct isp116x_ep *next;	/* next endpoint in the same slot */
+
+	/* async schedule */
+	struct list_head schedule;
+};
+
+/*-------------------------------------------------------------------------*/
+
+#ifdef DEBUG
+#define DBG(stuff...)		printk(KERN_DEBUG "116x: " stuff)
+#else
+#define DBG(stuff...)		do{}while(0)
+#endif
+
+#ifdef VERBOSE
+#    define VDBG		DBG
+#else
+#    define VDBG(stuff...)	do{}while(0)
+#endif
+
+#define ERR(stuff...)		printk(KERN_ERR "116x: " stuff)
+#define WARN(stuff...)		printk(KERN_WARNING "116x: " stuff)
+#define INFO(stuff...)		printk(KERN_INFO "116x: " stuff)
+
+/* ------------------------------------------------- */
+
+#if defined(USE_PLATFORM_DELAY)
+#if defined(USE_NDELAY)
+#error USE_PLATFORM_DELAY and USE_NDELAY simultaneously defined.
+#endif
+#define	isp116x_delay(h,d)	(h)->board->delay(	\
+				isp116x_to_hcd(h)->self.controller,d)
+#define isp116x_check_platform_delay(h)	((h)->board->delay == NULL)
+#elif defined(USE_NDELAY)
+#define	isp116x_delay(h,d)	ndelay(d)
+#define isp116x_check_platform_delay(h)	0
+#else
+#define	isp116x_delay(h,d)	do{}while(0)
+#define isp116x_check_platform_delay(h)	0
+#endif
+
+#if defined(DEBUG)
+#define	IRQ_TEST()	BUG_ON(!irqs_disabled())
+#else
+#define	IRQ_TEST()	do{}while(0)
+#endif
+
+/* Select a chip register: write the low 8 bits of reg to the address
+   register, then wait 300 units of isp116x_delay() (ns when ndelay is
+   used).  IRQ_TEST() asserts, in DEBUG builds, that interrupts are
+   disabled. */
+static inline void isp116x_write_addr(struct isp116x *isp116x, unsigned reg)
+{
+	IRQ_TEST();
+	writew(reg & 0xff, isp116x->addr_reg);
+	isp116x_delay(isp116x, 300);
+}
+
+/* Write one 16-bit word to the data register with writew(), then
+   apply the access delay. */
+static inline void isp116x_write_data16(struct isp116x *isp116x, u16 val)
+{
+	writew(val, isp116x->data_reg);
+	isp116x_delay(isp116x, 150);
+}
+
+/* Same as isp116x_write_data16() but uses __raw_writew() for the
+   access. */
+static inline void isp116x_raw_write_data16(struct isp116x *isp116x, u16 val)
+{
+	__raw_writew(val, isp116x->data_reg);
+	isp116x_delay(isp116x, 150);
+}
+
+/* Read one 16-bit word from the data register with readw(), apply the
+   access delay and return the word. */
+static inline u16 isp116x_read_data16(struct isp116x *isp116x)
+{
+	u16 val;
+
+	val = readw(isp116x->data_reg);
+	isp116x_delay(isp116x, 150);
+	return val;
+}
+
+/* Same as isp116x_read_data16() but uses __raw_readw() for the
+   access. */
+static inline u16 isp116x_raw_read_data16(struct isp116x *isp116x)
+{
+	u16 val;
+
+	val = __raw_readw(isp116x->data_reg);
+	isp116x_delay(isp116x, 150);
+	return val;
+}
+
+/* Write a 32-bit value to the data register as two 16-bit words, low
+   half first, with the access delay after each word. */
+static inline void isp116x_write_data32(struct isp116x *isp116x, u32 val)
+{
+	const u16 lo = val & 0xffff;
+	const u16 hi = val >> 16;
+
+	writew(lo, isp116x->data_reg);
+	isp116x_delay(isp116x, 150);
+
+	writew(hi, isp116x->data_reg);
+	isp116x_delay(isp116x, 150);
+}
+
+/* Read a 32-bit value from the data register as two 16-bit words, low
+   half first, with the access delay after each word. */
+static inline u32 isp116x_read_data32(struct isp116x *isp116x)
+{
+	u32 lo, hi;
+
+	lo = readw(isp116x->data_reg);
+	isp116x_delay(isp116x, 150);
+
+	hi = readw(isp116x->data_reg);
+	isp116x_delay(isp116x, 150);
+
+	return lo | (hi << 16);
+}
+
+/* Let's keep register access functions out of line. Hint:
+   we wait at least 150 ns at every access.
+*/
+/* Read a 16-bit chip register: select it, then fetch one data word. */
+static u16 isp116x_read_reg16(struct isp116x *isp116x, unsigned reg)
+{
+	isp116x_write_addr(isp116x, reg);
+	return isp116x_read_data16(isp116x);
+}
+
+/* Read a 32-bit chip register: select it, then fetch two data words. */
+static u32 isp116x_read_reg32(struct isp116x *isp116x, unsigned reg)
+{
+	isp116x_write_addr(isp116x, reg);
+	return isp116x_read_data32(isp116x);
+}
+
+static void isp116x_write_reg16(struct isp116x *isp116x, unsigned reg,
+				unsigned val)
+{
+	isp116x_write_addr(isp116x, reg | ISP116x_WRITE_OFFSET);
+	isp116x_write_data16(isp116x, (u16) (val & 0xffff));
+}
+
+static void isp116x_write_reg32(struct isp116x *isp116x, unsigned reg,
+				unsigned val)
+{
+	isp116x_write_addr(isp116x, reg | ISP116x_WRITE_OFFSET);
+	isp116x_write_data32(isp116x, (u32) val);
+}
+
+#define isp116x_show_reg(d,r) do {	/* dump register r by name */	\
+	if ((r) < 0x20) {		/* below 0x20: 32-bit read */	\
+		DBG("%-12s[%02x]: %08x\n", #r,			\
+			(r), isp116x_read_reg32((d), (r)));	\
+	} else {			/* otherwise: 16-bit read */	\
+		DBG("%-12s[%02x]:     %04x\n", #r,		\
+			(r), isp116x_read_reg16((d), (r)));	\
+	}							\
+} while (0)
+
+static inline void isp116x_show_regs(struct isp116x *isp116x)
+{
+	isp116x_show_reg(isp116x, HCREVISION);
+	isp116x_show_reg(isp116x, HCCONTROL);
+	isp116x_show_reg(isp116x, HCCMDSTAT);
+	isp116x_show_reg(isp116x, HCINTSTAT);
+	isp116x_show_reg(isp116x, HCINTENB);
+	isp116x_show_reg(isp116x, HCFMINTVL);
+	isp116x_show_reg(isp116x, HCFMREM);
+	isp116x_show_reg(isp116x, HCFMNUM);
+	isp116x_show_reg(isp116x, HCLSTHRESH);
+	isp116x_show_reg(isp116x, HCRHDESCA);
+	isp116x_show_reg(isp116x, HCRHDESCB);
+	isp116x_show_reg(isp116x, HCRHSTATUS);
+	isp116x_show_reg(isp116x, HCRHPORT1);
+	isp116x_show_reg(isp116x, HCRHPORT2);
+	isp116x_show_reg(isp116x, HCHWCFG);
+	isp116x_show_reg(isp116x, HCDMACFG);
+	isp116x_show_reg(isp116x, HCXFERCTR);
+	isp116x_show_reg(isp116x, HCuPINT);
+	isp116x_show_reg(isp116x, HCuPINTENB);
+	isp116x_show_reg(isp116x, HCCHIPID);
+	isp116x_show_reg(isp116x, HCSCRATCH);
+	isp116x_show_reg(isp116x, HCITLBUFLEN);
+	isp116x_show_reg(isp116x, HCATLBUFLEN);
+	isp116x_show_reg(isp116x, HCBUFSTAT);
+	isp116x_show_reg(isp116x, HCRDITL0LEN);
+	isp116x_show_reg(isp116x, HCRDITL1LEN);
+}
+
+#if defined(URB_TRACE)
+
+#define PIPETYPE(pipe)  ({ char *__s;			\
+	if (usb_pipecontrol(pipe))	__s = "ctrl";	\
+	else if (usb_pipeint(pipe))	__s = "int";	\
+	else if (usb_pipebulk(pipe))	__s = "bulk";	\
+	else				__s = "iso";	\
+	__s;})
+#define PIPEDIR(pipe)   ({ usb_pipein(pipe) ? "in" : "out"; })
+#define URB_NOTSHORT(urb) ({ (urb)->transfer_flags & URB_SHORT_NOT_OK ? \
+	"short_not_ok" : ""; })
+
+/* print debug info about the URB */
+static void urb_dbg(struct urb *urb, char *msg)
+{
+	unsigned int pipe;
+
+	if (!urb) {
+		DBG("%s: zero urb\n", msg);
+		return;
+	}
+	pipe = urb->pipe;
+	DBG("%s: FA %d ep%d%s %s: len %d/%d %s\n", msg,
+	    usb_pipedevice(pipe), usb_pipeendpoint(pipe),
+	    PIPEDIR(pipe), PIPETYPE(pipe),
+	    urb->transfer_buffer_length, urb->actual_length, URB_NOTSHORT(urb));
+}
+
+#else
+
+#define  urb_dbg(urb,msg)   do{}while(0)
+
+#endif				/* ! defined(URB_TRACE) */
+
+#if defined(PTD_TRACE)
+
+#define PTD_DIR_STR(ptd)  ({char __c;		\
+	switch(PTD_GET_DIR(ptd)){		\
+	case 0:  __c = 's'; break;		\
+	case 1:  __c = 'o'; break;		\
+	default: __c = 'i'; break;		\
+	}; __c;})
+
+/*
+  Dump PTD info. The code documents the format
+  perfectly, right :)
+*/
+static inline void dump_ptd(struct ptd *ptd)
+{
+	printk("td: %x %d%c%d %d,%d,%d  %x %x%x%x\n",
+	       PTD_GET_CC(ptd), PTD_GET_FA(ptd),
+	       PTD_DIR_STR(ptd), PTD_GET_EP(ptd),
+	       PTD_GET_COUNT(ptd), PTD_GET_LEN(ptd), PTD_GET_MPS(ptd),
+	       PTD_GET_TOGGLE(ptd), PTD_GET_ACTIVE(ptd),
+	       PTD_GET_SPD(ptd), PTD_GET_LAST(ptd));
+}
+
+static inline void dump_ptd_out_data(struct ptd *ptd, u8 * buf)
+{
+	int k;
+
+	if (PTD_GET_DIR(ptd) != PTD_DIR_IN && PTD_GET_LEN(ptd)) {
+		printk("-> ");
+		for (k = 0; k < PTD_GET_LEN(ptd); ++k)
+			printk("%02x ", ((u8 *) buf)[k]);
+		printk("\n");
+	}
+}
+
+static inline void dump_ptd_in_data(struct ptd *ptd, u8 * buf)
+{
+	int k;
+
+	if (PTD_GET_DIR(ptd) == PTD_DIR_IN && PTD_GET_COUNT(ptd)) {
+		printk("<- ");
+		for (k = 0; k < PTD_GET_COUNT(ptd); ++k)
+			printk("%02x ", ((u8 *) buf)[k]);
+		printk("\n");
+	}
+	if (PTD_GET_LAST(ptd))
+		printk("-\n");
+}
+
+#else
+
+#define dump_ptd(ptd)               do{}while(0)
+#define dump_ptd_in_data(ptd,buf)   do{}while(0)
+#define dump_ptd_out_data(ptd,buf)  do{}while(0)
+
+#endif				/* ! defined(PTD_TRACE) */
diff --git a/include/linux/usb_isp116x.h b/include/linux/usb_isp116x.h
new file mode 100644
index 000000000000..5f5a9d9bd6c2
--- /dev/null
+++ b/include/linux/usb_isp116x.h
@@ -0,0 +1,47 @@
+
+/*
+ * Board-specific setup for the isp116x. Board initialization code should put
+ * one of these into dev->platform_data and place the isp116x on platform_bus.
+ */
+
+struct isp116x_platform_data {
+	/* Enable internal resistors on downstream ports */
+	unsigned sel15Kres:1;
+	/* Chip's internal clock won't be stopped in suspended state.
+	   Setting/unsetting this bit takes effect only if
+	   'remote_wakeup_enable' below is not set. */
+	unsigned clknotstop:1;
+	/* Enable on-chip overcurrent protection */
+	unsigned oc_enable:1;
+	/* INT output polarity (set: active high, going by the field name) */
+	unsigned int_act_high:1;
+	/* INT trigger mode (set: edge, clear: level, going by the field name) */
+	unsigned int_edge_triggered:1;
+	/* WAKEUP pin connected - NOT SUPPORTED  */
+	/* unsigned remote_wakeup_connected:1; */
+	/* Enable wakeup by devices on the usb bus */
+	unsigned remote_wakeup_enable:1;
+	/* No port power switching: ports are kept always powered */
+	unsigned no_power_switching:1;
+	/* Ganged port power switching (0) or individual port
+	   power switching (1) */
+	unsigned power_switching_mode:1;
+	/* Port power-on to power-good time; the value is (time in msec)/2 */
+	u8 potpg;
+	/* Hardware reset set/clear. If implemented, this function must:
+	   if set == 0,   deassert chip's HW reset pin
+	   otherwise,     assert chip's HW reset pin       */
+	void (*reset) (struct device * dev, int set);
+	/* Hardware clock start/stop. If implemented, this function must:
+	   if start == 0,    stop the external clock
+	   otherwise,        start the external clock
+	 */
+	void (*clock) (struct device * dev, int start);
+	/* Inter-io delay; 'delay' is in ns. The chip is picky about access
+	   timings; it expects at least:
+	   150ns delay between consecutive accesses to DATA_REG,
+	   300ns delay between access to ADDR_REG and DATA_REG.
+	   OE, WE MUST NOT be changed during these intervals.
+	 */
+	void (*delay) (struct device * dev, int delay);
+};
-- 
cgit v1.2.3-59-g8ed1b


From 1bbc169621cbe502b9143a27eb12802a0f1d43a0 Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Sat, 7 May 2005 13:05:13 -0700
Subject: [PATCH] USB: gadget driver updates (SETUP api change)

This updates most of the gadget framework to expect SETUP packets to use
USB byteorder (matching the annotation in <linux/usb_ch9.h> and usage
in the host side stack):

  - definition in <linux/usb_gadget.h>
  - gadget drivers:  Ethernet/RNDIS, serial/ACM, file_storage, gadgetfs.
  - dummy_hcd

It also includes some other similar changes as suggested by "sparse",
which was used to detect byteorder bugs.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/gadget/dummy_hcd.c    |  3 ---
 drivers/usb/gadget/ether.c        |  6 +++---
 drivers/usb/gadget/file_storage.c | 19 +++++++++----------
 drivers/usb/gadget/inode.c        | 12 ++++++------
 drivers/usb/gadget/serial.c       | 36 +++++++++++++++++++-----------------
 drivers/usb/gadget/zero.c         |  6 +++---
 include/linux/usb_gadget.h        |  2 +-
 7 files changed, 41 insertions(+), 43 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/gadget/dummy_hcd.c b/drivers/usb/gadget/dummy_hcd.c
index 73d2f24050ab..f9540adf2a4f 100644
--- a/drivers/usb/gadget/dummy_hcd.c
+++ b/drivers/usb/gadget/dummy_hcd.c
@@ -1267,9 +1267,6 @@ restart:
 			struct dummy_ep			*ep2;
 
 			setup = *(struct usb_ctrlrequest*) urb->setup_packet;
-			le16_to_cpus (&setup.wIndex);
-			le16_to_cpus (&setup.wValue);
-			le16_to_cpus (&setup.wLength);
 			if (setup.wLength != urb->transfer_buffer_length) {
 				maybe_set_status (urb, -EOVERFLOW);
 				goto return_urb;
diff --git a/drivers/usb/gadget/ether.c b/drivers/usb/gadget/ether.c
index 3830a0a0fd50..9f8413e3c10a 100644
--- a/drivers/usb/gadget/ether.c
+++ b/drivers/usb/gadget/ether.c
@@ -1277,9 +1277,9 @@ eth_setup (struct usb_gadget *gadget, const struct usb_ctrlrequest *ctrl)
 	struct eth_dev		*dev = get_gadget_data (gadget);
 	struct usb_request	*req = dev->req;
 	int			value = -EOPNOTSUPP;
-	u16			wIndex = (__force u16) ctrl->wIndex;
-	u16			wValue = (__force u16) ctrl->wValue;
-	u16			wLength = (__force u16) ctrl->wLength;
+	u16			wIndex = le16_to_cpu(ctrl->wIndex);
+	u16			wValue = le16_to_cpu(ctrl->wValue);
+	u16			wLength = le16_to_cpu(ctrl->wLength);
 
 	/* descriptors just go into the pre-allocated ep0 buffer,
 	 * while config change events may enable network traffic.
diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c
index f5ce45c4b2a3..4f57085619b4 100644
--- a/drivers/usb/gadget/file_storage.c
+++ b/drivers/usb/gadget/file_storage.c
@@ -819,7 +819,7 @@ static void inline put_be32(u8 *buf, u32 val)
 	buf[0] = val >> 24;
 	buf[1] = val >> 16;
 	buf[2] = val >> 8;
-	buf[3] = val;
+	buf[3] = val & 0xff;
 }
 
 
@@ -1277,8 +1277,8 @@ static int class_setup_req(struct fsg_dev *fsg,
 {
 	struct usb_request	*req = fsg->ep0req;
 	int			value = -EOPNOTSUPP;
-	u16			w_index = ctrl->wIndex;
-	u16			w_length = ctrl->wLength;
+	u16			w_index = le16_to_cpu(ctrl->wIndex);
+	u16			w_length = le16_to_cpu(ctrl->wLength);
 
 	if (!fsg->config)
 		return value;
@@ -1345,7 +1345,7 @@ static int class_setup_req(struct fsg_dev *fsg,
 			"unknown class-specific control req "
 			"%02x.%02x v%04x i%04x l%u\n",
 			ctrl->bRequestType, ctrl->bRequest,
-			ctrl->wValue, w_index, w_length);
+			le16_to_cpu(ctrl->wValue), w_index, w_length);
 	return value;
 }
 
@@ -1359,8 +1359,8 @@ static int standard_setup_req(struct fsg_dev *fsg,
 {
 	struct usb_request	*req = fsg->ep0req;
 	int			value = -EOPNOTSUPP;
-	u16			w_index = ctrl->wIndex;
-	u16			w_value = ctrl->wValue;
+	u16			w_index = le16_to_cpu(ctrl->wIndex);
+	u16			w_value = le16_to_cpu(ctrl->wValue);
 
 	/* Usually this just stores reply data in the pre-allocated ep0 buffer,
 	 * but config change events will also reconfigure hardware. */
@@ -1469,7 +1469,7 @@ static int standard_setup_req(struct fsg_dev *fsg,
 		VDBG(fsg,
 			"unknown control req %02x.%02x v%04x i%04x l%u\n",
 			ctrl->bRequestType, ctrl->bRequest,
-			w_value, w_index, ctrl->wLength);
+			w_value, w_index, le16_to_cpu(ctrl->wLength));
 	}
 
 	return value;
@@ -1481,7 +1481,7 @@ static int fsg_setup(struct usb_gadget *gadget,
 {
 	struct fsg_dev		*fsg = get_gadget_data(gadget);
 	int			rc;
-	int			w_length = ctrl->wLength;
+	int			w_length = le16_to_cpu(ctrl->wLength);
 
 	++fsg->ep0_req_tag;		// Record arrival of a new request
 	fsg->ep0req->context = NULL;
@@ -1497,8 +1497,7 @@ static int fsg_setup(struct usb_gadget *gadget,
 	if (rc >= 0 && rc != DELAYED_STATUS) {
 		rc = min(rc, w_length);
 		fsg->ep0req->length = rc;
-		fsg->ep0req->zero = (rc < w_length &&
-				(rc % gadget->ep0->maxpacket) == 0);
+		fsg->ep0req->zero = rc < w_length;
 		fsg->ep0req_name = (ctrl->bRequestType & USB_DIR_IN ?
 				"ep0-in" : "ep0-out");
 		rc = ep0_queue(fsg);
diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c
index 1e5e6ddef787..020815397a49 100644
--- a/drivers/usb/gadget/inode.c
+++ b/drivers/usb/gadget/inode.c
@@ -417,8 +417,8 @@ ep_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr)
 		goto free1;
 
 	value = ep_io (data, kbuf, len);
-	VDEBUG (data->dev, "%s read %d OUT, status %d\n",
-		data->name, len, value);
+	VDEBUG (data->dev, "%s read %zu OUT, status %d\n",
+		data->name, len, (int) value);
 	if (value >= 0 && copy_to_user (buf, kbuf, value))
 		value = -EFAULT;
 
@@ -465,8 +465,8 @@ ep_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr)
 	}
 
 	value = ep_io (data, kbuf, len);
-	VDEBUG (data->dev, "%s write %d IN, status %d\n",
-		data->name, len, value);
+	VDEBUG (data->dev, "%s write %zu IN, status %d\n",
+		data->name, len, (int) value);
 free1:
 	up (&data->lock);
 	kfree (kbuf);
@@ -1318,8 +1318,8 @@ gadgetfs_setup (struct usb_gadget *gadget, const struct usb_ctrlrequest *ctrl)
 	struct usb_request		*req = dev->req;
 	int				value = -EOPNOTSUPP;
 	struct usb_gadgetfs_event	*event;
-	u16				w_value = ctrl->wValue;
-	u16				w_length = ctrl->wLength;
+	u16				w_value = le16_to_cpu(ctrl->wValue);
+	u16				w_length = le16_to_cpu(ctrl->wLength);
 
 	spin_lock (&dev->lock);
 	dev->setup_abort = 0;
diff --git a/drivers/usb/gadget/serial.c b/drivers/usb/gadget/serial.c
index 4d591c764e38..9e4f1c6935a5 100644
--- a/drivers/usb/gadget/serial.c
+++ b/drivers/usb/gadget/serial.c
@@ -300,18 +300,18 @@ static int gs_build_config_buf(u8 *buf, enum usb_device_speed speed,
 		u8 type, unsigned int index, int is_otg);
 
 static struct usb_request *gs_alloc_req(struct usb_ep *ep, unsigned int len,
-	int kmalloc_flags);
+	unsigned kmalloc_flags);
 static void gs_free_req(struct usb_ep *ep, struct usb_request *req);
 
 static struct gs_req_entry *gs_alloc_req_entry(struct usb_ep *ep, unsigned len,
-	int kmalloc_flags);
+	unsigned kmalloc_flags);
 static void gs_free_req_entry(struct usb_ep *ep, struct gs_req_entry *req);
 
-static int gs_alloc_ports(struct gs_dev *dev, int kmalloc_flags);
+static int gs_alloc_ports(struct gs_dev *dev, unsigned kmalloc_flags);
 static void gs_free_ports(struct gs_dev *dev);
 
 /* circular buffer */
-static struct gs_buf *gs_buf_alloc(unsigned int size, int kmalloc_flags);
+static struct gs_buf *gs_buf_alloc(unsigned int size, unsigned kmalloc_flags);
 static void gs_buf_free(struct gs_buf *gb);
 static void gs_buf_clear(struct gs_buf *gb);
 static unsigned int gs_buf_data_avail(struct gs_buf *gb);
@@ -1607,9 +1607,9 @@ static int gs_setup(struct usb_gadget *gadget,
 	int ret = -EOPNOTSUPP;
 	struct gs_dev *dev = get_gadget_data(gadget);
 	struct usb_request *req = dev->dev_ctrl_req;
-	u16 wIndex = ctrl->wIndex;
-	u16 wValue = ctrl->wValue;
-	u16 wLength = ctrl->wLength;
+	u16 wIndex = le16_to_cpu(ctrl->wIndex);
+	u16 wValue = le16_to_cpu(ctrl->wValue);
+	u16 wLength = le16_to_cpu(ctrl->wLength);
 
 	switch (ctrl->bRequestType & USB_TYPE_MASK) {
 	case USB_TYPE_STANDARD:
@@ -1651,9 +1651,9 @@ static int gs_setup_standard(struct usb_gadget *gadget,
 	int ret = -EOPNOTSUPP;
 	struct gs_dev *dev = get_gadget_data(gadget);
 	struct usb_request *req = dev->dev_ctrl_req;
-	u16 wIndex = ctrl->wIndex;
-	u16 wValue = ctrl->wValue;
-	u16 wLength = ctrl->wLength;
+	u16 wIndex = le16_to_cpu(ctrl->wIndex);
+	u16 wValue = le16_to_cpu(ctrl->wValue);
+	u16 wLength = le16_to_cpu(ctrl->wLength);
 
 	switch (ctrl->bRequest) {
 	case USB_REQ_GET_DESCRIPTOR:
@@ -1782,9 +1782,9 @@ static int gs_setup_class(struct usb_gadget *gadget,
 	struct gs_dev *dev = get_gadget_data(gadget);
 	struct gs_port *port = dev->dev_port[0];	/* ACM only has one port */
 	struct usb_request *req = dev->dev_ctrl_req;
-	u16 wIndex = ctrl->wIndex;
-	u16 wValue = ctrl->wValue;
-	u16 wLength = ctrl->wLength;
+	u16 wIndex = le16_to_cpu(ctrl->wIndex);
+	u16 wValue = le16_to_cpu(ctrl->wValue);
+	u16 wLength = le16_to_cpu(ctrl->wLength);
 
 	switch (ctrl->bRequest) {
 	case USB_CDC_REQ_SET_LINE_CODING:
@@ -2119,7 +2119,8 @@ static int gs_build_config_buf(u8 *buf, enum usb_device_speed speed,
  * Allocate a usb_request and its buffer.  Returns a pointer to the
  * usb_request or NULL if there is an error.
  */
-static struct usb_request *gs_alloc_req(struct usb_ep *ep, unsigned int len, int kmalloc_flags)
+static struct usb_request *
+gs_alloc_req(struct usb_ep *ep, unsigned int len, unsigned kmalloc_flags)
 {
 	struct usb_request *req;
 
@@ -2159,7 +2160,8 @@ static void gs_free_req(struct usb_ep *ep, struct usb_request *req)
  * Allocates a request and its buffer, using the given
  * endpoint, buffer len, and kmalloc flags.
  */
-static struct gs_req_entry *gs_alloc_req_entry(struct usb_ep *ep, unsigned len, int kmalloc_flags)
+static struct gs_req_entry *
+gs_alloc_req_entry(struct usb_ep *ep, unsigned len, unsigned kmalloc_flags)
 {
 	struct gs_req_entry	*req;
 
@@ -2200,7 +2202,7 @@ static void gs_free_req_entry(struct usb_ep *ep, struct gs_req_entry *req)
  *
  * The device lock is normally held when calling this function.
  */
-static int gs_alloc_ports(struct gs_dev *dev, int kmalloc_flags)
+static int gs_alloc_ports(struct gs_dev *dev, unsigned kmalloc_flags)
 {
 	int i;
 	struct gs_port *port;
@@ -2282,7 +2284,7 @@ static void gs_free_ports(struct gs_dev *dev)
  *
  * Allocate a circular buffer and all associated memory.
  */
-static struct gs_buf *gs_buf_alloc(unsigned int size, int kmalloc_flags)
+static struct gs_buf *gs_buf_alloc(unsigned int size, unsigned kmalloc_flags)
 {
 	struct gs_buf *gb;
 
diff --git a/drivers/usb/gadget/zero.c b/drivers/usb/gadget/zero.c
index 6e49432071a1..a6e035e24479 100644
--- a/drivers/usb/gadget/zero.c
+++ b/drivers/usb/gadget/zero.c
@@ -919,9 +919,9 @@ zero_setup (struct usb_gadget *gadget, const struct usb_ctrlrequest *ctrl)
 	struct zero_dev		*dev = get_gadget_data (gadget);
 	struct usb_request	*req = dev->req;
 	int			value = -EOPNOTSUPP;
-	u16			w_index = ctrl->wIndex;
-	u16			w_value = ctrl->wValue;
-	u16			w_length = ctrl->wLength;
+	u16			w_index = le16_to_cpu(ctrl->wIndex);
+	u16			w_value = le16_to_cpu(ctrl->wValue);
+	u16			w_length = le16_to_cpu(ctrl->wLength);
 
 	/* usually this stores reply data in the pre-allocated ep0 buffer,
 	 * but config change events will reconfigure hardware.
diff --git a/include/linux/usb_gadget.h b/include/linux/usb_gadget.h
index 9bba9997947b..b00f127cb447 100644
--- a/include/linux/usb_gadget.h
+++ b/include/linux/usb_gadget.h
@@ -711,7 +711,7 @@ usb_gadget_disconnect (struct usb_gadget *gadget)
  * 	the hardware level driver. Most calls must be handled by
  * 	the gadget driver, including descriptor and configuration
  * 	management.  The 16 bit members of the setup data are in
- * 	cpu order. Called in_interrupt; this may not sleep.  Driver
+ * 	USB byte order. Called in_interrupt; this may not sleep.  Driver
  *	queues a response to ep0, or returns negative to stall.
  * @disconnect: Invoked after all transfers have been stopped,
  * 	when the host is disconnected.  May be called in_interrupt; this
-- 
cgit v1.2.3-59-g8ed1b


From 5da0106f0b9b13afa4a902c01d4c98b002df55ff Mon Sep 17 00:00:00 2001
From: David Brownell <david-b@pacbell.net>
Date: Tue, 31 May 2005 10:21:11 -0700
Subject: [PATCH] USB: wireless usb <linux/usb_ch9.h> declarations

This provides declarations for new requests, descriptors, and bitfields as
defined in the Wireless USB 1.0 spec.  Device support will involve a new
"Wire Adapter" device class, connecting a USB Host to a cluster of wireless
USB devices.  There will be two adapter types:

  * Host Wireless Adapter (HWA):  the downstream link is wireless, which
    connects a wireless USB host to wireless USB devices (not unlike
    a hub) including to the second type of adapter.

  * Device Wireless Adapter (DWA): the upstream link is wireless, for
    connecting existing USB devices through wired links into the cluster.

All wireless USB devices will need persistent (and secure!) key storage, and
it's probable that Linux -- or device firmware -- will need to be involved
with that to bootstrap the initial secure key exchange.

Some user interface is required in that initial key exchange, and since the
most "hands-off" one is a wired USB link, I suspect wireless operation will
usually not be the only mode for wireless USB devices.  (Plus, devices can
recharge batteries using wired USB...)  All other key exchange protocols need
error prone user interactions, like copying and/or verifying keys.

It'll likely be a while before we have commercial Wireless USB hardware,
much less Linux implementations that know how to use it.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/usb_ch9.h | 183 ++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 176 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb_ch9.h b/include/linux/usb_ch9.h
index f5fe94e09a03..39e7ff4ffd28 100644
--- a/include/linux/usb_ch9.h
+++ b/include/linux/usb_ch9.h
@@ -6,11 +6,14 @@
  *
  * - the master/host side Linux-USB kernel driver API;
  * - the "usbfs" user space API; and
- * - (eventually) a Linux "gadget" slave/device side driver API.
+ * - the Linux "gadget" slave/device/peripheral side driver API.
  *
  * USB 2.0 adds an additional "On The Go" (OTG) mode, which lets systems
  * act either as a USB master/host or as a USB slave/device.  That means
- * the master and slave side APIs will benefit from working well together.
+ * the master and slave side APIs benefit from working well together.
+ *
+ * There's also "Wireless USB", using low power short range radios for
+ * peripheral interconnection but otherwise building on the USB framework.
  */
 
 #ifndef __LINUX_USB_CH9_H
@@ -68,6 +71,18 @@
 #define USB_REQ_SET_INTERFACE		0x0B
 #define USB_REQ_SYNCH_FRAME		0x0C
 
+#define USB_REQ_SET_ENCRYPTION		0x0D	/* Wireless USB */
+#define USB_REQ_GET_ENCRYPTION		0x0E
+#define USB_REQ_SET_HANDSHAKE		0x0F
+#define USB_REQ_GET_HANDSHAKE		0x10
+#define USB_REQ_SET_CONNECTION		0x11
+#define USB_REQ_SET_SECURITY_DATA	0x12
+#define USB_REQ_GET_SECURITY_DATA	0x13
+#define USB_REQ_SET_WUSB_DATA		0x14
+#define USB_REQ_LOOPBACK_DATA_WRITE	0x15
+#define USB_REQ_LOOPBACK_DATA_READ	0x16
+#define USB_REQ_SET_INTERFACE_DS	0x17
+
 /*
  * USB feature flags are written using USB_REQ_{CLEAR,SET}_FEATURE, and
  * are read as a bit array returned by USB_REQ_GET_STATUS.  (So there
@@ -75,10 +90,12 @@
  */
 #define USB_DEVICE_SELF_POWERED		0	/* (read only) */
 #define USB_DEVICE_REMOTE_WAKEUP	1	/* dev may initiate wakeup */
-#define USB_DEVICE_TEST_MODE		2	/* (high speed only) */
-#define USB_DEVICE_B_HNP_ENABLE		3	/* dev may initiate HNP */
-#define USB_DEVICE_A_HNP_SUPPORT	4	/* RH port supports HNP */
-#define USB_DEVICE_A_ALT_HNP_SUPPORT	5	/* other RH port does */
+#define USB_DEVICE_TEST_MODE		2	/* (wired high speed only) */
+#define USB_DEVICE_BATTERY		2	/* (wireless) */
+#define USB_DEVICE_B_HNP_ENABLE		3	/* (otg) dev may initiate HNP */
+#define USB_DEVICE_WUSB_DEVICE		3	/* (wireless)*/
+#define USB_DEVICE_A_HNP_SUPPORT	4	/* (otg) RH port supports HNP */
+#define USB_DEVICE_A_ALT_HNP_SUPPORT	5	/* (otg) other RH port does */
 #define USB_DEVICE_DEBUG_MODE		6	/* (special devices only) */
 
 #define USB_ENDPOINT_HALT		0	/* IN/OUT will STALL */
@@ -135,6 +152,13 @@ struct usb_ctrlrequest {
 #define USB_DT_OTG			0x09
 #define USB_DT_DEBUG			0x0a
 #define USB_DT_INTERFACE_ASSOCIATION	0x0b
+/* these are from the Wireless USB spec */
+#define USB_DT_SECURITY			0x0c
+#define USB_DT_KEY			0x0d
+#define USB_DT_ENCRYPTION_TYPE		0x0e
+#define USB_DT_BOS			0x0f
+#define USB_DT_DEVICE_CAPABILITY	0x10
+#define USB_DT_WIRELESS_ENDPOINT_COMP	0x11
 
 /* conventional codes for class-specific descriptors */
 #define USB_DT_CS_DEVICE		0x21
@@ -192,6 +216,7 @@ struct usb_device_descriptor {
 #define USB_CLASS_CSCID			0x0b	/* chip+ smart card */
 #define USB_CLASS_CONTENT_SEC		0x0d	/* content security */
 #define USB_CLASS_VIDEO			0x0e
+#define USB_CLASS_WIRELESS_CONTROLLER	0xe0
 #define USB_CLASS_APP_SPEC		0xfe
 #define USB_CLASS_VENDOR_SPEC		0xff
 
@@ -223,6 +248,7 @@ struct usb_config_descriptor {
 #define USB_CONFIG_ATT_ONE		(1 << 7)	/* must be set */
 #define USB_CONFIG_ATT_SELFPOWER	(1 << 6)	/* self powered */
 #define USB_CONFIG_ATT_WAKEUP		(1 << 5)	/* can wakeup */
+#define USB_CONFIG_ATT_BATTERY		(1 << 4)	/* battery powered */
 
 /*-------------------------------------------------------------------------*/
 
@@ -289,6 +315,7 @@ struct usb_endpoint_descriptor {
 #define USB_ENDPOINT_XFER_ISOC		1
 #define USB_ENDPOINT_XFER_BULK		2
 #define USB_ENDPOINT_XFER_INT		3
+#define USB_ENDPOINT_MAX_ADJUSTABLE	0x80
 
 
 /*-------------------------------------------------------------------------*/
@@ -350,6 +377,147 @@ struct usb_interface_assoc_descriptor {
 } __attribute__ ((packed));
 
 
+/*-------------------------------------------------------------------------*/
+
+/* USB_DT_SECURITY:  group of wireless security descriptors, including
+ * encryption types available for setting up a CC/association.
+ */
+struct usb_security_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+
+	__le16 wTotalLength;
+	__u8  bNumEncryptionTypes;
+} __attribute__ ((packed));	/* 5 bytes of fields; no tail padding */
+
+/*-------------------------------------------------------------------------*/
+
+/* USB_DT_KEY:  used with {GET,SET}_SECURITY_DATA; only public keys
+ * may be retrieved.
+ */
+struct usb_key_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+
+	__u8  tTKID[3];
+	__u8  bReserved;
+	__u8  bKeyData[0];
+} __attribute__ ((packed));	/* byte-exact wire layout */
+
+/*-------------------------------------------------------------------------*/
+
+/* USB_DT_ENCRYPTION_TYPE:  bundled in DT_SECURITY groups */
+struct usb_encryption_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+
+	__u8  bEncryptionType;
+#define	USB_ENC_TYPE_UNSECURE		0
+#define	USB_ENC_TYPE_WIRED		1	/* non-wireless mode */
+#define	USB_ENC_TYPE_CCM_1		2	/* aes128/cbc session */
+#define	USB_ENC_TYPE_RSA_1		3	/* rsa3072/sha1 auth */
+	__u8  bEncryptionValue;		/* use in SET_ENCRYPTION */
+	__u8  bAuthKeyIndex;
+} __attribute__ ((packed));	/* byte-exact wire layout */
+
+
+/*-------------------------------------------------------------------------*/
+
+/* USB_DT_BOS:  group of wireless capabilities */
+struct usb_bos_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+
+	__le16 wTotalLength;
+	__u8  bNumDeviceCaps;
+} __attribute__ ((packed));	/* 5 bytes of fields; no tail padding */
+
+/*-------------------------------------------------------------------------*/
+
+/* USB_DT_DEVICE_CAPABILITY:  grouped with BOS */
+struct usb_dev_cap_header {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDevCapabilityType;
+} __attribute__ ((packed));	/* byte-exact wire layout */
+
+#define	USB_CAP_TYPE_WIRELESS_USB	1
+
+struct usb_wireless_cap_descriptor {	/* Ultra Wide Band */
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDevCapabilityType;
+
+	__u8  bmAttributes;
+#define	USB_WIRELESS_P2P_DRD		(1 << 1)
+#define	USB_WIRELESS_BEACON_MASK	(3 << 2)
+#define	USB_WIRELESS_BEACON_SELF	(1 << 2)
+#define	USB_WIRELESS_BEACON_DIRECTED	(2 << 2)
+#define	USB_WIRELESS_BEACON_NONE	(3 << 2)
+	__le16 wPHYRates;	/* bit rates, Mbps */
+#define	USB_WIRELESS_PHY_53		(1 << 0)	/* always set */
+#define	USB_WIRELESS_PHY_80		(1 << 1)
+#define	USB_WIRELESS_PHY_107		(1 << 2)	/* always set */
+#define	USB_WIRELESS_PHY_160		(1 << 3)
+#define	USB_WIRELESS_PHY_200		(1 << 4)	/* always set */
+#define	USB_WIRELESS_PHY_320		(1 << 5)
+#define	USB_WIRELESS_PHY_400		(1 << 6)
+#define	USB_WIRELESS_PHY_480		(1 << 7)
+	__u8  bmTFITXPowerInfo;	/* TFI power levels */
+	__u8  bmFFITXPowerInfo;	/* FFI power levels */
+	__le16 bmBandGroup;
+	__u8  bReserved;
+} __attribute__ ((packed));	/* 11 bytes of fields; no tail padding */
+
+/*-------------------------------------------------------------------------*/
+
+/* USB_DT_WIRELESS_ENDPOINT_COMP:  companion descriptor associated with
+ * each endpoint descriptor for a wireless device
+ */
+struct usb_wireless_ep_comp_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+
+	__u8  bMaxBurst;
+	__u8  bMaxSequence;
+	__le16 wMaxStreamDelay;
+	__le16 wOverTheAirPacketSize;
+	__u8  bOverTheAirInterval;
+	__u8  bmCompAttributes;
+#define USB_ENDPOINT_SWITCH_MASK	0x03	/* in bmCompAttributes */
+#define USB_ENDPOINT_SWITCH_NO		0
+#define USB_ENDPOINT_SWITCH_SWITCH	1
+#define USB_ENDPOINT_SWITCH_SCALE	2
+} __attribute__ ((packed));	/* byte-aligned so it can overlay raw buffers */
+
+/*-------------------------------------------------------------------------*/
+
+/* USB_REQ_SET_HANDSHAKE is a four-way handshake used between a wireless
+ * host and a device for connection set up, mutual authentication, and
+ * exchanging short lived session keys.  The handshake depends on a CC.
+ */
+struct usb_handshake {
+	__u8 bMessageNumber;
+	__u8 bStatus;
+	__u8 tTKID[3];
+	__u8 bReserved;
+	__u8 CDID[16];
+	__u8 nonce[16];
+	__u8 MIC[8];
+} __attribute__ ((packed));	/* byte-exact wire layout */
+
+/*-------------------------------------------------------------------------*/
+
+/* USB_REQ_SET_CONNECTION modifies or revokes a connection context (CC).
+ * A CC may also be set up using non-wireless secure channels (including
+ * wired USB!), and some devices may support CCs with multiple hosts.
+ */
+struct usb_connection_context {
+	__u8 CHID[16];		/* persistent host id */
+	__u8 CDID[16];		/* device id (unique w/in host context) */
+	__u8 CK[16];		/* connection key */
+} __attribute__ ((packed));	/* byte-exact wire layout */
+
 /*-------------------------------------------------------------------------*/
 
 /* USB 2.0 defines three speeds, here's how Linux identifies them */
@@ -357,7 +525,8 @@ struct usb_interface_assoc_descriptor {
 enum usb_device_speed {
 	USB_SPEED_UNKNOWN = 0,			/* enumerating */
 	USB_SPEED_LOW, USB_SPEED_FULL,		/* usb 1.1 */
-	USB_SPEED_HIGH				/* usb 2.0 */
+	USB_SPEED_HIGH,				/* usb 2.0 */
+	USB_SPEED_VARIABLE,			/* wireless (usb 2.5) */
 };
 
 enum usb_device_state {
-- 
cgit v1.2.3-59-g8ed1b


From 8c8709334cec803368a432a33e0f2e116d48fe07 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Mon, 27 Jun 2005 14:36:34 -0700
Subject: [PATCH] ppc32: Remove CONFIG_PMAC_PBOOK

This patch removes CONFIG_PMAC_PBOOK (PowerBook support).  This is now
split into CONFIG_PMAC_MEDIABAY for the actual hotswap bay that some
powerbooks have, CONFIG_PM for power management related code, and just left
out of any CONFIG_* option for some generally useful stuff that can be used
on non-laptops as well.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc/platforms/pmac_sleep.S     |   4 +-
 arch/ppc/platforms/pmac_time.c      |   8 +-
 drivers/block/swim3.c               |  10 +-
 drivers/char/misc.c                 |   3 -
 drivers/ide/ppc/pmac.c              |   8 +-
 drivers/ieee1394/ohci1394.c         |  10 +-
 drivers/macintosh/Kconfig           |  35 ++-----
 drivers/macintosh/Makefile          |   2 +-
 drivers/macintosh/adb.c             |  10 +-
 drivers/macintosh/via-pmu.c         |  70 ++++++--------
 drivers/usb/host/ohci-pci.c         |  13 +--
 drivers/video/aty/aty128fb.c        |  14 ---
 drivers/video/chipsfb.c             | 176 +++++++++++++++++++-----------------
 include/linux/pmu.h                 |   6 +-
 sound/oss/dmasound/dmasound_awacs.c |  14 +--
 sound/ppc/awacs.c                   |   8 +-
 sound/ppc/daca.c                    |   6 +-
 sound/ppc/pmac.c                    |  11 ++-
 sound/ppc/pmac.h                    |   2 +-
 sound/ppc/tumbler.c                 |   4 +-
 20 files changed, 193 insertions(+), 221 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ppc/platforms/pmac_sleep.S b/arch/ppc/platforms/pmac_sleep.S
index f459ade1bd63..016a74649155 100644
--- a/arch/ppc/platforms/pmac_sleep.S
+++ b/arch/ppc/platforms/pmac_sleep.S
@@ -46,7 +46,7 @@
 	.section .text
 	.align	5
 
-#if defined(CONFIG_PMAC_PBOOK) || defined(CONFIG_CPU_FREQ_PMAC)
+#if defined(CONFIG_PM) || defined(CONFIG_CPU_FREQ_PMAC)
 
 /* This gets called by via-pmu.c late during the sleep process.
  * The PMU was already send the sleep command and will shut us down
@@ -382,7 +382,7 @@ turn_on_mmu:
 	isync
 	rfi
 
-#endif /* defined(CONFIG_PMAC_PBOOK) || defined(CONFIG_CPU_FREQ) */
+#endif /* defined(CONFIG_PM) || defined(CONFIG_CPU_FREQ) */
 
 	.section .data
 	.balign	L1_CACHE_LINE_SIZE
diff --git a/arch/ppc/platforms/pmac_time.c b/arch/ppc/platforms/pmac_time.c
index de60ccc7db9f..778ce4fec368 100644
--- a/arch/ppc/platforms/pmac_time.c
+++ b/arch/ppc/platforms/pmac_time.c
@@ -206,7 +206,7 @@ via_calibrate_decr(void)
 	return 1;
 }
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 /*
  * Reset the time after a sleep.
  */
@@ -238,7 +238,7 @@ time_sleep_notify(struct pmu_sleep_notifier *self, int when)
 static struct pmu_sleep_notifier time_sleep_notifier __pmacdata = {
 	time_sleep_notify, SLEEP_LEVEL_MISC,
 };
-#endif /* CONFIG_PMAC_PBOOK */
+#endif /* CONFIG_PM */
 
 /*
  * Query the OF and get the decr frequency.
@@ -251,9 +251,9 @@ pmac_calibrate_decr(void)
 	struct device_node *cpu;
 	unsigned int freq, *fp;
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 	pmu_register_sleep_notifier(&time_sleep_notifier);
-#endif /* CONFIG_PMAC_PBOOK */
+#endif /* CONFIG_PM */
 
 	/* We assume MacRISC2 machines have correct device-tree
 	 * calibration. That's better since the VIA itself seems
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 5b09cf154ac7..e5f7494c00ee 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -253,7 +253,7 @@ static int floppy_revalidate(struct gendisk *disk);
 static int swim3_add_device(struct device_node *swims);
 int swim3_init(void);
 
-#ifndef CONFIG_PMAC_PBOOK
+#ifndef CONFIG_PMAC_MEDIABAY
 #define check_media_bay(which, what)	1
 #endif
 
@@ -297,9 +297,11 @@ static void do_fd_request(request_queue_t * q)
 	int i;
 	for(i=0;i<floppy_count;i++)
 	{
+#ifdef CONFIG_PMAC_MEDIABAY
 		if (floppy_states[i].media_bay &&
 			check_media_bay(floppy_states[i].media_bay, MB_FD))
 			continue;
+#endif /* CONFIG_PMAC_MEDIABAY */
 		start_request(&floppy_states[i]);
 	}
 	sti();
@@ -856,8 +858,10 @@ static int floppy_ioctl(struct inode *inode, struct file *filp,
 	if ((cmd & 0x80) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+#ifdef CONFIG_PMAC_MEDIABAY
 	if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD))
 		return -ENXIO;
+#endif
 
 	switch (cmd) {
 	case FDEJECT:
@@ -881,8 +885,10 @@ static int floppy_open(struct inode *inode, struct file *filp)
 	int n, err = 0;
 
 	if (fs->ref_count == 0) {
+#ifdef CONFIG_PMAC_MEDIABAY
 		if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD))
 			return -ENXIO;
+#endif
 		out_8(&sw->setup, S_IBM_DRIVE | S_FCLK_DIV2);
 		out_8(&sw->control_bic, 0xff);
 		out_8(&sw->mode, 0x95);
@@ -967,8 +973,10 @@ static int floppy_revalidate(struct gendisk *disk)
 	struct swim3 __iomem *sw;
 	int ret, n;
 
+#ifdef CONFIG_PMAC_MEDIABAY
 	if (fs->media_bay && check_media_bay(fs->media_bay, MB_FD))
 		return -ENXIO;
+#endif
 
 	sw = fs->swim3;
 	grab_drive(fs, revalidating, 0);
diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index 31cf84d69026..931efd58f87a 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -308,9 +308,6 @@ static int __init misc_init(void)
 #endif
 #ifdef CONFIG_BVME6000
 	rtc_DP8570A_init();
-#endif
-#ifdef CONFIG_PMAC_PBOOK
-	pmu_device_init();
 #endif
 	if (register_chrdev(MISC_MAJOR,"misc",&misc_fops)) {
 		printk("unable to get major %d for misc devices\n",
diff --git a/drivers/ide/ppc/pmac.c b/drivers/ide/ppc/pmac.c
index 569f16767442..818380b5fd27 100644
--- a/drivers/ide/ppc/pmac.c
+++ b/drivers/ide/ppc/pmac.c
@@ -1324,9 +1324,9 @@ pmac_ide_setup_device(pmac_ide_hwif_t *pmif, ide_hwif_t *hwif)
 	/* XXX FIXME: Media bay stuff need re-organizing */
 	if (np->parent && np->parent->name
 	    && strcasecmp(np->parent->name, "media-bay") == 0) {
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PMAC_MEDIABAY
 		media_bay_set_ide_infos(np->parent, pmif->regbase, pmif->irq, hwif->index);
-#endif /* CONFIG_PMAC_PBOOK */
+#endif /* CONFIG_PMAC_MEDIABAY */
 		pmif->mediabay = 1;
 		if (!bidp)
 			pmif->aapl_bus_id = 1;
@@ -1382,10 +1382,10 @@ pmac_ide_setup_device(pmac_ide_hwif_t *pmif, ide_hwif_t *hwif)
 	       hwif->index, model_name[pmif->kind], pmif->aapl_bus_id,
 	       pmif->mediabay ? " (mediabay)" : "", hwif->irq);
 			
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PMAC_MEDIABAY
 	if (pmif->mediabay && check_media_bay_by_base(pmif->regbase, MB_CD) == 0)
 		hwif->noprobe = 0;
-#endif /* CONFIG_PMAC_PBOOK */
+#endif /* CONFIG_PMAC_MEDIABAY */
 
 	hwif->sg_max_nents = MAX_DCMDS;
 
diff --git a/drivers/ieee1394/ohci1394.c b/drivers/ieee1394/ohci1394.c
index 36e25ac823dc..b3d3d22fde64 100644
--- a/drivers/ieee1394/ohci1394.c
+++ b/drivers/ieee1394/ohci1394.c
@@ -3538,8 +3538,8 @@ static void ohci1394_pci_remove(struct pci_dev *pdev)
 
 static int ohci1394_pci_resume (struct pci_dev *pdev)
 {
-#ifdef CONFIG_PMAC_PBOOK
-	{
+#ifdef CONFIG_PPC_PMAC
+	if (_machine == _MACH_Pmac) {
 		struct device_node *of_node;
 
 		/* Re-enable 1394 */
@@ -3547,7 +3547,7 @@ static int ohci1394_pci_resume (struct pci_dev *pdev)
 		if (of_node)
 			pmac_call_feature (PMAC_FTR_1394_ENABLE, of_node, 0, 1);
 	}
-#endif
+#endif /* CONFIG_PPC_PMAC */
 
 	pci_enable_device(pdev);
 
@@ -3557,8 +3557,8 @@ static int ohci1394_pci_resume (struct pci_dev *pdev)
 
 static int ohci1394_pci_suspend (struct pci_dev *pdev, pm_message_t state)
 {
-#ifdef CONFIG_PMAC_PBOOK
-	{
+#ifdef CONFIG_PPC_PMAC
+	if (_machine == _MACH_Pmac) {
 		struct device_node *of_node;
 
 		/* Disable 1394 */
diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig
index b0ace5bc950c..91691a6c004e 100644
--- a/drivers/macintosh/Kconfig
+++ b/drivers/macintosh/Kconfig
@@ -86,33 +86,18 @@ config PMAC_SMU
 	  on the "SMU" system control chip which replaces the old PMU.
 	  If you don't know, say Y.
 
-config PMAC_PBOOK
-	bool "Power management support for PowerBooks"
-	depends on ADB_PMU
-	---help---
-	  This provides support for putting a PowerBook to sleep; it also
-	  enables media bay support.  Power management works on the
-	  PB2400/3400/3500, Wallstreet, Lombard, and Bronze PowerBook G3 and
-	  the Titanium Powerbook G4, as well as the iBooks.  You should get
-	  the power management daemon, pmud, to make it work and you must have
-	  the /dev/pmu device (see the pmud README).
-
-	  Get pmud from <ftp://ftp.samba.org/pub/ppclinux/pmud/>.
-
-	  If you have a PowerBook, you should say Y here.
-
-	  You may also want to compile the dma sound driver as a module and
-	  have it autoloaded. The act of removing the module shuts down the
-	  sound hardware for more power savings.
-
-config PM
-	bool
-	depends on PPC_PMAC && ADB_PMU && PMAC_PBOOK
-	default y
-
 config PMAC_APM_EMU
 	tristate "APM emulation"
-	depends on PMAC_PBOOK
+	depends on PPC_PMAC && PPC32 && PM
+
+config PMAC_MEDIABAY
+	bool "Support PowerBook hotswap media bay"
+	depends on PPC_PMAC && PPC32
+	help
+	  This option adds support for older PowerBooks' hotswap media bay
+	  that can contain batteries, floppy drives, or IDE devices. PCI
+	  devices are not fully supported in the bay as I never had one to
+	  try with.
 
 # made a separate option since backlight may end up beeing used
 # on non-powerbook machines (but only on PMU based ones AFAIK)
diff --git a/drivers/macintosh/Makefile b/drivers/macintosh/Makefile
index b3f88a4fcef7..f5ae171dbfef 100644
--- a/drivers/macintosh/Makefile
+++ b/drivers/macintosh/Makefile
@@ -6,7 +6,7 @@
 
 obj-$(CONFIG_PPC_PMAC)		+= macio_asic.o
 
-obj-$(CONFIG_PMAC_PBOOK)	+= mediabay.o
+obj-$(CONFIG_PMAC_MEDIABAY)	+= mediabay.o
 obj-$(CONFIG_MAC_EMUMOUSEBTN)	+= mac_hid.o
 obj-$(CONFIG_INPUT_ADBHID)	+= adbhid.o
 obj-$(CONFIG_ANSLCD)		+= ans-lcd.o
diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c
index 493e2afa191c..c0dc1e3fa58b 100644
--- a/drivers/macintosh/adb.c
+++ b/drivers/macintosh/adb.c
@@ -90,7 +90,7 @@ static int sleepy_trackpad;
 static int autopoll_devs;
 int __adb_probe_sync;
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 static int adb_notify_sleep(struct pmu_sleep_notifier *self, int when);
 static struct pmu_sleep_notifier adb_sleep_notifier = {
 	adb_notify_sleep,
@@ -320,9 +320,9 @@ int __init adb_init(void)
 		printk(KERN_WARNING "Warning: no ADB interface detected\n");
 		adb_controller = NULL;
 	} else {
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 		pmu_register_sleep_notifier(&adb_sleep_notifier);
-#endif /* CONFIG_PMAC_PBOOK */
+#endif /* CONFIG_PM */
 #ifdef CONFIG_PPC
 		if (machine_is_compatible("AAPL,PowerBook1998") ||
 			machine_is_compatible("PowerBook1,1"))
@@ -337,7 +337,7 @@ int __init adb_init(void)
 
 __initcall(adb_init);
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 /*
  * notify clients before sleep and reset bus afterwards
  */
@@ -378,7 +378,7 @@ adb_notify_sleep(struct pmu_sleep_notifier *self, int when)
 	}
 	return PBOOK_SLEEP_OK;
 }
-#endif /* CONFIG_PMAC_PBOOK */
+#endif /* CONFIG_PM */
 
 static int
 do_adb_reset_bus(void)
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 5375df03c6f3..4a0a0ad2d03c 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -155,10 +155,10 @@ static spinlock_t pmu_lock;
 static u8 pmu_intr_mask;
 static int pmu_version;
 static int drop_interrupts;
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 static int option_lid_wakeup = 1;
 static int sleep_in_progress;
-#endif /* CONFIG_PMAC_PBOOK */
+#endif /* CONFIG_PM */
 static unsigned long async_req_locks;
 static unsigned int pmu_irq_stats[11];
 
@@ -168,7 +168,6 @@ static struct proc_dir_entry *proc_pmu_irqstats;
 static struct proc_dir_entry *proc_pmu_options;
 static int option_server_mode;
 
-#ifdef CONFIG_PMAC_PBOOK
 int pmu_battery_count;
 int pmu_cur_battery;
 unsigned int pmu_power_flags;
@@ -176,7 +175,6 @@ struct pmu_battery_info pmu_batteries[PMU_MAX_BATTERIES];
 static int query_batt_timer = BATTERY_POLLING_COUNT;
 static struct adb_request batt_req;
 static struct proc_dir_entry *proc_pmu_batt[PMU_MAX_BATTERIES];
-#endif /* CONFIG_PMAC_PBOOK */
 
 #if defined(CONFIG_INPUT_ADBHID) && defined(CONFIG_PMAC_BACKLIGHT)
 extern int disable_kernel_backlight;
@@ -210,11 +208,9 @@ static int proc_get_irqstats(char *page, char **start, off_t off,
 static int pmu_set_backlight_level(int level, void* data);
 static int pmu_set_backlight_enable(int on, int level, void* data);
 #endif /* CONFIG_PMAC_BACKLIGHT */
-#ifdef CONFIG_PMAC_PBOOK
 static void pmu_pass_intr(unsigned char *data, int len);
 static int proc_get_batt(char *page, char **start, off_t off,
 			int count, int *eof, void *data);
-#endif /* CONFIG_PMAC_PBOOK */
 static int proc_read_options(char *page, char **start, off_t off,
 			int count, int *eof, void *data);
 static int proc_write_options(struct file *file, const char __user *buffer,
@@ -407,9 +403,7 @@ static int __init via_pmu_start(void)
 
 	bright_req_1.complete = 1;
 	bright_req_2.complete = 1;
-#ifdef CONFIG_PMAC_PBOOK
 	batt_req.complete = 1;
-#endif
 
 #ifdef CONFIG_PPC32
 	if (pmu_kind == PMU_KEYLARGO_BASED)
@@ -468,7 +462,7 @@ static int __init via_pmu_dev_init(void)
 	register_backlight_controller(&pmu_backlight_controller, NULL, "pmu");
 #endif /* CONFIG_PMAC_BACKLIGHT */
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PPC32
   	if (machine_is_compatible("AAPL,3400/2400") ||
   		machine_is_compatible("AAPL,3500")) {
 		int mb = pmac_call_feature(PMAC_FTR_GET_MB_INFO,
@@ -496,20 +490,19 @@ static int __init via_pmu_dev_init(void)
 				pmu_batteries[1].flags |= PMU_BATT_TYPE_SMART;
 		}
 	}
-#endif /* CONFIG_PMAC_PBOOK */
+#endif /* CONFIG_PPC32 */
+
 	/* Create /proc/pmu */
 	proc_pmu_root = proc_mkdir("pmu", NULL);
 	if (proc_pmu_root) {
-#ifdef CONFIG_PMAC_PBOOK
-		int i;
+		long i;
 
 		for (i=0; i<pmu_battery_count; i++) {
 			char title[16];
-			sprintf(title, "battery_%d", i);
+			sprintf(title, "battery_%ld", i);
 			proc_pmu_batt[i] = create_proc_read_entry(title, 0, proc_pmu_root,
 						proc_get_batt, (void *)i);
 		}
-#endif /* CONFIG_PMAC_PBOOK */
 
 		proc_pmu_info = create_proc_read_entry("info", 0, proc_pmu_root,
 					proc_get_info, NULL);
@@ -629,8 +622,6 @@ static void pmu_set_server_mode(int server_mode)
 	pmu_wait_complete(&req);
 }
 
-#ifdef CONFIG_PMAC_PBOOK
-
 /* This new version of the code for 2400/3400/3500 powerbooks
  * is inspired from the implementation in gkrellm-pmu
  */
@@ -813,8 +804,6 @@ query_battery_state(void)
 			2, PMU_SMART_BATTERY_STATE, pmu_cur_battery+1);
 }
 
-#endif /* CONFIG_PMAC_PBOOK */
-
 static int __pmac
 proc_get_info(char *page, char **start, off_t off,
 		int count, int *eof, void *data)
@@ -823,11 +812,9 @@ proc_get_info(char *page, char **start, off_t off,
 
 	p += sprintf(p, "PMU driver version     : %d\n", PMU_DRIVER_VERSION);
 	p += sprintf(p, "PMU firmware version   : %02x\n", pmu_version);
-#ifdef CONFIG_PMAC_PBOOK
 	p += sprintf(p, "AC Power               : %d\n",
 		((pmu_power_flags & PMU_PWR_AC_PRESENT) != 0));
 	p += sprintf(p, "Battery count          : %d\n", pmu_battery_count);
-#endif /* CONFIG_PMAC_PBOOK */
 
 	return p - page;
 }
@@ -859,12 +846,11 @@ proc_get_irqstats(char *page, char **start, off_t off,
 	return p - page;
 }
 
-#ifdef CONFIG_PMAC_PBOOK
 static int __pmac
 proc_get_batt(char *page, char **start, off_t off,
 		int count, int *eof, void *data)
 {
-	int batnum = (int)data;
+	long batnum = (long)data;
 	char *p = page;
 	
 	p += sprintf(p, "\n");
@@ -883,7 +869,6 @@ proc_get_batt(char *page, char **start, off_t off,
 
 	return p - page;
 }
-#endif /* CONFIG_PMAC_PBOOK */
 
 static int __pmac
 proc_read_options(char *page, char **start, off_t off,
@@ -891,11 +876,11 @@ proc_read_options(char *page, char **start, off_t off,
 {
 	char *p = page;
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 	if (pmu_kind == PMU_KEYLARGO_BASED &&
 	    pmac_call_feature(PMAC_FTR_SLEEP_STATE,NULL,0,-1) >= 0)
 		p += sprintf(p, "lid_wakeup=%d\n", option_lid_wakeup);
-#endif /* CONFIG_PMAC_PBOOK */
+#endif
 	if (pmu_kind == PMU_KEYLARGO_BASED)
 		p += sprintf(p, "server_mode=%d\n", option_server_mode);
 
@@ -932,12 +917,12 @@ proc_write_options(struct file *file, const char __user *buffer,
 	*(val++) = 0;
 	while(*val == ' ')
 		val++;
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 	if (pmu_kind == PMU_KEYLARGO_BASED &&
 	    pmac_call_feature(PMAC_FTR_SLEEP_STATE,NULL,0,-1) >= 0)
 		if (!strcmp(label, "lid_wakeup"))
 			option_lid_wakeup = ((*val) == '1');
-#endif /* CONFIG_PMAC_PBOOK */
+#endif
 	if (pmu_kind == PMU_KEYLARGO_BASED && !strcmp(label, "server_mode")) {
 		int new_value;
 		new_value = ((*val) == '1');
@@ -1432,7 +1417,6 @@ next:
 	}
 	/* Tick interrupt */
 	else if ((1 << pirq) & PMU_INT_TICK) {
-#ifdef CONFIG_PMAC_PBOOK
 		/* Environement or tick interrupt, query batteries */
 		if (pmu_battery_count) {
 			if ((--query_batt_timer) == 0) {
@@ -1447,7 +1431,6 @@ next:
 		pmu_pass_intr(data, len);
 	} else {
 	       pmu_pass_intr(data, len);
-#endif /* CONFIG_PMAC_PBOOK */
 	}
 	goto next;
 }
@@ -2062,7 +2045,7 @@ pmu_i2c_simple_write(int bus, int addr,  u8* data, int len)
 	return -1;
 }
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 
 static LIST_HEAD(sleep_notifiers);
 
@@ -2715,6 +2698,8 @@ powerbook_sleep_3400(void)
 	return 0;
 }
 
+#endif /* CONFIG_PM */
+
 /*
  * Support for /dev/pmu device
  */
@@ -2894,11 +2879,11 @@ static int __pmac
 pmu_ioctl(struct inode * inode, struct file *filp,
 		     u_int cmd, u_long arg)
 {
-	struct pmu_private *pp = filp->private_data;
 	__u32 __user *argp = (__u32 __user *)arg;
-	int error;
+	int error = -EINVAL;
 
 	switch (cmd) {
+#ifdef CONFIG_PM
 	case PMU_IOC_SLEEP:
 		if (!capable(CAP_SYS_ADMIN))
 			return -EACCES;
@@ -2920,12 +2905,13 @@ pmu_ioctl(struct inode * inode, struct file *filp,
 			error = -ENOSYS;
 		}
 		sleep_in_progress = 0;
-		return error;
+		break;
 	case PMU_IOC_CAN_SLEEP:
 		if (pmac_call_feature(PMAC_FTR_SLEEP_STATE,NULL,0,-1) < 0)
 			return put_user(0, argp);
 		else
 			return put_user(1, argp);
+#endif /* CONFIG_PM */
 
 #ifdef CONFIG_PMAC_BACKLIGHT
 	/* Backlight should have its own device or go via
@@ -2946,11 +2932,13 @@ pmu_ioctl(struct inode * inode, struct file *filp,
 		error = get_user(value, argp);
 		if (!error)
 			error = set_backlight_level(value);
-		return error;
+		break;
 	}
 #ifdef CONFIG_INPUT_ADBHID
 	case PMU_IOC_GRAB_BACKLIGHT: {
+		struct pmu_private *pp = filp->private_data;
 		unsigned long flags;
+
 		if (pp->backlight_locker)
 			return 0;
 		pp->backlight_locker = 1;
@@ -2966,7 +2954,7 @@ pmu_ioctl(struct inode * inode, struct file *filp,
 	case PMU_IOC_HAS_ADB:
 		return put_user(pmu_has_adb, argp);
 	}
-	return -EINVAL;
+	return error;
 }
 
 static struct file_operations pmu_device_fops __pmacdata = {
@@ -2982,14 +2970,16 @@ static struct miscdevice pmu_device __pmacdata = {
 	PMU_MINOR, "pmu", &pmu_device_fops
 };
 
-void pmu_device_init(void)
+static int pmu_device_init(void)
 {
 	if (!via)
-		return;
+		return 0;
 	if (misc_register(&pmu_device) < 0)
 		printk(KERN_ERR "via-pmu: cannot register misc device.\n");
+	return 0;
 }
-#endif /* CONFIG_PMAC_PBOOK */
+device_initcall(pmu_device_init);
+
 
 #ifdef DEBUG_SLEEP
 static inline void  __pmac
@@ -3157,12 +3147,12 @@ EXPORT_SYMBOL(pmu_i2c_combined_read);
 EXPORT_SYMBOL(pmu_i2c_stdsub_write);
 EXPORT_SYMBOL(pmu_i2c_simple_read);
 EXPORT_SYMBOL(pmu_i2c_simple_write);
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 EXPORT_SYMBOL(pmu_register_sleep_notifier);
 EXPORT_SYMBOL(pmu_unregister_sleep_notifier);
 EXPORT_SYMBOL(pmu_enable_irled);
 EXPORT_SYMBOL(pmu_battery_count);
 EXPORT_SYMBOL(pmu_batteries);
 EXPORT_SYMBOL(pmu_power_flags);
-#endif /* CONFIG_PMAC_PBOOK */
+#endif /* CONFIG_PM */
 
diff --git a/drivers/usb/host/ohci-pci.c b/drivers/usb/host/ohci-pci.c
index 57fd07d00549..eede6be098d2 100644
--- a/drivers/usb/host/ohci-pci.c
+++ b/drivers/usb/host/ohci-pci.c
@@ -14,14 +14,11 @@
  * This file is licenced under the GPL.
  */
  
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PPC_PMAC
 #include <asm/machdep.h>
 #include <asm/pmac_feature.h>
 #include <asm/pci-bridge.h>
 #include <asm/prom.h>
-#ifndef CONFIG_PM
-#	define CONFIG_PM
-#endif
 #endif
 
 #ifndef CONFIG_PCI
@@ -132,7 +129,7 @@ static int ohci_pci_suspend (struct usb_hcd *hcd, pm_message_t message)
 	/* let things settle down a bit */
 	msleep (100);
 	
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PPC_PMAC
 	if (_machine == _MACH_Pmac) {
 	   	struct device_node	*of_node;
  
@@ -141,7 +138,7 @@ static int ohci_pci_suspend (struct usb_hcd *hcd, pm_message_t message)
 		if (of_node)
 			pmac_call_feature(PMAC_FTR_USB_ENABLE, of_node, 0, 0);
 	}
-#endif /* CONFIG_PMAC_PBOOK */
+#endif /* CONFIG_PPC_PMAC */
 	return 0;
 }
 
@@ -151,7 +148,7 @@ static int ohci_pci_resume (struct usb_hcd *hcd)
 	struct ohci_hcd		*ohci = hcd_to_ohci (hcd);
 	int			retval = 0;
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PPC_PMAC
 	if (_machine == _MACH_Pmac) {
 		struct device_node *of_node;
 
@@ -160,7 +157,7 @@ static int ohci_pci_resume (struct usb_hcd *hcd)
 		if (of_node)
 			pmac_call_feature (PMAC_FTR_USB_ENABLE, of_node, 0, 1);
 	}
-#endif /* CONFIG_PMAC_PBOOK */
+#endif /* CONFIG_PPC_PMAC */
 
 	/* resume root hub */
 	if (time_before (jiffies, ohci->next_statechange))
diff --git a/drivers/video/aty/aty128fb.c b/drivers/video/aty/aty128fb.c
index 9789115980a5..7bc1d44d8814 100644
--- a/drivers/video/aty/aty128fb.c
+++ b/drivers/video/aty/aty128fb.c
@@ -350,10 +350,8 @@ static int default_vmode __initdata = VMODE_1024_768_60;
 static int default_cmode __initdata = CMODE_8;
 #endif
 
-#ifdef CONFIG_PMAC_PBOOK
 static int default_crt_on __initdata = 0;
 static int default_lcd_on __initdata = 1;
-#endif
 
 #ifdef CONFIG_MTRR
 static int mtrr = 1;
@@ -1249,7 +1247,6 @@ static int aty128_crtc_to_var(const struct aty128_crtc *crtc,
 	return 0;
 }
 
-#ifdef CONFIG_PMAC_PBOOK
 static void aty128_set_crt_enable(struct aty128fb_par *par, int on)
 {
 	if (on) {
@@ -1284,7 +1281,6 @@ static void aty128_set_lcd_enable(struct aty128fb_par *par, int on)
 		aty_st_le32(LVDS_GEN_CNTL, reg);
 	}
 }
-#endif /* CONFIG_PMAC_PBOOK */
 
 static void aty128_set_pll(struct aty128_pll *pll, const struct aty128fb_par *par)
 {
@@ -1491,12 +1487,10 @@ static int aty128fb_set_par(struct fb_info *info)
 	info->fix.visual = par->crtc.bpp == 8 ? FB_VISUAL_PSEUDOCOLOR
 		: FB_VISUAL_DIRECTCOLOR;
 
-#ifdef CONFIG_PMAC_PBOOK
 	if (par->chip_gen == rage_M3) {
 		aty128_set_crt_enable(par, par->crt_on);
 		aty128_set_lcd_enable(par, par->lcd_on);
 	}
-#endif
 	if (par->accel_flags & FB_ACCELF_TEXT)
 		aty128_init_engine(par);
 
@@ -1652,7 +1646,6 @@ static int __init aty128fb_setup(char *options)
 		return 0;
 
 	while ((this_opt = strsep(&options, ",")) != NULL) {
-#ifdef CONFIG_PMAC_PBOOK
 		if (!strncmp(this_opt, "lcd:", 4)) {
 			default_lcd_on = simple_strtoul(this_opt+4, NULL, 0);
 			continue;
@@ -1660,7 +1653,6 @@ static int __init aty128fb_setup(char *options)
 			default_crt_on = simple_strtoul(this_opt+4, NULL, 0);
 			continue;
 		}
-#endif
 #ifdef CONFIG_MTRR
 		if(!strncmp(this_opt, "nomtrr", 6)) {
 			mtrr = 0;
@@ -1752,10 +1744,8 @@ static int __init aty128_init(struct pci_dev *pdev, const struct pci_device_id *
 	info->fbops = &aty128fb_ops;
 	info->flags = FBINFO_FLAG_DEFAULT;
 
-#ifdef CONFIG_PMAC_PBOOK
 	par->lcd_on = default_lcd_on;
 	par->crt_on = default_crt_on;
-#endif
 
 	var = default_var;
 #ifdef CONFIG_PPC_PMAC
@@ -2035,12 +2025,10 @@ static int aty128fb_blank(int blank, struct fb_info *fb)
 
 	aty_st_8(CRTC_EXT_CNTL+1, state);
 
-#ifdef CONFIG_PMAC_PBOOK
 	if (par->chip_gen == rage_M3) {
 		aty128_set_crt_enable(par, par->crt_on && !blank);
 		aty128_set_lcd_enable(par, par->lcd_on && !blank);
 	}
-#endif	
 #ifdef CONFIG_PMAC_BACKLIGHT
 	if ((_machine == _MACH_Pmac) && !blank)
 		set_backlight_enable(1);
@@ -2124,7 +2112,6 @@ static int aty128fb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
 static int aty128fb_ioctl(struct inode *inode, struct file *file, u_int cmd,
 			  u_long arg, struct fb_info *info)
 {
-#ifdef CONFIG_PMAC_PBOOK
 	struct aty128fb_par *par = info->par;
 	u32 value;
 	int rc;
@@ -2149,7 +2136,6 @@ static int aty128fb_ioctl(struct inode *inode, struct file *file, u_int cmd,
 		value = (par->crt_on << 1) | par->lcd_on;
 		return put_user(value, (__u32 __user *)arg);
 	}
-#endif
 	return -EINVAL;
 }
 
diff --git a/drivers/video/chipsfb.c b/drivers/video/chipsfb.c
index 95e72550d43f..e75a965ec760 100644
--- a/drivers/video/chipsfb.c
+++ b/drivers/video/chipsfb.c
@@ -28,22 +28,17 @@
 #include <linux/fb.h>
 #include <linux/init.h>
 #include <linux/pci.h>
+#include <linux/console.h>
 #include <asm/io.h>
 
 #ifdef CONFIG_PMAC_BACKLIGHT
 #include <asm/backlight.h>
 #endif
-#ifdef CONFIG_PMAC_PBOOK
-#include <linux/adb.h>
-#include <linux/pmu.h>
-#endif
 
 /*
  * Since we access the display with inb/outb to fixed port numbers,
  * we can only handle one 6555x chip.  -- paulus
  */
-static struct fb_info chipsfb_info;
-
 #define write_ind(num, val, ap, dp)	do { \
 	outb((num), (ap)); outb((val), (dp)); \
 } while (0)
@@ -74,14 +69,6 @@ static struct fb_info chipsfb_info;
 	inb(0x3da); read_ind(num, var, 0x3c0, 0x3c1); \
 } while (0)
 
-#ifdef CONFIG_PMAC_PBOOK
-static unsigned char *save_framebuffer;
-int chips_sleep_notify(struct pmu_sleep_notifier *self, int when);
-static struct pmu_sleep_notifier chips_sleep_notifier = {
-	chips_sleep_notify, SLEEP_LEVEL_VIDEO,
-};
-#endif
-
 /*
  * Exported functions
  */
@@ -356,6 +343,8 @@ static struct fb_var_screeninfo chipsfb_var __initdata = {
 
 static void __init init_chips(struct fb_info *p, unsigned long addr)
 {
+	memset(p->screen_base, 0, 0x100000);
+
 	p->fix = chipsfb_fix;
 	p->fix.smem_start = addr;
 
@@ -366,34 +355,41 @@ static void __init init_chips(struct fb_info *p, unsigned long addr)
 
 	fb_alloc_cmap(&p->cmap, 256, 0);
 
-	if (register_framebuffer(p) < 0) {
-		printk(KERN_ERR "C&T 65550 framebuffer failed to register\n");
-		return;
-	}
-
-	printk(KERN_INFO "fb%d: Chips 65550 frame buffer (%dK RAM detected)\n",
-		p->node, p->fix.smem_len / 1024);
-
 	chips_hw_init();
 }
 
 static int __devinit
 chipsfb_pci_init(struct pci_dev *dp, const struct pci_device_id *ent)
 {
-	struct fb_info *p = &chipsfb_info;
+	struct fb_info *p;
 	unsigned long addr, size;
 	unsigned short cmd;
+	int rc = -ENODEV;
+
+	if (pci_enable_device(dp) < 0) {
+		dev_err(&dp->dev, "Cannot enable PCI device\n");
+		goto err_out;
+	}
 
 	if ((dp->resource[0].flags & IORESOURCE_MEM) == 0)
-		return -ENODEV;
+		goto err_disable;
 	addr = pci_resource_start(dp, 0);
 	size = pci_resource_len(dp, 0);
 	if (addr == 0)
-		return -ENODEV;
-	if (p->screen_base != 0)
-		return -EBUSY;
-	if (!request_mem_region(addr, size, "chipsfb"))
-		return -EBUSY;
+		goto err_disable;
+
+	p = framebuffer_alloc(0, &dp->dev);
+	if (p == NULL) {
+		dev_err(&dp->dev, "Cannot allocate framebuffer structure\n");
+		rc = -ENOMEM;
+		goto err_disable;
+	}
+
+	if (pci_request_region(dp, 0, "chipsfb") != 0) {
+		dev_err(&dp->dev, "Cannot request framebuffer\n");
+		rc = -EBUSY;
+		goto err_release_fb;
+	}
 
 #ifdef __BIG_ENDIAN
 	addr += 0x800000;	// Use big-endian aperture
@@ -411,37 +407,89 @@ chipsfb_pci_init(struct pci_dev *dp, const struct pci_device_id *ent)
 	set_backlight_enable(1);
 #endif /* CONFIG_PMAC_BACKLIGHT */
 
+#ifdef CONFIG_PPC
 	p->screen_base = __ioremap(addr, 0x200000, _PAGE_NO_CACHE);
+#else
+	p->screen_base = ioremap(addr, 0x200000);
+#endif
 	if (p->screen_base == NULL) {
-		release_mem_region(addr, size);
-		return -ENOMEM;
+		dev_err(&dp->dev, "Cannot map framebuffer\n");
+		rc = -ENOMEM;
+		goto err_release_pci;
 	}
+
+	pci_set_drvdata(dp, p);
 	p->device = &dp->dev;
+
 	init_chips(p, addr);
 
-#ifdef CONFIG_PMAC_PBOOK
-	pmu_register_sleep_notifier(&chips_sleep_notifier);
-#endif /* CONFIG_PMAC_PBOOK */
+	if (register_framebuffer(p) < 0) {
+		dev_err(&dp->dev,"C&T 65550 framebuffer failed to register\n");
+		goto err_unmap;
+	}
+
+	dev_info(&dp->dev,"fb%d: Chips 65550 frame buffer"
+		 " (%dK RAM detected)\n",
+		 p->node, p->fix.smem_len / 1024);
 
-	pci_set_drvdata(dp, p);
 	return 0;
+
+ err_unmap:
+	iounmap(p->screen_base);
+ err_release_pci:
+	pci_release_region(dp, 0);
+ err_release_fb:
+	framebuffer_release(p);
+ err_disable:
+ err_out:
+	return rc;
 }
 
 static void __devexit chipsfb_remove(struct pci_dev *dp)
 {
 	struct fb_info *p = pci_get_drvdata(dp);
 
-	if (p != &chipsfb_info || p->screen_base == NULL)
+	if (p->screen_base == NULL)
 		return;
 	unregister_framebuffer(p);
 	iounmap(p->screen_base);
 	p->screen_base = NULL;
-	release_mem_region(pci_resource_start(dp, 0), pci_resource_len(dp, 0));
+	pci_release_region(dp, 0);
+}
+
+#ifdef CONFIG_PM
+static int chipsfb_pci_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+	struct fb_info *info = pci_get_drvdata(pdev);
+
+	/* Already in the requested power state: nothing to do. */
+	if (pdev->dev.power.power_state == state)
+		return 0;
+	/* Blank and suspend the fb only when state is PM_SUSPEND_MEM. */
+	if (state == PM_SUSPEND_MEM) {
+		acquire_console_sem();
+		chipsfb_blank(1, info);
+		fb_set_suspend(info, 1);
+		release_console_sem();
+	}
+	pdev->dev.power.power_state = state;
+	return 0;
+}
+
+static int chipsfb_pci_resume(struct pci_dev *pdev)
+{
+        struct fb_info *p = pci_get_drvdata(pdev);
 
-#ifdef CONFIG_PMAC_PBOOK
-	pmu_unregister_sleep_notifier(&chips_sleep_notifier);
-#endif /* CONFIG_PMAC_PBOOK */
+	acquire_console_sem();
+	fb_set_suspend(p, 0);
+	chipsfb_blank(0, p);
+	release_console_sem();
+
+	pdev->dev.power.power_state = PMSG_ON;
+	return 0;
 }
+#endif /* CONFIG_PM */
+
 
 static struct pci_device_id chipsfb_pci_tbl[] = {
 	{ PCI_VENDOR_ID_CT, PCI_DEVICE_ID_CT_65550, PCI_ANY_ID, PCI_ANY_ID },
@@ -455,6 +503,10 @@ static struct pci_driver chipsfb_driver = {
 	.id_table =	chipsfb_pci_tbl,
 	.probe =	chipsfb_pci_init,
 	.remove =	__devexit_p(chipsfb_remove),
+#ifdef CONFIG_PM
+	.suspend =	chipsfb_pci_suspend,
+	.resume =	chipsfb_pci_resume,
+#endif
 };
 
 int __init chips_init(void)
@@ -472,48 +524,4 @@ static void __exit chipsfb_exit(void)
 	pci_unregister_driver(&chipsfb_driver);
 }
 
-#ifdef CONFIG_PMAC_PBOOK
-/*
- * Save the contents of the frame buffer when we go to sleep,
- * and restore it when we wake up again.
- */
-int
-chips_sleep_notify(struct pmu_sleep_notifier *self, int when)
-{
-	struct fb_info *p = &chipsfb_info;
-	int nb = p->var.yres * p->fix.line_length;
-
-	if (p->screen_base == NULL)
-		return PBOOK_SLEEP_OK;
-
-	switch (when) {
-	case PBOOK_SLEEP_REQUEST:
-		save_framebuffer = vmalloc(nb);
-		if (save_framebuffer == NULL)
-			return PBOOK_SLEEP_REFUSE;
-		break;
-	case PBOOK_SLEEP_REJECT:
-		if (save_framebuffer) {
-			vfree(save_framebuffer);
-			save_framebuffer = NULL;
-		}
-		break;
-	case PBOOK_SLEEP_NOW:
-		chipsfb_blank(1, p);
-		if (save_framebuffer)
-			memcpy(save_framebuffer, p->screen_base, nb);
-		break;
-	case PBOOK_WAKE:
-		if (save_framebuffer) {
-			memcpy(p->screen_base, save_framebuffer, nb);
-			vfree(save_framebuffer);
-			save_framebuffer = NULL;
-		}
-		chipsfb_blank(0, p);
-		break;
-	}
-	return PBOOK_SLEEP_OK;
-}
-#endif /* CONFIG_PMAC_PBOOK */
-
 MODULE_LICENSE("GPL");
diff --git a/include/linux/pmu.h b/include/linux/pmu.h
index 6d73eada277e..373bd3b9b330 100644
--- a/include/linux/pmu.h
+++ b/include/linux/pmu.h
@@ -166,7 +166,7 @@ extern int pmu_i2c_simple_read(int bus, int addr,  u8* data, int len);
 extern int pmu_i2c_simple_write(int bus, int addr,  u8* data, int len);
 
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 /*
  * Stuff for putting the powerbook to sleep and waking it again.
  *
@@ -208,6 +208,8 @@ struct pmu_sleep_notifier
 int pmu_register_sleep_notifier(struct pmu_sleep_notifier* notifier);
 int pmu_unregister_sleep_notifier(struct pmu_sleep_notifier* notifier);
 
+#endif /* CONFIG_PM */
+
 #define PMU_MAX_BATTERIES	2
 
 /* values for pmu_power_flags */
@@ -235,6 +237,4 @@ extern int pmu_battery_count;
 extern struct pmu_battery_info pmu_batteries[PMU_MAX_BATTERIES];
 extern unsigned int pmu_power_flags;
 
-#endif /* CONFIG_PMAC_PBOOK */
-
 #endif	/* __KERNEL__ */
diff --git a/sound/oss/dmasound/dmasound_awacs.c b/sound/oss/dmasound/dmasound_awacs.c
index 33108661e671..2704e1598add 100644
--- a/sound/oss/dmasound/dmasound_awacs.c
+++ b/sound/oss/dmasound/dmasound_awacs.c
@@ -255,7 +255,7 @@ static int awacs_burgundy_read_mvolume(unsigned address);
 
 static volatile struct dbdma_cmd *emergency_dbdma_cmd;
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 /*
  * Stuff for restoring after a sleep.
  */
@@ -263,7 +263,7 @@ static int awacs_sleep_notify(struct pmu_sleep_notifier *self, int when);
 struct pmu_sleep_notifier awacs_sleep_notifier = {
 	awacs_sleep_notify, SLEEP_LEVEL_SOUND,
 };
-#endif /* CONFIG_PMAC_PBOOK */
+#endif /* CONFIG_PM */
 
 /* for (soft) sample rate translations */
 int expand_bal;		/* Balance factor for expanding (not volume!) */
@@ -675,7 +675,7 @@ static void PMacIrqCleanup(void)
 	kfree(awacs_rx_cmd_space);
 	kfree(beep_dbdma_cmd_space);
 	kfree(beep_buf);
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 	pmu_unregister_sleep_notifier(&awacs_sleep_notifier);
 #endif
 }
@@ -1415,7 +1415,7 @@ load_awacs(void)
 	}
 }
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 /*
  * Save state when going to sleep, restore it afterwards.
  */
@@ -1551,7 +1551,7 @@ static int awacs_sleep_notify(struct pmu_sleep_notifier *self, int when)
 	}
 	return PBOOK_SLEEP_OK;
 }
-#endif /* CONFIG_PMAC_PBOOK */
+#endif /* CONFIG_PM */
 
 
 /* All the burgundy functions: */
@@ -3053,9 +3053,9 @@ printk("dmasound_pmac: Awacs/Screamer Codec Mfct: %d Rev %d\n", mfg, rev);
 	if ((res=setup_beep()))
 		return res ;
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 	pmu_register_sleep_notifier(&awacs_sleep_notifier);
-#endif /* CONFIG_PMAC_PBOOK */
+#endif /* CONFIG_PM */
 
 	/* Powerbooks have odd ways of enabling inputs such as
 	   an expansion-bay CD or sound from an internal modem
diff --git a/sound/ppc/awacs.c b/sound/ppc/awacs.c
index e052bd071e5b..061e52d3d771 100644
--- a/sound/ppc/awacs.c
+++ b/sound/ppc/awacs.c
@@ -90,7 +90,7 @@ snd_pmac_awacs_write_noreg(pmac_t *chip, int reg, int val)
 	snd_pmac_awacs_write(chip, val | (reg << 12));
 }
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 /* Recalibrate chip */
 static void screamer_recalibrate(pmac_t *chip)
 {
@@ -642,7 +642,7 @@ static void awacs_restore_all_regs(pmac_t *chip)
 	}
 }
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 static void snd_pmac_awacs_suspend(pmac_t *chip)
 {
 	snd_pmac_awacs_write_noreg(chip, 1, (chip->awacs_reg[1]
@@ -676,7 +676,7 @@ static void snd_pmac_awacs_resume(pmac_t *chip)
 	}
 #endif
 }
-#endif /* CONFIG_PMAC_PBOOK */
+#endif /* CONFIG_PM */
 
 #ifdef PMAC_SUPPORT_AUTOMUTE
 /*
@@ -883,7 +883,7 @@ snd_pmac_awacs_init(pmac_t *chip)
 	 * set lowlevel callbacks
 	 */
 	chip->set_format = snd_pmac_awacs_set_format;
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 	chip->suspend = snd_pmac_awacs_suspend;
 	chip->resume = snd_pmac_awacs_resume;
 #endif
diff --git a/sound/ppc/daca.c b/sound/ppc/daca.c
index f24a91693616..a737f298e77d 100644
--- a/sound/ppc/daca.c
+++ b/sound/ppc/daca.c
@@ -218,7 +218,7 @@ static snd_kcontrol_new_t daca_mixers[] = {
 };
 
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 static void daca_resume(pmac_t *chip)
 {
 	pmac_daca_t *mix = chip->mixer_data;
@@ -227,7 +227,7 @@ static void daca_resume(pmac_t *chip)
 				  mix->amp_on ? 0x05 : 0x04);
 	daca_set_volume(mix);
 }
-#endif /* CONFIG_PMAC_PBOOK */
+#endif /* CONFIG_PM */
 
 
 static void daca_cleanup(pmac_t *chip)
@@ -275,7 +275,7 @@ int __init snd_pmac_daca_init(pmac_t *chip)
 			return err;
 	}
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 	chip->resume = daca_resume;
 #endif
 
diff --git a/sound/ppc/pmac.c b/sound/ppc/pmac.c
index 080ef3928465..75b8b7423036 100644
--- a/sound/ppc/pmac.c
+++ b/sound/ppc/pmac.c
@@ -36,7 +36,7 @@
 #include <asm/pci-bridge.h>
 
 
-#if defined(CONFIG_PM) && defined(CONFIG_PMAC_PBOOK)
+#ifdef CONFIG_PM
 static int snd_pmac_register_sleep_notifier(pmac_t *chip);
 static int snd_pmac_unregister_sleep_notifier(pmac_t *chip);
 static int snd_pmac_suspend(snd_card_t *card, pm_message_t state);
@@ -782,7 +782,7 @@ static int snd_pmac_free(pmac_t *chip)
 	}
 
 	snd_pmac_sound_feature(chip, 0);
-#if defined(CONFIG_PM) && defined(CONFIG_PMAC_PBOOK)
+#ifdef CONFIG_PM
 	snd_pmac_unregister_sleep_notifier(chip);
 #endif
 
@@ -1292,7 +1292,7 @@ int __init snd_pmac_new(snd_card_t *card, pmac_t **chip_return)
 	/* Reset dbdma channels */
 	snd_pmac_dbdma_reset(chip);
 
-#if defined(CONFIG_PM) && defined(CONFIG_PMAC_PBOOK)
+#ifdef CONFIG_PM
 	/* add sleep notifier */
 	if (! snd_pmac_register_sleep_notifier(chip))
 		snd_card_set_pm_callback(chip->card, snd_pmac_suspend, snd_pmac_resume, chip);
@@ -1316,7 +1316,7 @@ int __init snd_pmac_new(snd_card_t *card, pmac_t **chip_return)
  * sleep notify for powerbook
  */
 
-#if defined(CONFIG_PM) && defined(CONFIG_PMAC_PBOOK)
+#ifdef CONFIG_PM
 
 /*
  * Save state when going to sleep, restore it afterwards.
@@ -1414,4 +1414,5 @@ static int snd_pmac_unregister_sleep_notifier(pmac_t *chip)
 	return 0;
 }
 
-#endif /* CONFIG_PM && CONFIG_PMAC_PBOOK */
+#endif /* CONFIG_PM */
+
diff --git a/sound/ppc/pmac.h b/sound/ppc/pmac.h
index 0a84c05f714b..582db5220119 100644
--- a/sound/ppc/pmac.h
+++ b/sound/ppc/pmac.h
@@ -167,7 +167,7 @@ struct snd_pmac {
 	void (*set_format)(pmac_t *chip);
 	void (*update_automute)(pmac_t *chip, int do_notify);
 	int (*detect_headphone)(pmac_t *chip);
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 	void (*suspend)(pmac_t *chip);
 	void (*resume)(pmac_t *chip);
 #endif
diff --git a/sound/ppc/tumbler.c b/sound/ppc/tumbler.c
index 9332237cb6a4..36c5d5d45bb1 100644
--- a/sound/ppc/tumbler.c
+++ b/sound/ppc/tumbler.c
@@ -1128,7 +1128,7 @@ static void tumbler_reset_audio(pmac_t *chip)
 	}
 }
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 /* suspend mixer */
 static void tumbler_suspend(pmac_t *chip)
 {
@@ -1370,7 +1370,7 @@ int __init snd_pmac_tumbler_init(pmac_t *chip)
 	if ((err = snd_ctl_add(chip->card, chip->drc_sw_ctl)) < 0)
 		return err;
 
-#ifdef CONFIG_PMAC_PBOOK
+#ifdef CONFIG_PM
 	chip->suspend = tumbler_suspend;
 	chip->resume = tumbler_resume;
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From ffaa8bd6c904d1ab79b677905067349a5ff51d84 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <andrea@suse.de>
Date: Mon, 27 Jun 2005 14:36:36 -0700
Subject: [PATCH] seccomp: tsc disable

I believe at least for seccomp it's worth turning off the tsc, not just for
HT but for the L2 cache too.  So it's up to you: either you turn it off
completely (which isn't very nice IMHO) or I recommend applying the patch
below.

This has been tested successfully on x86-64 against the current cogito
repository (i686 compiles so I didn't bother testing ;).  People selling
the cpu through cpushare may appreciate this bit for peace of mind.

There's no way to get any timing info anymore with this applied
(gettimeofday is forbidden of course).  The seccomp environment is
completely deterministic so it can't be allowed to get timing info, it has
to be deterministic so in the future I can enable a computing mode that
does a parallel computing for each task with server side transparent
checkpointing and verification that the output is the same from all the 2/3
seller computers for each task, without the buyer even noticing (for now
the verification is left to the buyer client side and there's no
checkpointing, since that would require more kernel changes to track the
dirty bits but it'll be easy to extend once the basic mode is finished).

Eliminating a cold-cache read of the cr4 global variable will save one
cacheline during the tlb flush while making the code per-cpu-safe at the
same time.  Thanks to Mikael Pettersson for noticing the tlb flush wasn't
per-cpu-safe.

The global tlb flush can run from irq (IPI calling do_flush_tlb_all) but
it'll be transparent to the switch_to code since the IPI won't make any
change to the cr4 contents from the point of view of the interrupted code
and since it's now all per-cpu stuff, it will not race.  So no need to
disable irqs in switch_to slow path.

Signed-off-by: Andrea Arcangeli <andrea@cpushare.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/process.c    | 29 +++++++++++++++++++++++++++++
 arch/x86_64/kernel/process.c  | 29 +++++++++++++++++++++++++++++
 include/asm-i386/tlbflush.h   | 12 +++++++-----
 include/asm-x86_64/tlbflush.h | 12 +++++++-----
 include/linux/seccomp.h       | 10 ++++++++++
 5 files changed, 82 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 5f8cfa6b7940..ba243a4cc119 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -616,6 +616,33 @@ handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss)
 	tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
 }
 
+/*
+ * This function selects if the context switch from prev to next
+ * has to tweak the TSC disable bit in the cr4.
+ */
+static inline void disable_tsc(struct task_struct *prev_p,
+			       struct task_struct *next_p)
+{
+	struct thread_info *prev, *next;
+
+	/*
+	 * gcc should eliminate the ->thread_info dereference if
+	 * has_secure_computing returns 0 at compile time (SECCOMP=n).
+	 */
+	prev = prev_p->thread_info;
+	next = next_p->thread_info;
+
+	if (has_secure_computing(prev) || has_secure_computing(next)) {
+		/* slow path here */
+		if (has_secure_computing(prev) &&
+		    !has_secure_computing(next)) {
+			write_cr4(read_cr4() & ~X86_CR4_TSD);
+		} else if (!has_secure_computing(prev) &&
+			   has_secure_computing(next))
+			write_cr4(read_cr4() | X86_CR4_TSD);
+	}
+}
+
 /*
  *	switch_to(x,yn) should switch tasks from x to y.
  *
@@ -695,6 +722,8 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 	if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr))
 		handle_io_bitmap(next, tss);
 
+	disable_tsc(prev_p, next_p);
+
 	return prev_p;
 }
 
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 1d91271796e5..7577f9d7a75d 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -481,6 +481,33 @@ out:
 	return err;
 }
 
+/*
+ * This function selects if the context switch from prev to next
+ * has to tweak the TSC disable bit in the cr4.
+ */
+static inline void disable_tsc(struct task_struct *prev_p,
+			       struct task_struct *next_p)
+{
+	struct thread_info *prev, *next;
+
+	/*
+	 * gcc should eliminate the ->thread_info dereference if
+	 * has_secure_computing returns 0 at compile time (SECCOMP=n).
+	 */
+	prev = prev_p->thread_info;
+	next = next_p->thread_info;
+
+	if (has_secure_computing(prev) || has_secure_computing(next)) {
+		/* slow path here */
+		if (has_secure_computing(prev) &&
+		    !has_secure_computing(next)) {
+			write_cr4(read_cr4() & ~X86_CR4_TSD);
+		} else if (!has_secure_computing(prev) &&
+			   has_secure_computing(next))
+			write_cr4(read_cr4() | X86_CR4_TSD);
+	}
+}
+
 /*
  * This special macro can be used to load a debugging register
  */
@@ -599,6 +626,8 @@ struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *
 		}
 	}
 
+	disable_tsc(prev_p, next_p);
+
 	return prev_p;
 }
 
diff --git a/include/asm-i386/tlbflush.h b/include/asm-i386/tlbflush.h
index f22fab0cea26..ab216e1370ef 100644
--- a/include/asm-i386/tlbflush.h
+++ b/include/asm-i386/tlbflush.h
@@ -22,16 +22,18 @@
  */
 #define __flush_tlb_global()						\
 	do {								\
-		unsigned int tmpreg;					\
+		unsigned int tmpreg, cr4, cr4_orig;			\
 									\
 		__asm__ __volatile__(					\
-			"movl %1, %%cr4;  # turn off PGE     \n"	\
+			"movl %%cr4, %2;  # turn off PGE     \n"	\
+			"movl %2, %1;                        \n"	\
+			"andl %3, %1;                        \n"	\
+			"movl %1, %%cr4;                     \n"	\
 			"movl %%cr3, %0;                     \n"	\
 			"movl %0, %%cr3;  # flush TLB        \n"	\
 			"movl %2, %%cr4;  # turn PGE back on \n"	\
-			: "=&r" (tmpreg)				\
-			: "r" (mmu_cr4_features & ~X86_CR4_PGE),	\
-			  "r" (mmu_cr4_features)			\
+			: "=&r" (tmpreg), "=&r" (cr4), "=&r" (cr4_orig)	\
+			: "i" (~X86_CR4_PGE)				\
 			: "memory");					\
 	} while (0)
 
diff --git a/include/asm-x86_64/tlbflush.h b/include/asm-x86_64/tlbflush.h
index 2e811ac262af..061742382520 100644
--- a/include/asm-x86_64/tlbflush.h
+++ b/include/asm-x86_64/tlbflush.h
@@ -22,16 +22,18 @@
  */
 #define __flush_tlb_global()						\
 	do {								\
-		unsigned long tmpreg;					\
+		unsigned long tmpreg, cr4, cr4_orig;			\
 									\
 		__asm__ __volatile__(					\
-			"movq %1, %%cr4;  # turn off PGE     \n"	\
+			"movq %%cr4, %2;  # turn off PGE     \n"	\
+			"movq %2, %1;                        \n"	\
+			"andq %3, %1;                        \n"	\
+			"movq %1, %%cr4;                     \n"	\
 			"movq %%cr3, %0;  # flush TLB        \n"	\
 			"movq %0, %%cr3;                     \n"	\
 			"movq %2, %%cr4;  # turn PGE back on \n"	\
-			: "=&r" (tmpreg)				\
-			: "r" (mmu_cr4_features & ~X86_CR4_PGE),	\
-			  "r" (mmu_cr4_features)			\
+			: "=&r" (tmpreg), "=&r" (cr4), "=&r" (cr4_orig)	\
+			: "i" (~X86_CR4_PGE)				\
 			: "memory");					\
 	} while (0)
 
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 3a2702bbb1d6..dc89116bb1ca 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -19,6 +19,11 @@ static inline void secure_computing(int this_syscall)
 		__secure_computing(this_syscall);
 }
 
+static inline int has_secure_computing(struct thread_info *ti)
+{
+	return unlikely(test_ti_thread_flag(ti, TIF_SECCOMP));
+}
+
 #else /* CONFIG_SECCOMP */
 
 #if (__GNUC__ > 2)
@@ -28,6 +33,11 @@ static inline void secure_computing(int this_syscall)
 #endif
 
 #define secure_computing(x) do { } while (0)
+/* static inline to preserve typechecking */
+static inline int has_secure_computing(struct thread_info *ti)
+{
+	return 0;
+}
 
 #endif /* CONFIG_SECCOMP */
 
-- 
cgit v1.2.3-59-g8ed1b


From 3de0a70bd926ff974adb27a38d4fd1049f05e54e Mon Sep 17 00:00:00 2001
From: Mike Miller <mike.miller@hp.com>
Date: Mon, 27 Jun 2005 14:36:48 -0700
Subject: [PATCH] cciss: pci id fix

This patch fixes a PCI ID I got wrong before.  It also adds support for
another new SAS controller due out this summer.  I didn't have a marketing
name prior to my last submission.  Also modifies the copyright date range.

Signed-off-by: Mike Miller <mike.miller@hp.com>
Acked-by: Jeff Garzik <jgarzik@pobox.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/cciss.txt | 1 +
 drivers/block/cciss.c   | 9 ++++++---
 include/linux/pci_ids.h | 3 ++-
 3 files changed, 9 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cciss.txt b/Documentation/cciss.txt
index d599beb9df8a..c8f9a73111da 100644
--- a/Documentation/cciss.txt
+++ b/Documentation/cciss.txt
@@ -17,6 +17,7 @@ This driver is known to work with the following cards:
 	* SA P600
 	* SA P800
 	* SA E400
+	* SA E300
 
 If nodes are not already created in the /dev/cciss directory, run as root:
 
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index abde27027c06..0cd606ce222a 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1,6 +1,6 @@
 /*
  *    Disk Array driver for HP SA 5xxx and 6xxx Controllers
- *    Copyright 2000, 2002 Hewlett-Packard Development Company, L.P.
+ *    Copyright 2000, 2005 Hewlett-Packard Development Company, L.P.
  *
  *    This program is free software; you can redistribute it and/or modify
  *    it under the terms of the GNU General Public License as published by
@@ -54,7 +54,7 @@
 MODULE_AUTHOR("Hewlett-Packard Company");
 MODULE_DESCRIPTION("Driver for HP Controller SA5xxx SA6xxx version 2.6.6");
 MODULE_SUPPORTED_DEVICE("HP SA5i SA5i+ SA532 SA5300 SA5312 SA641 SA642 SA6400"
-			" SA6i P600 P800 E400");
+			" SA6i P600 P800 E400 E300");
 MODULE_LICENSE("GPL");
 
 #include "cciss_cmd.h"
@@ -85,8 +85,10 @@ static const struct pci_device_id cciss_pci_device_id[] = {
 		0x103C, 0x3225, 0, 0, 0},
 	{ PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSB,
 		0x103c, 0x3223, 0, 0, 0},
-	{ PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSB,
+	{ PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC,
 		0x103c, 0x3231, 0, 0, 0},
+	{ PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC,
+		0x103c, 0x3233, 0, 0, 0},
 	{0,}
 };
 MODULE_DEVICE_TABLE(pci, cciss_pci_device_id);
@@ -110,6 +112,7 @@ static struct board_type products[] = {
 	{ 0x3225103C, "Smart Array P600", &SA5_access},
 	{ 0x3223103C, "Smart Array P800", &SA5_access},
 	{ 0x3231103C, "Smart Array E400", &SA5_access},
+	{ 0x3233103C, "Smart Array E300", &SA5_access},
 };
 
 /* How long to wait (in millesconds) for board to go into simple mode */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 3af7450278b7..a3961e1d5183 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -712,8 +712,9 @@
 #define PCI_DEVICE_ID_HP_DIVA_AUX	0x1290
 #define PCI_DEVICE_ID_HP_DIVA_RMP3	0x1301
 #define PCI_DEVICE_ID_HP_CISSA		0x3220
-#define PCI_DEVICE_ID_HP_CISSB		0x3230
+#define PCI_DEVICE_ID_HP_CISSB		0x3222
 #define PCI_DEVICE_ID_HP_ZX2_IOC	0x4031
+#define PCI_DEVICE_ID_HP_CISSC		0x3230
 
 #define PCI_VENDOR_ID_PCTECH		0x1042
 #define PCI_DEVICE_ID_PCTECH_RZ1000	0x1000
-- 
cgit v1.2.3-59-g8ed1b


From cd6fb584cf7f18ec6b221192b57d712ecc8c1859 Mon Sep 17 00:00:00 2001
From: Mike Miller <mike.miller@hp.com>
Date: Mon, 27 Jun 2005 14:36:49 -0700
Subject: [PATCH] cciss: pci domain info pass 2

This is pass 2 of my patch to add pci domain info to an existing ioctl.  This
time I insert the domain between dev_fn and board_id as Willy suggested and
change the var to unsigned short to ease Christoph's concerns.  Although I
thought unsigned int was the correct var type for this.  I also thought it
didn't matter where I inserted it in the structure.

Signed-off-by: Mike Miller <mike.miller@hp.com>
Acked-by: Jeff Garzik <jgarzik@pobox.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/cciss.c       | 1 +
 include/linux/cciss_ioctl.h | 1 +
 2 files changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 0cd606ce222a..d5d0fa538f12 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -638,6 +638,7 @@ static int cciss_ioctl(struct inode *inode, struct file *filep,
 		cciss_pci_info_struct pciinfo;
 
 		if (!arg) return -EINVAL;
+		pciinfo.domain = pci_domain_nr(host->pdev->bus);
 		pciinfo.bus = host->pdev->bus->number;
 		pciinfo.dev_fn = host->pdev->devfn;
 		pciinfo.board_id = host->board_id;
diff --git a/include/linux/cciss_ioctl.h b/include/linux/cciss_ioctl.h
index ee0c6e8995da..424d5e622b43 100644
--- a/include/linux/cciss_ioctl.h
+++ b/include/linux/cciss_ioctl.h
@@ -10,6 +10,7 @@
 typedef struct _cciss_pci_info_struct
 {
 	unsigned char 	bus;
+	unsigned short	domain;
 	unsigned char 	dev_fn;
 	__u32 		board_id;
 } cciss_pci_info_struct; 
-- 
cgit v1.2.3-59-g8ed1b


From 9ec4b1f356b3bad928ae8e2aa9caebfa737d52df Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Mon, 27 Jun 2005 15:17:01 -0700
Subject: [PATCH] kprobes: fix single-step out of line - take2

Now that PPC64 has no-execute support, here is a second try to fix the
single step out of line during kprobe execution.  Kprobes on x86_64 already
solved this problem by allocating an executable page and using it as the
scratch area for stepping out of line.  Reuse that.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc64/kernel/kprobes.c  |  26 ++++++++--
 arch/x86_64/kernel/kprobes.c | 113 +------------------------------------------
 include/asm-ia64/kprobes.h   |   1 +
 include/asm-ppc64/kprobes.h  |   2 +-
 include/linux/kprobes.h      |   2 +
 kernel/kprobes.c             | 101 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 128 insertions(+), 117 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ppc64/kernel/kprobes.c b/arch/ppc64/kernel/kprobes.c
index 782ce3efa2c1..86cc5496db9f 100644
--- a/arch/ppc64/kernel/kprobes.c
+++ b/arch/ppc64/kernel/kprobes.c
@@ -36,6 +36,8 @@
 #include <asm/kdebug.h>
 #include <asm/sstep.h>
 
+static DECLARE_MUTEX(kprobe_mutex);
+
 static struct kprobe *current_kprobe;
 static unsigned long kprobe_status, kprobe_saved_msr;
 static struct kprobe *kprobe_prev;
@@ -54,6 +56,15 @@ int arch_prepare_kprobe(struct kprobe *p)
 		printk("Cannot register a kprobe on rfid or mtmsrd\n");
 		ret = -EINVAL;
 	}
+
+	/* insn must be on a special executable page on ppc64 */
+	if (!ret) {
+		up(&kprobe_mutex);
+		p->ainsn.insn = get_insn_slot();
+		down(&kprobe_mutex);
+		if (!p->ainsn.insn)
+			ret = -ENOMEM;
+	}
 	return ret;
 }
 
@@ -79,16 +90,22 @@ void arch_disarm_kprobe(struct kprobe *p)
 
 void arch_remove_kprobe(struct kprobe *p)
 {
+	up(&kprobe_mutex);
+	free_insn_slot(p->ainsn.insn);
+	down(&kprobe_mutex);
 }
 
 static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 {
+	kprobe_opcode_t insn = *p->ainsn.insn;
+
 	regs->msr |= MSR_SE;
-	/*single step inline if it a breakpoint instruction*/
-	if (p->opcode == BREAKPOINT_INSTRUCTION)
+
+	/* single step inline if it is a trap variant */
+	if (IS_TW(insn) || IS_TD(insn) || IS_TWI(insn) || IS_TDI(insn))
 		regs->nip = (unsigned long)p->addr;
 	else
-		regs->nip = (unsigned long)&p->ainsn.insn;
+		regs->nip = (unsigned long)p->ainsn.insn;
 }
 
 static inline void save_previous_kprobe(void)
@@ -205,9 +222,10 @@ no_kprobe:
 static void resume_execution(struct kprobe *p, struct pt_regs *regs)
 {
 	int ret;
+	unsigned int insn = *p->ainsn.insn;
 
 	regs->nip = (unsigned long)p->addr;
-	ret = emulate_step(regs, p->ainsn.insn[0]);
+	ret = emulate_step(regs, insn);
 	if (ret == 0)
 		regs->nip = (unsigned long)p->addr + 4;
 }
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index 4e680f87a75f..6a1c88376bef 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -38,7 +38,7 @@
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/preempt.h>
-#include <linux/moduleloader.h>
+
 #include <asm/cacheflush.h>
 #include <asm/pgtable.h>
 #include <asm/kdebug.h>
@@ -51,8 +51,6 @@ static struct kprobe *kprobe_prev;
 static unsigned long kprobe_status_prev, kprobe_old_rflags_prev, kprobe_saved_rflags_prev;
 static struct pt_regs jprobe_saved_regs;
 static long *jprobe_saved_rsp;
-static kprobe_opcode_t *get_insn_slot(void);
-static void free_insn_slot(kprobe_opcode_t *slot);
 void jprobe_return_end(void);
 
 /* copy of the kernel stack at the probe fire time */
@@ -681,112 +679,3 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 	}
 	return 0;
 }
-
-/*
- * kprobe->ainsn.insn points to the copy of the instruction to be single-stepped.
- * By default on x86_64, pages we get from kmalloc or vmalloc are not
- * executable.  Single-stepping an instruction on such a page yields an
- * oops.  So instead of storing the instruction copies in their respective
- * kprobe objects, we allocate a page, map it executable, and store all the
- * instruction copies there.  (We can allocate additional pages if somebody
- * inserts a huge number of probes.)  Each page can hold up to INSNS_PER_PAGE
- * instruction slots, each of which is MAX_INSN_SIZE*sizeof(kprobe_opcode_t)
- * bytes.
- */
-#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE*sizeof(kprobe_opcode_t)))
-struct kprobe_insn_page {
-	struct hlist_node hlist;
-	kprobe_opcode_t *insns;		/* page of instruction slots */
-	char slot_used[INSNS_PER_PAGE];
-	int nused;
-};
-
-static struct hlist_head kprobe_insn_pages;
-
-/**
- * get_insn_slot() - Find a slot on an executable page for an instruction.
- * We allocate an executable page if there's no room on existing ones.
- */
-static kprobe_opcode_t *get_insn_slot(void)
-{
-	struct kprobe_insn_page *kip;
-	struct hlist_node *pos;
-
-	hlist_for_each(pos, &kprobe_insn_pages) {
-		kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
-		if (kip->nused < INSNS_PER_PAGE) {
-			int i;
-			for (i = 0; i < INSNS_PER_PAGE; i++) {
-				if (!kip->slot_used[i]) {
-					kip->slot_used[i] = 1;
-					kip->nused++;
-					return kip->insns + (i*MAX_INSN_SIZE);
-				}
-			}
-			/* Surprise!  No unused slots.  Fix kip->nused. */
-			kip->nused = INSNS_PER_PAGE;
-		}
-	}
-
-	/* All out of space.  Need to allocate a new page. Use slot 0.*/
-	kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
-	if (!kip) {
-		return NULL;
-	}
-
-	/*
-	 * For the %rip-relative displacement fixups to be doable, we
-	 * need our instruction copy to be within +/- 2GB of any data it
-	 * might access via %rip.  That is, within 2GB of where the
-	 * kernel image and loaded module images reside.  So we allocate
-	 * a page in the module loading area.
-	 */
-	kip->insns = module_alloc(PAGE_SIZE);
-	if (!kip->insns) {
-		kfree(kip);
-		return NULL;
-	}
-	INIT_HLIST_NODE(&kip->hlist);
-	hlist_add_head(&kip->hlist, &kprobe_insn_pages);
-	memset(kip->slot_used, 0, INSNS_PER_PAGE);
-	kip->slot_used[0] = 1;
-	kip->nused = 1;
-	return kip->insns;
-}
-
-/**
- * free_insn_slot() - Free instruction slot obtained from get_insn_slot().
- */
-static void free_insn_slot(kprobe_opcode_t *slot)
-{
-	struct kprobe_insn_page *kip;
-	struct hlist_node *pos;
-
-	hlist_for_each(pos, &kprobe_insn_pages) {
-		kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
-		if (kip->insns <= slot
-		    && slot < kip->insns+(INSNS_PER_PAGE*MAX_INSN_SIZE)) {
-			int i = (slot - kip->insns) / MAX_INSN_SIZE;
-			kip->slot_used[i] = 0;
-			kip->nused--;
-			if (kip->nused == 0) {
-				/*
-				 * Page is no longer in use.  Free it unless
-				 * it's the last one.  We keep the last one
-				 * so as not to have to set it up again the
-				 * next time somebody inserts a probe.
-				 */
-				hlist_del(&kip->hlist);
-				if (hlist_empty(&kprobe_insn_pages)) {
-					INIT_HLIST_NODE(&kip->hlist);
-					hlist_add_head(&kip->hlist,
-						&kprobe_insn_pages);
-				} else {
-					module_free(NULL, kip->insns);
-					kfree(kip);
-				}
-			}
-			return;
-		}
-	}
-}
diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h
index 7b700035e36d..25d8b1edfcba 100644
--- a/include/asm-ia64/kprobes.h
+++ b/include/asm-ia64/kprobes.h
@@ -28,6 +28,7 @@
 #include <linux/ptrace.h>
 #include <asm/break.h>
 
+#define MAX_INSN_SIZE   16
 #define BREAK_INST	(long)(__IA64_BREAK_KPROBE << 6)
 
 typedef union cmp_inst {
diff --git a/include/asm-ppc64/kprobes.h b/include/asm-ppc64/kprobes.h
index 19b468bed059..790cf7c52774 100644
--- a/include/asm-ppc64/kprobes.h
+++ b/include/asm-ppc64/kprobes.h
@@ -45,7 +45,7 @@ typedef unsigned int kprobe_opcode_t;
 /* Architecture specific copy of original instruction */
 struct arch_specific_insn {
 	/* copy of original instruction */
-	kprobe_opcode_t insn[MAX_INSN_SIZE];
+	kprobe_opcode_t *insn;
 };
 
 #ifdef CONFIG_KPROBES
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 5e1a7b0d7b3f..d304d4579856 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -177,6 +177,8 @@ extern void arch_arm_kprobe(struct kprobe *p);
 extern void arch_disarm_kprobe(struct kprobe *p);
 extern void arch_remove_kprobe(struct kprobe *p);
 extern void show_registers(struct pt_regs *regs);
+extern kprobe_opcode_t *get_insn_slot(void);
+extern void free_insn_slot(kprobe_opcode_t *slot);
 
 /* Get the kprobe at this addr (if any).  Must have called lock_kprobes */
 struct kprobe *get_kprobe(void *addr);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 334f37472c56..65242529a75f 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -36,6 +36,7 @@
 #include <linux/hash.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/moduleloader.h>
 #include <asm/cacheflush.h>
 #include <asm/errno.h>
 #include <asm/kdebug.h>
@@ -50,6 +51,106 @@ unsigned int kprobe_cpu = NR_CPUS;
 static DEFINE_SPINLOCK(kprobe_lock);
 static struct kprobe *curr_kprobe;
 
+/*
+ * kprobe->ainsn.insn points to the copy of the instruction to be
+ * single-stepped. x86_64, POWER4 and above have no-exec support and
+ * stepping on the instruction on a vmalloced/kmalloced/data page
+ * is a recipe for disaster
+ */
+#define INSNS_PER_PAGE	(PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
+
+struct kprobe_insn_page {
+	struct hlist_node hlist;
+	kprobe_opcode_t *insns;		/* Page of instruction slots */
+	char slot_used[INSNS_PER_PAGE];
+	int nused;
+};
+
+static struct hlist_head kprobe_insn_pages;
+
+/**
+ * get_insn_slot() - Find a slot on an executable page for an instruction.
+ * We allocate an executable page if there's no room on existing ones.
+ */
+kprobe_opcode_t *get_insn_slot(void)
+{
+	struct kprobe_insn_page *kip;
+	struct hlist_node *pos;
+
+	hlist_for_each(pos, &kprobe_insn_pages) {
+		kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+		if (kip->nused < INSNS_PER_PAGE) {
+			int i;
+			for (i = 0; i < INSNS_PER_PAGE; i++) {
+				if (!kip->slot_used[i]) {
+					kip->slot_used[i] = 1;
+					kip->nused++;
+					return kip->insns + (i * MAX_INSN_SIZE);
+				}
+			}
+			/* Surprise!  No unused slots.  Fix kip->nused. */
+			kip->nused = INSNS_PER_PAGE;
+		}
+	}
+
+	/* All out of space.  Need to allocate a new page. Use slot 0.*/
+	kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
+	if (!kip) {
+		return NULL;
+	}
+
+	/*
+	 * Use module_alloc so this page is within +/- 2GB of where the
+	 * kernel image and loaded module images reside. This is required
+	 * so x86_64 can correctly handle the %rip-relative fixups.
+	 */
+	kip->insns = module_alloc(PAGE_SIZE);
+	if (!kip->insns) {
+		kfree(kip);
+		return NULL;
+	}
+	INIT_HLIST_NODE(&kip->hlist);
+	hlist_add_head(&kip->hlist, &kprobe_insn_pages);
+	memset(kip->slot_used, 0, INSNS_PER_PAGE);
+	kip->slot_used[0] = 1;
+	kip->nused = 1;
+	return kip->insns;
+}
+
+void free_insn_slot(kprobe_opcode_t *slot)
+{
+	struct kprobe_insn_page *kip;
+	struct hlist_node *pos;
+
+	hlist_for_each(pos, &kprobe_insn_pages) {
+		kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+		if (kip->insns <= slot &&
+		    slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
+			int i = (slot - kip->insns) / MAX_INSN_SIZE;
+			kip->slot_used[i] = 0;
+			kip->nused--;
+			if (kip->nused == 0) {
+				/*
+				 * Page is no longer in use.  Free it unless
+				 * it's the last one.  We keep the last one
+				 * so as not to have to set it up again the
+				 * next time somebody inserts a probe.
+				 */
+				hlist_del(&kip->hlist);
+				if (hlist_empty(&kprobe_insn_pages)) {
+					INIT_HLIST_NODE(&kip->hlist);
+					hlist_add_head(&kip->hlist,
+						&kprobe_insn_pages);
+				} else {
+					module_free(NULL, kip->insns);
+					kfree(kip);
+				}
+			}
+			return;
+		}
+	}
+}
+
 /* Locks kprobe: irqs must be disabled */
 void lock_kprobes(void)
 {
-- 
cgit v1.2.3-59-g8ed1b


From 802eae7c800fb7f583e6c06afa363585af2bef00 Mon Sep 17 00:00:00 2001
From: Rusty Lynch <rusty.lynch@intel.com>
Date: Mon, 27 Jun 2005 15:17:08 -0700
Subject: [PATCH] Return probe redesign: architecture independent changes

The following is the second version of the function return probe patches
I sent out earlier this week.  Changes since my last submission include:

* Fix in ppc64 code removing an unneeded call to re-enable preemption
* Fix a build problem in ia64 when kprobes was turned off
* Added another BUG_ON check to each of the architecture trampoline
  handlers

My initial patch description ==>

 From my experiences with adding return probes to x86_64 and ia64, and the
feedback on LKML to those patches, I think we can simplify the design
for return probes.

The following patch tweaks the original design such that:

* Instead of storing the stack address in the return probe instance, the
  task pointer is stored.  This gives us all we need in order to:
    - find the correct return probe instance when we enter the trampoline
      (even if we are recursing)
    - find all left-over return probe instances when the task is going away

  This has the side effect of simplifying the implementation since more
  work can be done in kernel/kprobes.c since architecture specific knowledge
  of the stack layout is no longer required.  Specifically, we no longer have:
	- arch_get_kprobe_task()
	- arch_kprobe_flush_task()
	- get_rp_inst_tsk()
	- get_rp_inst()
	- trampoline_post_handler() <see next bullet>

* Instead of splitting the return probe handling and cleanup logic across
  the pre and post trampoline handlers, all the work is pushed into the
  pre function (trampoline_probe_handler), and then we skip single stepping
  the original function.  In this case the original instruction to be single
  stepped was just a NOP, and we can do without the extra interruption.

The new flow of events to having a return probe handler execute when a target
function exits is:

* At system initialization time, a kprobe is inserted at the beginning of
  kretprobe_trampoline.  kernel/kprobes.c used to handle this on its own,
  but ia64 needed to do this a little differently (i.e. a function pointer
  is really a pointer to a structure containing the instruction pointer and
  a global pointer), so I added the notion of arch_init(), so that
  kernel/kprobes.c:init_kprobes() now allows architecture specific
  initialization by calling arch_init() before exiting.  Each architecture
  now registers a kprobe on its own trampoline function.

* register_kretprobe() will insert a kprobe at the beginning of the targeted
  function with the kprobe pre_handler set to arch_prepare_kretprobe
  (still no change)

* When the target function is entered, the kprobe is fired, calling
  arch_prepare_kretprobe (still no change)

* In arch_prepare_kretprobe() we try to get a free instance and if one is
  available then we fill out the instance with a pointer to the return probe,
  the original return address, and a pointer to the task structure (instead
  of the stack address.)  Just like before we change the return address
  to the trampoline function and mark the instance as used.

  If multiple return probes are registered for a given target function,
  then arch_prepare_kretprobe() will get called multiple times for the same
  task (since our kprobe implementation is able to handle multiple kprobes
  at the same address.)  Past the first call to arch_prepare_kretprobe,
  we end up with the original address stored in the return probe instance
  pointing to our trampoline function. (This is a significant difference
  from the original arch_prepare_kretprobe design.)

* Target function executes like normal and then returns to kretprobe_trampoline.

* kprobe inserted on the first instruction of kretprobe_trampoline is fired
  and calls trampoline_probe_handler() (no change here)

* trampoline_probe_handler() consumes each of the instances associated with
  the current task by calling the registered handler function and marking
  the instance as unused until an instance is found that has a return address
  different than the trampoline function.

  (change similar to my previous ia64 RFC)

* If the task is killed with some left-over return probe instances (meaning
  that a target function was entered, but never returned), then we just
  free any instances associated with the task.  (Not much different, other
  than that we can handle this without calling architecture specific functions.)

  There is a known problem that this patch does not yet solve where
  registering a return probe on flush_old_exec or flush_thread will put us
  in a bad state.  Most likely the best way to handle this is to not allow
  registering return probes on these two functions.

  (Significant change)

This patch series applies to the 2.6.12-rc6-mm1 kernel, and provides:
  * kernel/kprobes.c changes
  * i386 patch of existing return probes implementation
  * x86_64 patch of existing return probe implementation
  * ia64 implementation
  * ppc64 implementation (provided by Ananth)

This patch implements the architecture independent changes for a reworking
of the kprobes based function return probes design. Changes include:

  * Removing functions for querying a return probe instance off a stack address
  * Removing the stack_addr field from the kretprobe_instance definition,
    and adding a task pointer
  * Adding architecture specific initialization via arch_init()
  * Removing extern definitions for the architecture trampoline functions
    (this isn't needed anymore since the architecture handles the
     initialization of the kprobe in the return probe trampoline function.)

Signed-off-by: Rusty Lynch <rusty.lynch@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kprobes.h | 28 +++-----------------
 kernel/kprobes.c        | 69 ++++++++++++++-----------------------------------
 2 files changed, 22 insertions(+), 75 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index d304d4579856..b7a194c4362a 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -104,33 +104,12 @@ struct jprobe {
 };
 
 #ifdef ARCH_SUPPORTS_KRETPROBES
-extern int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs);
-extern void trampoline_post_handler(struct kprobe *p, struct pt_regs *regs,
-							unsigned long flags);
-extern struct task_struct *arch_get_kprobe_task(void *ptr);
 extern void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs);
-extern void arch_kprobe_flush_task(struct task_struct *tk);
 #else /* ARCH_SUPPORTS_KRETPROBES */
-static inline void kretprobe_trampoline(void)
-{
-}
-static inline int trampoline_probe_handler(struct kprobe *p,
-						struct pt_regs *regs)
-{
-	return 0;
-}
-static inline void trampoline_post_handler(struct kprobe *p,
-				struct pt_regs *regs, unsigned long flags)
-{
-}
 static inline void arch_prepare_kretprobe(struct kretprobe *rp,
 					struct pt_regs *regs)
 {
 }
-static inline void arch_kprobe_flush_task(struct task_struct *tk)
-{
-}
-#define arch_get_kprobe_task(ptr) ((struct task_struct *)NULL)
 #endif /* ARCH_SUPPORTS_KRETPROBES */
 /*
  * Function-return probe -
@@ -155,8 +134,8 @@ struct kretprobe_instance {
 	struct hlist_node uflist; /* either on free list or used list */
 	struct hlist_node hlist;
 	struct kretprobe *rp;
-	void *ret_addr;
-	void *stack_addr;
+	kprobe_opcode_t *ret_addr;
+	struct task_struct *task;
 };
 
 #ifdef CONFIG_KPROBES
@@ -176,6 +155,7 @@ extern void arch_copy_kprobe(struct kprobe *p);
 extern void arch_arm_kprobe(struct kprobe *p);
 extern void arch_disarm_kprobe(struct kprobe *p);
 extern void arch_remove_kprobe(struct kprobe *p);
+extern int arch_init(void);
 extern void show_registers(struct pt_regs *regs);
 extern kprobe_opcode_t *get_insn_slot(void);
 extern void free_insn_slot(kprobe_opcode_t *slot);
@@ -196,8 +176,6 @@ int register_kretprobe(struct kretprobe *rp);
 void unregister_kretprobe(struct kretprobe *rp);
 
 struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp);
-struct kretprobe_instance *get_rp_inst(void *sara);
-struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk);
 void add_rp_inst(struct kretprobe_instance *ri);
 void kprobe_flush_task(struct task_struct *tk);
 void recycle_rp_inst(struct kretprobe_instance *ri);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 65242529a75f..90c0e82b650c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -240,12 +240,6 @@ static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
 	return 0;
 }
 
-struct kprobe trampoline_p = {
-		.addr = (kprobe_opcode_t *) &kretprobe_trampoline,
-		.pre_handler = trampoline_probe_handler,
-		.post_handler = trampoline_post_handler
-};
-
 struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
 {
 	struct hlist_node *node;
@@ -264,35 +258,18 @@ static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
 	return NULL;
 }
 
-struct kretprobe_instance *get_rp_inst(void *sara)
-{
-	struct hlist_head *head;
-	struct hlist_node *node;
-	struct task_struct *tsk;
-	struct kretprobe_instance *ri;
-
-	tsk = arch_get_kprobe_task(sara);
-	head = &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
-	hlist_for_each_entry(ri, node, head, hlist) {
-		if (ri->stack_addr == sara)
-			return ri;
-	}
-	return NULL;
-}
-
 void add_rp_inst(struct kretprobe_instance *ri)
 {
-	struct task_struct *tsk;
 	/*
 	 * Remove rp inst off the free list -
 	 * Add it back when probed function returns
 	 */
 	hlist_del(&ri->uflist);
-	tsk = arch_get_kprobe_task(ri->stack_addr);
+
 	/* Add rp inst onto table */
 	INIT_HLIST_NODE(&ri->hlist);
 	hlist_add_head(&ri->hlist,
-			&kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]);
+			&kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]);
 
 	/* Also add this rp inst to the used list. */
 	INIT_HLIST_NODE(&ri->uflist);
@@ -319,34 +296,25 @@ struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
 	return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
 }
 
-struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk)
-{
-	struct task_struct *tsk;
-	struct hlist_head *head;
-	struct hlist_node *node;
-	struct kretprobe_instance *ri;
-
-	head = &kretprobe_inst_table[hash_ptr(tk, KPROBE_HASH_BITS)];
-
-	hlist_for_each_entry(ri, node, head, hlist) {
-		tsk = arch_get_kprobe_task(ri->stack_addr);
-		if (tsk == tk)
-			return ri;
-	}
-	return NULL;
-}
-
 /*
- * This function is called from do_exit or do_execv when task tk's stack is
- * about to be recycled. Recycle any function-return probe instances
- * associated with this task. These represent probed functions that have
- * been called but may never return.
+ * This function is called from exit_thread or flush_thread when task tk's
+ * stack is being recycled so that we can recycle any function-return probe
+ * instances associated with this task. These left over instances represent
+ * probed functions that have been called but will never return.
  */
 void kprobe_flush_task(struct task_struct *tk)
 {
+        struct kretprobe_instance *ri;
+        struct hlist_head *head;
+	struct hlist_node *node, *tmp;
 	unsigned long flags = 0;
+
 	spin_lock_irqsave(&kprobe_lock, flags);
-	arch_kprobe_flush_task(tk);
+        head = kretprobe_inst_table_head(current);
+        hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+                if (ri->task == tk)
+                        recycle_rp_inst(ri);
+        }
 	spin_unlock_irqrestore(&kprobe_lock, flags);
 }
 
@@ -606,9 +574,10 @@ static int __init init_kprobes(void)
 		INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
 	}
 
-	err = register_die_notifier(&kprobe_exceptions_nb);
-	/* Register the trampoline probe for return probe */
-	register_kprobe(&trampoline_p);
+	err = arch_init();
+	if (!err)
+		err = register_die_notifier(&kprobe_exceptions_nb);
+
 	return err;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From da9091ee3b5f9808c64abb925cefe7b100018614 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Mon, 27 Jun 2005 15:24:30 -0700
Subject: [PATCH] ide: it8212 backport for Bartlomiej IDE

This lets you throw out the iteraid stuff that has ended up back in due
to stupid goings on in the IDE world. It's the same heavily tested code
shipped in Fedora/Red Hat products but without the other dependencies on
the Bartlomiej IDE layer.

Pre-requisite: the ide-disk patch I sent to handle pure LBA devices.

Obviously you lose things like hot unplug with the Bartlomiej IDE layer
at the moment but that won't matter to most users.

The patch does the following
- Add IT8211/12 to pci_ids.h
- Add Makefile/Kconfig entry
- Add it8212 driver

No core IDE code is touched by this diff

Embedded system testing and the ability to force raid mode off by David
Howells

Made possible by the ite reference code, documentation and also several
clarifications and pieces of assistance provided by ITE themselves

Signed-off-by: Alan Cox <alan@redhat.com>
Acked-by: Bartlomiej Zolnierkiewicz <B.Zolnierkiewicz@elka.pw.edu.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/ide/Kconfig      |   6 +
 drivers/ide/pci/Makefile |   1 +
 drivers/ide/pci/it821x.c | 812 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pci_ids.h  |   2 +
 4 files changed, 821 insertions(+)
 create mode 100644 drivers/ide/pci/it821x.c

(limited to 'include/linux')

diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig
index 0273f124a4f7..5f33df47aa74 100644
--- a/drivers/ide/Kconfig
+++ b/drivers/ide/Kconfig
@@ -606,6 +606,12 @@ config BLK_DEV_IT8172
 	  <http://www.ite.com.tw/ia/brief_it8172bsp.htm>; picture of the
 	  board at <http://www.mvista.com/partners/semiconductor/ite.html>.
 
+config BLK_DEV_IT821X
+	tristate "IT821X IDE support"
+	help
+	  This driver adds support for the ITE 8211 IDE controller and the
+	  IT 8212 IDE RAID controller in both RAID and pass-through mode.
+
 config BLK_DEV_NS87415
 	tristate "NS87415 chipset support"
 	help
diff --git a/drivers/ide/pci/Makefile b/drivers/ide/pci/Makefile
index 55e6e553e497..af46226c1796 100644
--- a/drivers/ide/pci/Makefile
+++ b/drivers/ide/pci/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_BLK_DEV_HPT34X)		+= hpt34x.o
 obj-$(CONFIG_BLK_DEV_HPT366)		+= hpt366.o
 #obj-$(CONFIG_BLK_DEV_HPT37X)		+= hpt37x.o
 obj-$(CONFIG_BLK_DEV_IT8172)		+= it8172.o
+obj-$(CONFIG_BLK_DEV_IT821X)		+= it821x.o
 obj-$(CONFIG_BLK_DEV_NS87415)		+= ns87415.o
 obj-$(CONFIG_BLK_DEV_OPTI621)		+= opti621.o
 obj-$(CONFIG_BLK_DEV_PDC202XX_OLD)	+= pdc202xx_old.o
diff --git a/drivers/ide/pci/it821x.c b/drivers/ide/pci/it821x.c
new file mode 100644
index 000000000000..e440036e651f
--- /dev/null
+++ b/drivers/ide/pci/it821x.c
@@ -0,0 +1,812 @@
+
+/*
+ * linux/drivers/ide/pci/it821x.c		Version 0.09	December 2004
+ *
+ * Copyright (C) 2004		Red Hat <alan@redhat.com>
+ *
+ *  May be copied or modified under the terms of the GNU General Public License
+ *  Based in part on the ITE vendor provided SCSI driver.
+ *
+ *  Documentation available from
+ * 	http://www.ite.com.tw/pc/IT8212F_V04.pdf
+ *  Some other documents are NDA.
+ *
+ *  The ITE8212 isn't exactly a standard IDE controller. It has two
+ *  modes. In pass through mode it is an IDE controller. In its smart
+ *  mode it's actually quite a capable hardware raid controller disguised
+ *  as an IDE controller. Smart mode only understands DMA read/write and
+ *  identify, none of the fancier commands apply. The IT8211 is identical
+ *  in other respects but lacks the raid mode.
+ *
+ *  Errata:
+ *  o	Rev 0x10 also requires master/slave hold the same DMA timings and
+ *	cannot do ATAPI MWDMA.
+ *  o	The identify data for raid volumes lacks CHS info (technically ok)
+ *	but also fails to set the LBA28 and other bits. We fix these in
+ *	the IDE probe quirk code.
+ *  o	If you write LBA48 sized I/O's (ie > 256 sector) in smart mode
+ *	raid then the controller firmware dies
+ *  o	Smart mode without RAID doesn't clear all the necessary identify
+ *	bits to reduce the command set to the one used
+ *
+ *  This has a few impacts on the driver
+ *  - In pass through mode we do all the work you would expect
+ *  - In smart mode the clocking set up is done by the controller generally
+ *    but we must watch the other limits and filter.
+ *  - There are a few extra vendor commands that actually talk to the
+ *    controller but only work PIO with no IRQ.
+ *
+ *  Vendor areas of the identify block in smart mode are used for the
+ *  timing and policy set up. Each HDD in raid mode also has a serial
+ *  block on the disk. The hardware extra commands are get/set chip status,
+ *  rebuild, get rebuild status.
+ *
+ *  In Linux the driver supports pass through mode as if the device was
+ *  just another IDE controller. If the smart mode is running then
+ *  volumes are managed by the controller firmware and each IDE "disk"
+ *  is a raid volume. Even more cute - the controller can do automated
+ *  hotplug and rebuild.
+ *
+ *  The pass through controller itself is a little demented. It has a
+ *  flaw that it has a single set of PIO/MWDMA timings per channel so
+ *  non UDMA devices restrict each others performance. It also has a
+ *  single clock source per channel so mixed UDMA100/133 performance
+ *  isn't perfect and we have to pick a clock. Thankfully none of this
+ *  matters in smart mode. ATAPI DMA is not currently supported.
+ *
+ *  It seems the smart mode is a win for RAID1/RAID10 but otherwise not.
+ *
+ *  TODO
+ *	-	ATAPI UDMA is ok but not MWDMA it seems
+ *	-	RAID configuration ioctls
+ *	-	Move to libata once it grows up
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/hdreg.h>
+#include <linux/ide.h>
+#include <linux/init.h>
+
+#include <asm/io.h>
+
+struct it821x_dev
+{
+	unsigned int smart:1,		/* Are we in smart raid mode */
+		timing10:1;		/* Rev 0x10 */
+	u8	clock_mode;		/* 0, ATA_50 or ATA_66 */
+	u8	want[2][2];		/* Mode/Pri log for master slave */
+	/* We need these for switching the clock when DMA goes on/off
+	   The high byte is the 66Mhz timing */
+	u16	pio[2];			/* Cached PIO values */
+	u16	mwdma[2];		/* Cached MWDMA values */
+	u16	udma[2];		/* Cached UDMA values (per drive) */
+};
+
+#define ATA_66		0
+#define ATA_50		1
+#define ATA_ANY		2
+
+#define UDMA_OFF	0
+#define MWDMA_OFF	0
+
+/*
+ *	We allow users to force the card into non raid mode without
+ *	flashing the alternative BIOS. This is also necessary right now
+ *	for embedded platforms that cannot run a PC BIOS but are using this
+ *	device.
+ */
+
+static int it8212_noraid;
+
+/**
+ *	it821x_program	-	program the PIO/MWDMA registers
+ *	@drive: drive to tune
+ *
+ *	Program the PIO/MWDMA timing for this channel according to the
+ *	current clock.
+ */
+
+static void it821x_program(ide_drive_t *drive, u16 timing)
+{
+	ide_hwif_t *hwif	= drive->hwif;
+	struct it821x_dev *itdev = ide_get_hwifdata(hwif);
+	int channel = hwif->channel;
+	u8 conf;
+
+	/* Program PIO/MWDMA timing bits */
+	if(itdev->clock_mode == ATA_66)
+		conf = timing >> 8;
+	else
+		conf = timing & 0xFF;
+	pci_write_config_byte(hwif->pci_dev, 0x54 + 4 * channel, conf);
+}
+
+/**
+ *	it821x_program_udma	-	program the UDMA registers
+ *	@drive: drive to tune
+ *
+ *	Program the UDMA timing for this drive according to the
+ *	current clock.
+ */
+
+static void it821x_program_udma(ide_drive_t *drive, u16 timing)
+{
+	ide_hwif_t *hwif	= drive->hwif;
+	struct it821x_dev *itdev = ide_get_hwifdata(hwif);
+	int channel = hwif->channel;
+	int unit = drive->select.b.unit;
+	u8 conf;
+
+	/* Program UDMA timing bits */
+	if(itdev->clock_mode == ATA_66)
+		conf = timing >> 8;
+	else
+		conf = timing & 0xFF;
+	if(itdev->timing10 == 0)
+		pci_write_config_byte(hwif->pci_dev, 0x56 + 4 * channel + unit, conf);
+	else {
+		pci_write_config_byte(hwif->pci_dev, 0x56 + 4 * channel, conf);
+		pci_write_config_byte(hwif->pci_dev, 0x56 + 4 * channel + 1, conf);
+	}
+}
+
+
+/**
+ *	it821x_clock_strategy
+ *	@drive: drive whose interface clock is being selected
+ *
+ *	Select between the 50 and 66Mhz base clocks to get the best
+ *	results for this interface.
+ */
+
+static void it821x_clock_strategy(ide_drive_t *drive)
+{
+	ide_hwif_t *hwif = drive->hwif;
+	struct it821x_dev *itdev = ide_get_hwifdata(hwif);
+
+	u8 unit = drive->select.b.unit;
+	ide_drive_t *pair = &hwif->drives[1-unit];
+
+	int clock, altclock;
+	u8 v;
+	int sel = 0;
+
+	if(itdev->want[0][0] > itdev->want[1][0]) {
+		clock = itdev->want[0][1];
+		altclock = itdev->want[1][1];
+	} else {
+		clock = itdev->want[1][1];
+		altclock = itdev->want[0][1];
+	}
+
+	/* Master doesn't care does the slave ? */
+	if(clock == ATA_ANY)
+		clock = altclock;
+
+	/* Nobody cares - keep the same clock */
+	if(clock == ATA_ANY)
+		return;
+	/* No change */
+	if(clock == itdev->clock_mode)
+		return;
+
+	/* Load this into the controller ? */
+	if(clock == ATA_66)
+		itdev->clock_mode = ATA_66;
+	else {
+		itdev->clock_mode = ATA_50;
+		sel = 1;
+	}
+	pci_read_config_byte(hwif->pci_dev, 0x50, &v);
+	v &= ~(1 << (1 + hwif->channel));
+	v |= sel << (1 + hwif->channel);
+	pci_write_config_byte(hwif->pci_dev, 0x50, v);
+
+	/*
+	 *	Reprogram the UDMA/PIO of the pair drive for the switch
+	 *	MWDMA will be dealt with by the dma switcher
+	 */
+	if(pair && itdev->udma[1-unit] != UDMA_OFF) {
+		it821x_program_udma(pair, itdev->udma[1-unit]);
+		it821x_program(pair, itdev->pio[1-unit]);
+	}
+	/*
+	 *	Reprogram the UDMA/PIO of our drive for the switch.
+	 *	MWDMA will be dealt with by the dma switcher
+	 */
+	if(itdev->udma[unit] != UDMA_OFF) {
+		it821x_program_udma(drive, itdev->udma[unit]);
+		it821x_program(drive, itdev->pio[unit]);
+	}
+}
+
+/**
+ *	it821x_ratemask	-	Compute available modes
+ *	@drive: IDE drive
+ *
+ *	Compute the available speeds for the devices on the interface. This
+ *	is all modes to ATA133 clipped by drive cable setup.
+ */
+
+static u8 it821x_ratemask (ide_drive_t *drive)
+{
+	u8 mode	= 4;
+	if (!eighty_ninty_three(drive))
+		mode = min(mode, (u8)1);
+	return mode;
+}
+
+/**
+ *	it821x_tuneproc	-	tune a drive
+ *	@drive: drive to tune
+ *	@mode_wanted: the target operating mode
+ *
+ *	Load the timing settings for this device mode into the
+ *	controller. By the time we are called the mode has been
+ *	modified as necessary to handle the absence of separate
+ *	master/slave timers for MWDMA/PIO.
+ *
+ *	This code is only used in pass through mode.
+ */
+
+static void it821x_tuneproc (ide_drive_t *drive, byte mode_wanted)
+{
+	ide_hwif_t *hwif	= drive->hwif;
+	struct it821x_dev *itdev = ide_get_hwifdata(hwif);
+	int unit = drive->select.b.unit;
+
+	/* Spec says 89 ref driver uses 88 */
+	static u16 pio[]	= { 0xAA88, 0xA382, 0xA181, 0x3332, 0x3121 };
+	static u8 pio_want[]    = { ATA_66, ATA_66, ATA_66, ATA_66, ATA_ANY };
+
+	if(itdev->smart)
+		return;
+
+	/* We prefer 66Mhz clock for PIO 0-3, don't care for PIO4 */
+	itdev->want[unit][1] = pio_want[mode_wanted];
+	itdev->want[unit][0] = 1;	/* PIO is lowest priority */
+	itdev->pio[unit] = pio[mode_wanted];
+	it821x_clock_strategy(drive);
+	it821x_program(drive, itdev->pio[unit]);
+}
+
+/**
+ *	it821x_tune_mwdma	-	tune a channel for MWDMA
+ *	@drive: drive to set up
+ *	@mode_wanted: the target operating mode
+ *
+ *	Load the timing settings for this device mode into the
+ *	controller when doing MWDMA in pass through mode. The caller
+ *	must manage the whole lack of per device MWDMA/PIO timings and
+ *	the shared MWDMA/PIO timing register.
+ */
+
+static void it821x_tune_mwdma (ide_drive_t *drive, byte mode_wanted)
+{
+	ide_hwif_t *hwif	= drive->hwif;
+	struct it821x_dev *itdev = (void *)ide_get_hwifdata(hwif);
+	int unit = drive->select.b.unit;
+	int channel = hwif->channel;
+	u8 conf;
+
+	static u16 dma[]	= { 0x8866, 0x3222, 0x3121 };
+	static u8 mwdma_want[]	= { ATA_ANY, ATA_66, ATA_ANY };
+
+	itdev->want[unit][1] = mwdma_want[mode_wanted];
+	itdev->want[unit][0] = 2;	/* MWDMA is low priority */
+	itdev->mwdma[unit] = dma[mode_wanted];
+	itdev->udma[unit] = UDMA_OFF;
+
+	/* UDMA bits off - Revision 0x10 do them in pairs */
+	pci_read_config_byte(hwif->pci_dev, 0x50, &conf);
+	if(itdev->timing10)
+		conf |= channel ? 0x60: 0x18;
+	else
+		conf |= 1 << (3 + 2 * channel + unit);
+	pci_write_config_byte(hwif->pci_dev, 0x50, conf);
+
+	it821x_clock_strategy(drive);
+	/* FIXME: do we need to program this ? */
+	/* it821x_program(drive, itdev->mwdma[unit]); */
+}
+
+/**
+ *	it821x_tune_udma	-	tune a channel for UDMA
+ *	@drive: drive to set up
+ *	@mode_wanted: the target operating mode
+ *
+ *	Load the timing settings for this device mode into the
+ *	controller when doing UDMA modes in pass through.
+ */
+
+static void it821x_tune_udma (ide_drive_t *drive, byte mode_wanted)
+{
+	ide_hwif_t *hwif	= drive->hwif;
+	struct it821x_dev *itdev = ide_get_hwifdata(hwif);
+	int unit = drive->select.b.unit;
+	int channel = hwif->channel;
+	u8 conf;
+
+	static u16 udma[]	= { 0x4433, 0x4231, 0x3121, 0x2121, 0x1111, 0x2211, 0x1111 };
+	static u8 udma_want[]	= { ATA_ANY, ATA_50, ATA_ANY, ATA_66, ATA_66, ATA_50, ATA_66 };
+
+	itdev->want[unit][1] = udma_want[mode_wanted];
+	itdev->want[unit][0] = 3;	/* UDMA is high priority */
+	itdev->mwdma[unit] = MWDMA_OFF;
+	itdev->udma[unit] = udma[mode_wanted];
+	if(mode_wanted >= 5)
+		itdev->udma[unit] |= 0x8080;	/* UDMA 5/6 select on */
+
+	/* UDMA on. Again revision 0x10 must do the pair */
+	pci_read_config_byte(hwif->pci_dev, 0x50, &conf);
+	if(itdev->timing10)
+		conf &= channel ? 0x9F: 0xE7;
+	else
+		conf &= ~ (1 << (3 + 2 * channel + unit));
+	pci_write_config_byte(hwif->pci_dev, 0x50, conf);
+
+	it821x_clock_strategy(drive);
+	it821x_program_udma(drive, itdev->udma[unit]);
+
+}
+
+/**
+ *	config_it821x_chipset_for_pio	-	set drive timings
+ *	@drive: drive to tune
+ *	@set_speed: non-zero to also set the chosen PIO speed on the drive
+ *
+ *	Compute the best pio mode we can for a given device. We must
+ *	pick a speed that does not cause problems with the other device
+ *	on the cable.
+ */
+
+static void config_it821x_chipset_for_pio (ide_drive_t *drive, byte set_speed)
+{
+	u8 unit = drive->select.b.unit;
+	ide_hwif_t *hwif = drive->hwif;
+	ide_drive_t *pair = &hwif->drives[1-unit];
+	u8 speed = 0, set_pio	= ide_get_best_pio_mode(drive, 255, 5, NULL);
+	u8 pair_pio;
+
+	/* We have to deal with this mess in pairs */
+	if(pair != NULL) {
+		pair_pio = ide_get_best_pio_mode(pair, 255, 5, NULL);
+		/* Trim PIO to the slowest of the master/slave */
+		if(pair_pio < set_pio)
+			set_pio = pair_pio;
+	}
+	it821x_tuneproc(drive, set_pio);
+	speed = XFER_PIO_0 + set_pio;
+	/* XXX - We trim to the lowest of the pair so the other drive
+	   will always be fine at this point until we do hotplug passthru */
+
+	if (set_speed)
+		(void) ide_config_drive_speed(drive, speed);
+}
+
+/**
+ *	it821x_dma_start	-	DMA start hook
+ *	@drive: drive for DMA
+ *
+ *	The IT821x has a single timing register for MWDMA and for PIO
+ *	operations. As we flip back and forth we have to reload the
+ *	clock. In addition the rev 0x10 device only works if the same
+ *	timing value is loaded into the master and slave UDMA clock
+ * 	so we must also reload that.
+ *
+ *	FIXME: we could figure out in advance if we need to do reloads
+ */
+
+static void it821x_dma_start(ide_drive_t *drive)
+{
+	ide_hwif_t *hwif = drive->hwif;
+	struct it821x_dev *itdev = ide_get_hwifdata(hwif);
+	int unit = drive->select.b.unit;
+	if(itdev->mwdma[unit] != MWDMA_OFF)
+		it821x_program(drive, itdev->mwdma[unit]);
+	else if(itdev->udma[unit] != UDMA_OFF && itdev->timing10)
+		it821x_program_udma(drive, itdev->udma[unit]);
+	ide_dma_start(drive);
+}
+
+/**
+ *	it821x_dma_end	-	DMA end hook
+ *	@drive: drive for DMA stop
+ *
+ *	The IT821x has a single timing register for MWDMA and for PIO
+ *	operations. As we flip back and forth we have to reload the
+ *	clock.
+ */
+
+static int it821x_dma_end(ide_drive_t *drive)
+{
+	ide_hwif_t *hwif = drive->hwif;
+	int unit = drive->select.b.unit;
+	struct it821x_dev *itdev = ide_get_hwifdata(hwif);
+	int ret = __ide_dma_end(drive);
+	if(itdev->mwdma[unit] != MWDMA_OFF)
+		it821x_program(drive, itdev->pio[unit]);
+	return ret;
+}
+
+
+/**
+ *	it821x_tune_chipset	-	set controller timings
+ *	@drive: Drive to set up
+ *	@xferspeed: speed we want to achieve
+ *
+ *	Tune the ITE chipset for the desired mode. If we can't achieve
+ *	the desired mode then tune for a lower one, but ultimately
+ *	make the thing work.
+ */
+
+static int it821x_tune_chipset (ide_drive_t *drive, byte xferspeed)
+{
+
+	ide_hwif_t *hwif	= drive->hwif;
+	struct it821x_dev *itdev = ide_get_hwifdata(hwif);
+	u8 speed		= ide_rate_filter(it821x_ratemask(drive), xferspeed);
+
+	if(!itdev->smart) {
+		switch(speed) {
+			case XFER_PIO_4:
+			case XFER_PIO_3:
+			case XFER_PIO_2:
+			case XFER_PIO_1:
+			case XFER_PIO_0:
+				it821x_tuneproc(drive, (speed - XFER_PIO_0));
+				break;
+			/* MWDMA tuning is really hard because our MWDMA and PIO
+			   timings are kept in the same place. We can switch in the
+			   host dma on/off callbacks */
+			case XFER_MW_DMA_2:
+			case XFER_MW_DMA_1:
+			case XFER_MW_DMA_0:
+				it821x_tune_mwdma(drive, (speed - XFER_MW_DMA_0));
+				break;
+			case XFER_UDMA_6:
+			case XFER_UDMA_5:
+			case XFER_UDMA_4:
+			case XFER_UDMA_3:
+			case XFER_UDMA_2:
+			case XFER_UDMA_1:
+			case XFER_UDMA_0:
+				it821x_tune_udma(drive, (speed - XFER_UDMA_0));
+				break;
+			default:
+				return 1;
+		}
+	}
+	/*
+	 *	In smart mode the clocking is done by the host controller
+	 * 	snooping the mode we picked. The rest of it is not our problem
+	 */
+	return ide_config_drive_speed(drive, speed);
+}
+
+/**
+ *	config_chipset_for_dma	-	configure for DMA
+ *	@drive: drive to configure
+ *
+ *	Called by the IDE layer when it wants the timings set up.
+ */
+
+static int config_chipset_for_dma (ide_drive_t *drive)
+{
+	u8 speed	= ide_dma_speed(drive, it821x_ratemask(drive));
+
+	config_it821x_chipset_for_pio(drive, !speed);
+	it821x_tune_chipset(drive, speed);
+	return ide_dma_enable(drive);
+}
+
+/**
+ *	it821x_config_drive_for_dma	-	set up for DMA transfers
+ *	@drive: drive we are going to set up
+ *
+ *	Set up the drive for DMA, tune the controller and drive as
+ *	required. If the drive isn't suitable for DMA or we hit
+ *	other problems then we will drop down to PIO and set up
+ *	PIO appropriately
+ */
+
+static int it821x_config_drive_for_dma (ide_drive_t *drive)
+{
+	ide_hwif_t *hwif	= drive->hwif;
+
+	if (ide_use_dma(drive)) {
+		if (config_chipset_for_dma(drive))
+			return hwif->ide_dma_on(drive);
+	}
+	config_it821x_chipset_for_pio(drive, 1);
+	return hwif->ide_dma_off_quietly(drive);
+}
+
+/**
+ *	ata66_it821x	-	check for 80 pin cable
+ *	@hwif: interface to check
+ *
+ *	Check for the presence of an ATA66 capable cable on the
+ *	interface. Problematic as it seems some cards don't have
+ *	the needed logic onboard.
+ */
+
+static unsigned int __devinit ata66_it821x(ide_hwif_t *hwif)
+{
+	/* The reference driver also only does disk side */
+	return 1;
+}
+
+/**
+ *	it821x_fixups	-	post init callback
+ *	@hwif: interface
+ *
+ *	This callback is run after the drives have been probed but
+ *	before anything gets attached. It allows drivers to do any
+ *	final tuning that is needed, or fixups to work around bugs.
+ */
+
+static void __devinit it821x_fixups(ide_hwif_t *hwif)
+{
+	struct it821x_dev *itdev = ide_get_hwifdata(hwif);
+	int i;
+
+	if(!itdev->smart) {
+		/*
+		 *	If we are in pass through mode then not much
+		 *	needs to be done, but we do bother to clear the
+		 *	IRQ mask as we may well be in PIO (eg rev 0x10)
+		 *	for now and we know unmasking is safe on this chipset.
+		 */
+		for (i = 0; i < 2; i++) {
+			ide_drive_t *drive = &hwif->drives[i];
+			if(drive->present)
+				drive->unmask = 1;
+		}
+		return;
+	}
+	/*
+	 *	Perform fixups on smart mode. We need to "lose" some
+	 *	capabilities the firmware lacks but does not filter, and
+	 *	also patch up some capability bits that it forgets to set
+	 *	in RAID mode.
+	 */
+
+	for(i = 0; i < 2; i++) {
+		ide_drive_t *drive = &hwif->drives[i];
+		struct hd_driveid *id;
+		u16 *idbits;
+
+		if(!drive->present)
+			continue;
+		id = drive->id;
+		idbits = (u16 *)drive->id;
+
+		/* Check for RAID v native */
+		if(strstr(id->model, "Integrated Technology Express")) {
+			/* In raid mode the ident block is slightly buggy
+			   We need to set the bits so that the IDE layer knows
+			   LBA28. LBA48 and DMA ar valid */
+			id->capability |= 3;		/* LBA28, DMA */
+			id->command_set_2 |= 0x0400;	/* LBA48 valid */
+			id->cfs_enable_2 |= 0x0400;	/* LBA48 on */
+			/* Reporting logic */
+			printk(KERN_INFO "%s: IT8212 %sRAID %d volume",
+				drive->name,
+				idbits[147] ? "Bootable ":"",
+				idbits[129]);
+				if(idbits[129] != 1)
+					printk("(%dK stripe)", idbits[146]);
+				printk(".\n");
+			/* Now the core code will have wrongly decided no DMA
+			   so we need to fix this */
+			hwif->ide_dma_off_quietly(drive);
+#ifdef CONFIG_IDEDMA_ONLYDISK
+			if (drive->media == ide_disk)
+#endif
+				hwif->ide_dma_check(drive);
+		} else {
+			/* Non RAID volume. Fixups to stop the core code
+			   doing unsupported things */
+			id->field_valid &= 1;
+			id->queue_depth = 0;
+			id->command_set_1 = 0;
+			id->command_set_2 &= 0xC400;
+			id->cfsse &= 0xC000;
+			id->cfs_enable_1 = 0;
+			id->cfs_enable_2 &= 0xC400;
+			id->csf_default &= 0xC000;
+			id->word127 = 0;
+			id->dlf = 0;
+			id->csfo = 0;
+			id->cfa_power = 0;
+			printk(KERN_INFO "%s: Performing identify fixups.\n",
+				drive->name);
+		}
+	}
+
+}
+
+/**
+ *	init_hwif_it821x	-	set up hwif structs
+ *	@hwif: interface to set up
+ *
+ *	We do the basic set up of the interface structure. The IT8212
+ *	requires several custom handlers so we override the default
+ *	ide DMA handlers appropriately
+ */
+
+static void __devinit init_hwif_it821x(ide_hwif_t *hwif)
+{
+	struct it821x_dev *idev = kmalloc(sizeof(struct it821x_dev), GFP_KERNEL);
+	u8 conf;
+
+	if(idev == NULL) {
+		printk(KERN_ERR "it821x: out of memory, falling back to legacy behaviour.\n");
+		goto fallback;
+	}
+	memset(idev, 0, sizeof(struct it821x_dev));
+	ide_set_hwifdata(hwif, idev);
+
+	pci_read_config_byte(hwif->pci_dev, 0x50, &conf);
+	if(conf & 1) {
+		idev->smart = 1;
+		hwif->atapi_dma = 0;
+		/* Long I/Os, although allowed in LBA48 space, cause the
+		   onboard firmware to enter the twilight zone */
+		hwif->rqsize = 256;
+	}
+
+	/* Pull the current clocks from 0x50 also */
+	if (conf & (1 << (1 + hwif->channel)))
+		idev->clock_mode = ATA_50;
+	else
+		idev->clock_mode = ATA_66;
+
+	idev->want[0][1] = ATA_ANY;
+	idev->want[1][1] = ATA_ANY;
+
+	/*
+	 *	Not in the docs but according to the reference driver
+	 *	this is necessary.
+	 */
+
+	pci_read_config_byte(hwif->pci_dev, 0x08, &conf);
+	if(conf == 0x10) {
+		idev->timing10 = 1;
+		hwif->atapi_dma = 0;
+		if(!idev->smart)
+			printk(KERN_WARNING "it821x: Revision 0x10, workarounds activated.\n");
+	}
+
+	hwif->speedproc = &it821x_tune_chipset;
+	hwif->tuneproc	= &it821x_tuneproc;
+
+	/* MWDMA/PIO clock switching for pass through mode */
+	if(!idev->smart) {
+		hwif->dma_start = &it821x_dma_start;
+		hwif->ide_dma_end = &it821x_dma_end;
+	}
+
+	hwif->drives[0].autotune = 1;
+	hwif->drives[1].autotune = 1;
+
+	if (!hwif->dma_base)
+		goto fallback;
+
+	hwif->ultra_mask = 0x7f;
+	hwif->mwdma_mask = 0x07;
+	hwif->swdma_mask = 0x07;
+
+	hwif->ide_dma_check = &it821x_config_drive_for_dma;
+	if (!(hwif->udma_four))
+		hwif->udma_four = ata66_it821x(hwif);
+
+	/*
+	 *	The BIOS often doesn't set up DMA on this controller
+	 *	so we always do it.
+	 */
+
+	hwif->autodma = 1;
+	hwif->drives[0].autodma = hwif->autodma;
+	hwif->drives[1].autodma = hwif->autodma;
+	return;
+fallback:
+	hwif->autodma = 0;
+	return;
+}
+
+static void __devinit it8212_disable_raid(struct pci_dev *dev)
+{
+	/* Reset local CPU, and set BIOS not ready */
+	pci_write_config_byte(dev, 0x5E, 0x01);
+
+	/* Set to bypass mode, and reset PCI bus */
+	pci_write_config_byte(dev, 0x50, 0x00);
+	pci_write_config_word(dev, PCI_COMMAND,
+			      PCI_COMMAND_PARITY | PCI_COMMAND_IO |
+			      PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
+	pci_write_config_word(dev, 0x40, 0xA0F3);
+
+	pci_write_config_dword(dev,0x4C, 0x02040204);
+	pci_write_config_byte(dev, 0x42, 0x36);
+	pci_write_config_byte(dev, PCI_LATENCY_TIMER, 0);
+}
+
+static unsigned int __devinit init_chipset_it821x(struct pci_dev *dev, const char *name)
+{
+	u8 conf;
+	static char *mode[2] = { "pass through", "smart" };
+
+	/* Force the card into bypass mode if so requested */
+	if (it8212_noraid) {
+		printk(KERN_INFO "it8212: forcing bypass mode.\n");
+		it8212_disable_raid(dev);
+	}
+	pci_read_config_byte(dev, 0x50, &conf);
+	printk(KERN_INFO "it821x: controller in %s mode.\n", mode[conf & 1]);
+	return 0;
+}
+
+
+#define DECLARE_ITE_DEV(name_str)			\
+	{						\
+		.name		= name_str,		\
+		.init_chipset	= init_chipset_it821x,	\
+		.init_hwif	= init_hwif_it821x,	\
+		.channels	= 2,			\
+		.autodma	= AUTODMA,		\
+		.bootable	= ON_BOARD,		\
+		.fixup	 	= it821x_fixups		\
+	}
+
+static ide_pci_device_t it821x_chipsets[] __devinitdata = {
+	/* 0 */ DECLARE_ITE_DEV("IT8212"),
+};
+
+/**
+ *	it821x_init_one	-	pci layer discovery entry
+ *	@dev: PCI device
+ *	@id: ident table entry
+ *
+ *	Called by the PCI code when it finds an ITE821x controller.
+ *	We then use the IDE PCI generic helper to do most of the work.
+ */
+
+static int __devinit it821x_init_one(struct pci_dev *dev, const struct pci_device_id *id)
+{
+	ide_setup_pci_device(dev, &it821x_chipsets[id->driver_data]);
+	return 0;
+}
+
+static struct pci_device_id it821x_pci_tbl[] = {
+	{ PCI_VENDOR_ID_ITE, PCI_DEVICE_ID_ITE_8211,  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ PCI_VENDOR_ID_ITE, PCI_DEVICE_ID_ITE_8212,  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ 0, },
+};
+
+MODULE_DEVICE_TABLE(pci, it821x_pci_tbl);
+
+static struct pci_driver driver = {
+	.name		= "ITE821x IDE",
+	.id_table	= it821x_pci_tbl,
+	.probe		= it821x_init_one,
+};
+
+static int __init it821x_ide_init(void)
+{
+	return ide_pci_register_driver(&driver);
+}
+
+module_init(it821x_ide_init);
+
+module_param_named(noraid, it8212_noraid, int, S_IRUGO);
+MODULE_PARM_DESC(noraid, "Force card into bypass mode");
+
+MODULE_AUTHOR("Alan Cox");
+MODULE_DESCRIPTION("PCI driver module for the ITE 821x");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index a3961e1d5183..1e0bc6a8d653 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1815,6 +1815,8 @@
 #define PCI_VENDOR_ID_ITE		0x1283
 #define PCI_DEVICE_ID_ITE_IT8172G	0x8172
 #define PCI_DEVICE_ID_ITE_IT8172G_AUDIO 0x0801
+#define PCI_DEVICE_ID_ITE_8211		0x8211
+#define PCI_DEVICE_ID_ITE_8212		0x8212
 #define PCI_DEVICE_ID_ITE_8872		0x8872
 #define PCI_DEVICE_ID_ITE_IT8330G_0	0xe886
 
-- 
cgit v1.2.3-59-g8ed1b


From 1ad275e3e7d253d44f03868e85977c908e334fed Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Mon, 27 Jun 2005 16:28:06 -0700
Subject: [PATCH] pcmcia: device and driver matching

The actual matching of pcmcia drivers and pcmcia devices.  The original
version of this was written by David Woodhouse.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/pcmcia/ds.c             | 123 +++++++++++++++++++++++++++-
 include/linux/mod_devicetable.h |  33 ++++++++
 include/pcmcia/device_id.h      | 175 ++++++++++++++++++++++++++++++++++++++++
 include/pcmcia/ds.h             |   7 +-
 4 files changed, 336 insertions(+), 2 deletions(-)
 create mode 100644 include/pcmcia/device_id.h

(limited to 'include/linux')

diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c
index 35d479b0df64..5701b93b2ddb 100644
--- a/drivers/pcmcia/ds.c
+++ b/drivers/pcmcia/ds.c
@@ -101,6 +101,9 @@ struct pcmcia_bus_socket {
 	u8			device_count; /* the number of devices, used
 					       * only internally and subject
 					       * to incorrectness and change */
+
+	u8			device_add_pending;
+	struct work_struct	device_add;
 };
 static spinlock_t pcmcia_dev_list_lock;
 
@@ -512,6 +515,10 @@ static struct pcmcia_device * pcmcia_device_add(struct pcmcia_bus_socket *s, uns
 
 	down(&device_add_lock);
 
+	/* max of 2 devices per card */
+	if (s->device_count == 2)
+		goto err_put;
+
 	p_dev = kmalloc(sizeof(struct pcmcia_device), GFP_KERNEL);
 	if (!p_dev)
 		goto err_put;
@@ -537,6 +544,8 @@ static struct pcmcia_device * pcmcia_device_add(struct pcmcia_bus_socket *s, uns
 	list_add_tail(&p_dev->socket_device_list, &s->devices_list);
 	spin_unlock_irqrestore(&pcmcia_dev_list_lock, flags);
 
+	pcmcia_device_query(p_dev);
+
 	if (device_register(&p_dev->dev)) {
 		spin_lock_irqsave(&pcmcia_dev_list_lock, flags);
 		list_del(&p_dev->socket_device_list);
@@ -591,14 +600,123 @@ static int pcmcia_card_add(struct pcmcia_socket *s)
 }
 
 
+static void pcmcia_delayed_add_pseudo_device(void *data)
+{
+	struct pcmcia_bus_socket *s = data;
+	pcmcia_device_add(s, 0);
+	s->device_add_pending = 0;
+}
+
+static inline void pcmcia_add_pseudo_device(struct pcmcia_bus_socket *s)
+{
+	if (!s->device_add_pending) {
+		/* mark pending first: the work handler clears the flag */
+		s->device_add_pending = 1;
+		schedule_work(&s->device_add);
+	}
+}
+
+
+static inline int pcmcia_devmatch(struct pcmcia_device *dev,
+				  struct pcmcia_device_id *did)
+{
+	if (did->match_flags & PCMCIA_DEV_ID_MATCH_MANF_ID) {
+		if ((!dev->has_manf_id) || (dev->manf_id != did->manf_id))
+			return 0;
+	}
+
+	if (did->match_flags & PCMCIA_DEV_ID_MATCH_CARD_ID) {
+		if ((!dev->has_card_id) || (dev->card_id != did->card_id))
+			return 0;
+	}
+
+	if (did->match_flags & PCMCIA_DEV_ID_MATCH_FUNCTION) {
+		if (dev->func != did->function)
+			return 0;
+	}
+
+	if (did->match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID1) {
+		if (!dev->prod_id[0])
+			return 0;
+		if (strcmp(did->prod_id[0], dev->prod_id[0]))
+			return 0;
+	}
+
+	if (did->match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID2) {
+		if (!dev->prod_id[1])
+			return 0;
+		if (strcmp(did->prod_id[1], dev->prod_id[1]))
+			return 0;
+	}
+
+	if (did->match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID3) {
+		if (!dev->prod_id[2])
+			return 0;
+		if (strcmp(did->prod_id[2], dev->prod_id[2]))
+			return 0;
+	}
+
+	if (did->match_flags & PCMCIA_DEV_ID_MATCH_PROD_ID4) {
+		if (!dev->prod_id[3])
+			return 0;
+		if (strcmp(did->prod_id[3], dev->prod_id[3]))
+			return 0;
+	}
+
+	if (did->match_flags & PCMCIA_DEV_ID_MATCH_DEVICE_NO) {
+		/* handle pseudo multifunction devices:
+		 * there are at most two pseudo multifunction devices.
+		 * if we're matching against the first, schedule a
+		 * call which will then check whether there are two
+		 * pseudo devices, and if not, add the second one.
+		 */
+		if (dev->device_no == 0)
+			pcmcia_add_pseudo_device(dev->socket->pcmcia);
+
+		if (dev->device_no != did->device_no)
+			return 0;
+	}
+
+	if (did->match_flags & PCMCIA_DEV_ID_MATCH_FUNC_ID) {
+		if ((!dev->has_func_id) || (dev->func_id != did->func_id))
+			return 0;
+
+		/* if this is a pseudo-multi-function device,
+		 * we need explicit matches */
+		if (did->match_flags & PCMCIA_DEV_ID_MATCH_DEVICE_NO)
+			return 0;
+		if (dev->device_no)
+			return 0;
+
+		/* also, FUNC_ID matching needs to be activated by userspace
+		 * after it has re-checked that there is no possible module
+		 * with a prod_id/manf_id/card_id match.
+		 */
+		if (!dev->allow_func_id_match)
+			return 0;
+	}
+
+	dev->dev.driver_data = (void *) did;
+
+	return 1;
+}
+
+
 static int pcmcia_bus_match(struct device * dev, struct device_driver * drv) {
 	struct pcmcia_device * p_dev = to_pcmcia_dev(dev);
 	struct pcmcia_driver * p_drv = to_pcmcia_drv(drv);
+	struct pcmcia_device_id *did = p_drv->id_table;
 
 	/* matching by cardmgr */
 	if (p_dev->cardmgr == p_drv)
 		return 1;
 
+	while (did && did->match_flags) {
+		if (pcmcia_devmatch(p_dev, did))
+			return 1;
+		did++;
+	}
+
 	return 0;
 }
 
@@ -922,7 +1040,9 @@ static int bind_request(struct pcmcia_bus_socket *s, bind_info_t *bind_info)
 rescan:
 	p_dev->cardmgr = p_drv;
 
-	pcmcia_device_query(p_dev);
+	/* if a driver is already running, we can abort */
+	if (p_dev->dev.driver)
+		goto err_put_module;
 
 	/*
 	 * Prevent this racing with a card insertion.
@@ -1595,6 +1715,7 @@ static int __devinit pcmcia_bus_add_socket(struct class_device *class_dev)
 
 	init_waitqueue_head(&s->queue);
 	INIT_LIST_HEAD(&s->devices_list);
+	INIT_WORK(&s->device_add, pcmcia_delayed_add_pseudo_device, s);
 
 	/* Set up hotline to Card Services */
 	s->callback.owner = THIS_MODULE;
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index d6eb7b2efc04..e9651cd8310c 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -175,4 +175,37 @@ struct serio_device_id {
 };
 
 
+/* PCMCIA */
+
+struct pcmcia_device_id {
+	__u16		match_flags;
+
+	__u16		manf_id;
+	__u16 		card_id;
+
+	__u8  		func_id;
+
+	/* for real multi-function devices */
+	__u8  		function;
+
+	/* for pseude multi-function devices */
+	__u8  		device_no;
+
+	const char *	prod_id[4];
+	__u32 		prod_id_hash[4];
+
+	/* not matched against */
+	kernel_ulong_t	driver_info;
+};
+
+#define PCMCIA_DEV_ID_MATCH_MANF_ID	0x0001
+#define PCMCIA_DEV_ID_MATCH_CARD_ID	0x0002
+#define PCMCIA_DEV_ID_MATCH_FUNC_ID	0x0004
+#define PCMCIA_DEV_ID_MATCH_FUNCTION	0x0008
+#define PCMCIA_DEV_ID_MATCH_PROD_ID1	0x0010
+#define PCMCIA_DEV_ID_MATCH_PROD_ID2	0x0020
+#define PCMCIA_DEV_ID_MATCH_PROD_ID3	0x0040
+#define PCMCIA_DEV_ID_MATCH_PROD_ID4	0x0080
+#define PCMCIA_DEV_ID_MATCH_DEVICE_NO	0x0100
+
 #endif /* LINUX_MOD_DEVICETABLE_H */
diff --git a/include/pcmcia/device_id.h b/include/pcmcia/device_id.h
new file mode 100644
index 000000000000..acf68656de3c
--- /dev/null
+++ b/include/pcmcia/device_id.h
@@ -0,0 +1,175 @@
+/*
+ * Copyright (2003-2004) 	Dominik Brodowski <linux@brodo.de>
+ *				David Woodhouse
+ *
+ * License: GPL v2
+ */
+
+#define PCMCIA_DEVICE_MANF_CARD(manf, card) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_MANF_ID| \
+			PCMCIA_DEV_ID_MATCH_CARD_ID, \
+	.manf_id = (manf), \
+	.card_id = (card), }
+
+#define PCMCIA_DEVICE_FUNC_ID(func) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_FUNC_ID, \
+	.func_id = (func), }
+
+#define PCMCIA_DEVICE_PROD_ID1(v1, vh1) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID1, \
+	.prod_id = { (v1), NULL, NULL, NULL }, \
+	.prod_id_hash = { (vh1), 0, 0, 0 }, }
+
+#define PCMCIA_DEVICE_PROD_ID2(v2, vh2) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID2, \
+	.prod_id = { NULL, (v2), NULL, NULL },  \
+	.prod_id_hash = { 0, (vh2), 0, 0 }, }
+
+#define PCMCIA_DEVICE_PROD_ID12(v1, v2, vh1, vh2) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID2, \
+	.prod_id = { (v1), (v2), NULL, NULL }, \
+	.prod_id_hash = { (vh1), (vh2), 0, 0 }, }
+
+#define PCMCIA_DEVICE_PROD_ID13(v1, v3, vh1, vh3) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID3, \
+	.prod_id = { (v1), NULL, (v3), NULL }, \
+	.prod_id_hash = { (vh1), 0, (vh3), 0 }, }
+
+#define PCMCIA_DEVICE_PROD_ID14(v1, v4, vh1, vh4) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID4, \
+	.prod_id = { (v1), NULL, NULL, (v4) }, \
+	.prod_id_hash = { (vh1), 0, 0, (vh4) }, }
+
+#define PCMCIA_DEVICE_PROD_ID123(v1, v2, v3, vh1, vh2, vh3) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID2| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID3, \
+	.prod_id = { (v1), (v2), (v3), NULL },\
+	.prod_id_hash = { (vh1), (vh2), (vh3), 0 }, }
+
+#define PCMCIA_DEVICE_PROD_ID124(v1, v2, v4, vh1, vh2, vh4) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID2| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID4, \
+	.prod_id = { (v1), (v2), NULL, (v4) }, \
+	.prod_id_hash = { (vh1), (vh2), 0, (vh4) }, }
+
+#define PCMCIA_DEVICE_PROD_ID134(v1, v3, v4, vh1, vh3, vh4) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID3| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID4, \
+	.prod_id = { (v1), NULL, (v3), (v4) }, \
+	.prod_id_hash = { (vh1), 0, (vh3), (vh4) }, }
+
+#define PCMCIA_DEVICE_PROD_ID1234(v1, v2, v3, v4, vh1, vh2, vh3, vh4) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID2| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID3| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID4, \
+	.prod_id = { (v1), (v2), (v3), (v4) }, \
+	.prod_id_hash = { (vh1), (vh2), (vh3), (vh4) }, }
+
+
+/* multi-function devices */
+
+#define PCMCIA_MFC_DEVICE_MANF_CARD(mfc, manf, card) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_MANF_ID| \
+			PCMCIA_DEV_ID_MATCH_CARD_ID| \
+			PCMCIA_DEV_ID_MATCH_FUNCTION, \
+	.manf_id = (manf), \
+	.card_id = (card), \
+	.function = (mfc), }
+
+#define PCMCIA_MFC_DEVICE_PROD_ID1(mfc, v1, vh1) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_FUNCTION, \
+	.prod_id = { (v1), NULL, NULL, NULL }, \
+	.prod_id_hash = { (vh1), 0, 0, 0 }, \
+	.function = (mfc), }
+
+#define PCMCIA_MFC_DEVICE_PROD_ID2(mfc, v2, vh2) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID2| \
+			PCMCIA_DEV_ID_MATCH_FUNCTION, \
+	.prod_id = { NULL, (v2), NULL, NULL },  \
+	.prod_id_hash = { 0, (vh2), 0, 0 }, \
+	.function = (mfc), }
+
+#define PCMCIA_MFC_DEVICE_PROD_ID12(mfc, v1, v2, vh1, vh2) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID2| \
+			PCMCIA_DEV_ID_MATCH_FUNCTION, \
+	.prod_id = { (v1), (v2), NULL, NULL }, \
+	.prod_id_hash = { (vh1), (vh2), 0, 0 }, \
+	.function = (mfc), }
+
+#define PCMCIA_MFC_DEVICE_PROD_ID13(mfc, v1, v3, vh1, vh3) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID3| \
+			PCMCIA_DEV_ID_MATCH_FUNCTION, \
+	.prod_id = { (v1), NULL, (v3), NULL }, \
+	.prod_id_hash = { (vh1), 0, (vh3), 0 }, \
+	.function = (mfc), }
+
+#define PCMCIA_MFC_DEVICE_PROD_ID123(mfc, v1, v2, v3, vh1, vh2, vh3) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID2| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID3| \
+			PCMCIA_DEV_ID_MATCH_FUNCTION, \
+	.prod_id = { (v1), (v2), (v3), NULL },\
+	.prod_id_hash = { (vh1), (vh2), (vh3), 0 }, \
+	.function = (mfc), }
+
+/* pseudo multi-function devices */
+
+#define PCMCIA_PFC_DEVICE_MANF_CARD(mfc, manf, card) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_MANF_ID| \
+			PCMCIA_DEV_ID_MATCH_CARD_ID| \
+			PCMCIA_DEV_ID_MATCH_DEVICE_NO, \
+	.manf_id = (manf), \
+	.card_id = (card), \
+	.device_no = (mfc), }
+
+#define PCMCIA_PFC_DEVICE_PROD_ID1(mfc, v1, vh1) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_DEVICE_NO, \
+	.prod_id = { (v1), NULL, NULL, NULL }, \
+	.prod_id_hash = { (vh1), 0, 0, 0 }, \
+	.device_no = (mfc), }
+
+#define PCMCIA_PFC_DEVICE_PROD_ID2(mfc, v2, vh2) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID2| \
+			PCMCIA_DEV_ID_MATCH_DEVICE_NO, \
+	.prod_id = { NULL, (v2), NULL, NULL },  \
+	.prod_id_hash = { 0, (vh2), 0, 0 }, \
+	.device_no = (mfc), }
+
+#define PCMCIA_PFC_DEVICE_PROD_ID12(mfc, v1, v2, vh1, vh2) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID2| \
+			PCMCIA_DEV_ID_MATCH_DEVICE_NO, \
+	.prod_id = { (v1), (v2), NULL, NULL }, \
+	.prod_id_hash = { (vh1), (vh2), 0, 0 }, \
+	.device_no = (mfc), }
+
+#define PCMCIA_PFC_DEVICE_PROD_ID13(mfc, v1, v3, vh1, vh3) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID3| \
+			PCMCIA_DEV_ID_MATCH_DEVICE_NO, \
+	.prod_id = { (v1), NULL, (v3), NULL }, \
+	.prod_id_hash = { (vh1), 0, (vh3), 0 }, \
+	.device_no = (mfc), }
+
+#define PCMCIA_PFC_DEVICE_PROD_ID123(mfc, v1, v2, v3, vh1, vh2, vh3) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID2| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID3| \
+			PCMCIA_DEV_ID_MATCH_DEVICE_NO, \
+	.prod_id = { (v1), (v2), (v3), NULL },\
+	.prod_id_hash = { (vh1), (vh2), (vh3), 0 }, \
+	.device_no = (mfc), }
+
+
+#define PCMCIA_DEVICE_NULL { .match_flags = 0, }
diff --git a/include/pcmcia/ds.h b/include/pcmcia/ds.h
index 312fd958c901..c267edde9d0c 100644
--- a/include/pcmcia/ds.h
+++ b/include/pcmcia/ds.h
@@ -18,6 +18,8 @@
 
 #include <pcmcia/bulkmem.h>
 #include <pcmcia/cs_types.h>
+#include <pcmcia/device_id.h>
+#include <linux/mod_devicetable.h>
 
 typedef struct tuple_parse_t {
     tuple_t		tuple;
@@ -135,6 +137,7 @@ struct pcmcia_driver {
 	dev_link_t		*(*attach)(void);
 	void			(*detach)(dev_link_t *);
 	struct module		*owner;
+	struct pcmcia_device_id	*id_table;
 	struct device_driver	drv;
 };
 
@@ -173,7 +176,9 @@ struct pcmcia_device {
 	u8			has_manf_id:1;
 	u8			has_card_id:1;
 	u8			has_func_id:1;
-	u8			reserved:5;
+
+	u8			allow_func_id_match:1;
+	u8			reserved:4;
 
 	u8			func_id;
 	u16			manf_id;
-- 
cgit v1.2.3-59-g8ed1b


From ea7b38825bba66a81745a706da70a1c81adc95bd Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Mon, 27 Jun 2005 16:28:07 -0700
Subject: [PATCH] pcmcia: match for fake CIS

Add another match flag for devices needing a CIS override.  The driver will
only probe/attach if the CIS has been replaced before.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/pcmcia/ds.c             |  8 +++++
 include/linux/mod_devicetable.h |  2 ++
 include/pcmcia/device_id.h      | 74 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c
index 3ac7a443f668..c0611d56eab2 100644
--- a/drivers/pcmcia/ds.c
+++ b/drivers/pcmcia/ds.c
@@ -733,6 +733,14 @@ static inline int pcmcia_devmatch(struct pcmcia_device *dev,
 			return 0;
 	}
 
+	if (did->match_flags & PCMCIA_DEV_ID_MATCH_FAKE_CIS) {
+		if (!dev->socket->fake_cis) {
+			/* FIXME: evaluate using firmware helpers to
+			 * automagically load it from userspace */
+			return 0;
+		}
+	}
+
 	dev->dev.driver_data = (void *) did;
 
 	return 1;
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index e9651cd8310c..c0106d68bb6d 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -196,6 +196,7 @@ struct pcmcia_device_id {
 
 	/* not matched against */
 	kernel_ulong_t	driver_info;
+	char *		cisfile;
 };
 
 #define PCMCIA_DEV_ID_MATCH_MANF_ID	0x0001
@@ -207,5 +208,6 @@ struct pcmcia_device_id {
 #define PCMCIA_DEV_ID_MATCH_PROD_ID3	0x0040
 #define PCMCIA_DEV_ID_MATCH_PROD_ID4	0x0080
 #define PCMCIA_DEV_ID_MATCH_DEVICE_NO	0x0100
+#define PCMCIA_DEV_ID_MATCH_FAKE_CIS	0x0200
 
 #endif /* LINUX_MOD_DEVICETABLE_H */
diff --git a/include/pcmcia/device_id.h b/include/pcmcia/device_id.h
index acf68656de3c..346d81ece287 100644
--- a/include/pcmcia/device_id.h
+++ b/include/pcmcia/device_id.h
@@ -171,5 +171,79 @@
 	.prod_id_hash = { (vh1), (vh2), (vh3), 0 }, \
 	.device_no = (mfc), }
 
+/* cards needing a CIS override */
+
+#define PCMCIA_DEVICE_CIS_MANF_CARD(manf, card, _cisfile) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_FAKE_CIS | \
+			PCMCIA_DEV_ID_MATCH_MANF_ID| \
+			PCMCIA_DEV_ID_MATCH_CARD_ID, \
+	.manf_id = (manf), \
+	.card_id = (card), \
+	.cisfile = (_cisfile)}
+
+#define PCMCIA_DEVICE_CIS_PROD_ID12(v1, v2, vh1, vh2, _cisfile) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_FAKE_CIS | \
+			PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID2, \
+	.prod_id = { (v1), (v2), NULL, NULL }, \
+	.prod_id_hash = { (vh1), (vh2), 0, 0 }, \
+	.cisfile = (_cisfile)}
+
+#define PCMCIA_DEVICE_CIS_PROD_ID123(v1, v2, v3, vh1, vh2, vh3, _cisfile) { \
+	.match_flags = PCMCIA_DEV_ID_MATCH_FAKE_CIS | \
+			PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID2| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID3, \
+	.prod_id = { (v1), (v2), (v3), NULL },\
+	.prod_id_hash = { (vh1), (vh2), (vh3), 0 }, \
+	.cisfile = (_cisfile)}
+
+
+#define PCMCIA_DEVICE_CIS_PROD_ID2(v2, vh2, _cisfile) { \
+	.match_flags =  PCMCIA_DEV_ID_MATCH_FAKE_CIS | \
+			PCMCIA_DEV_ID_MATCH_PROD_ID2, \
+	.prod_id = { NULL, (v2), NULL, NULL },  \
+	.prod_id_hash = { 0, (vh2), 0, 0 }, \
+	.cisfile = (_cisfile)}
+
+#define PCMCIA_PFC_DEVICE_CIS_PROD_ID12(mfc, v1, v2, vh1, vh2, _cisfile) { \
+	.match_flags =  PCMCIA_DEV_ID_MATCH_FAKE_CIS | \
+			PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID2| \
+			PCMCIA_DEV_ID_MATCH_DEVICE_NO, \
+	.prod_id = { (v1), (v2), NULL, NULL }, \
+	.prod_id_hash = { (vh1), (vh2), 0, 0 },\
+	.device_no = (mfc), \
+	.cisfile = (_cisfile)}
+
+#define PCMCIA_MFC_DEVICE_CIS_MANF_CARD(mfc, manf, card, _cisfile) { \
+	.match_flags =  PCMCIA_DEV_ID_MATCH_FAKE_CIS | \
+			PCMCIA_DEV_ID_MATCH_MANF_ID| \
+			PCMCIA_DEV_ID_MATCH_CARD_ID| \
+			PCMCIA_DEV_ID_MATCH_FUNCTION, \
+	.manf_id = (manf), \
+	.card_id = (card), \
+	.function = (mfc), \
+	.cisfile = (_cisfile)}
+
+#define PCMCIA_MFC_DEVICE_CIS_PROD_ID12(mfc, v1, v2, vh1, vh2, _cisfile) { \
+	.match_flags =  PCMCIA_DEV_ID_MATCH_FAKE_CIS | \
+			PCMCIA_DEV_ID_MATCH_PROD_ID1| \
+			PCMCIA_DEV_ID_MATCH_PROD_ID2| \
+			PCMCIA_DEV_ID_MATCH_FUNCTION, \
+	.prod_id = { (v1), (v2), NULL, NULL }, \
+	.prod_id_hash = { (vh1), (vh2), 0, 0 }, \
+	.function = (mfc), \
+	.cisfile = (_cisfile)}
+
+#define PCMCIA_MFC_DEVICE_CIS_PROD_ID4(mfc, v4, vh4, _cisfile) { \
+	.match_flags =  PCMCIA_DEV_ID_MATCH_FAKE_CIS | \
+			PCMCIA_DEV_ID_MATCH_PROD_ID4| \
+			PCMCIA_DEV_ID_MATCH_FUNCTION, \
+	.prod_id = { NULL, NULL, NULL, (v4) }, \
+	.prod_id_hash = { 0, 0, 0, (vh4) }, \
+	.function = (mfc), \
+	.cisfile = (_cisfile)}
+
 
 #define PCMCIA_DEVICE_NULL { .match_flags = 0, }
-- 
cgit v1.2.3-59-g8ed1b


From f602ff7eb4e44e7245bfeeba4d078144703fcd76 Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Mon, 27 Jun 2005 16:28:09 -0700
Subject: [PATCH] pcmcia: match "anonymous" cards

If a card doesn't provide _any_ information about itself, assume it is a
so-called "anonymous" card.  pcmciamtd will bind to it if it is configured to
do so.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/pcmcia/ds.c             | 9 +++++++++
 include/linux/mod_devicetable.h | 1 +
 2 files changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pcmcia/ds.c b/drivers/pcmcia/ds.c
index f657a2a77b2b..66680699e913 100644
--- a/drivers/pcmcia/ds.c
+++ b/drivers/pcmcia/ds.c
@@ -746,6 +746,15 @@ static inline int pcmcia_devmatch(struct pcmcia_device *dev,
 		}
 	}
 
+	if (did->match_flags & PCMCIA_DEV_ID_MATCH_ANONYMOUS) {
+		int i;
+		for (i=0; i<4; i++)
+			if (dev->prod_id[i])
+				return 0;
+		if (dev->has_manf_id || dev->has_card_id || dev->has_func_id)
+			return 0;
+	}
+
 	dev->dev.driver_data = (void *) did;
 
 	return 1;
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index c0106d68bb6d..8a8dc82a941d 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -209,5 +209,6 @@ struct pcmcia_device_id {
 #define PCMCIA_DEV_ID_MATCH_PROD_ID4	0x0080
 #define PCMCIA_DEV_ID_MATCH_DEVICE_NO	0x0100
 #define PCMCIA_DEV_ID_MATCH_FAKE_CIS	0x0200
+#define PCMCIA_DEV_ID_MATCH_ANONYMOUS	0x0400
 
 #endif /* LINUX_MOD_DEVICETABLE_H */
-- 
cgit v1.2.3-59-g8ed1b


From aecab27aeabaa897d69fc082686df314329830de Mon Sep 17 00:00:00 2001
From: Dominik Brodowski <linux@dominikbrodowski.net>
Date: Mon, 27 Jun 2005 16:28:56 -0700
Subject: [PATCH] pcmcia: mod_devicetable.h fix for different sizes in kernel-
 and userspace

The size of pointers may differ between (userspace) modpost and (kernelspace)
modules -- so fix mod_devicetable.h to reflect this possibility.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mod_devicetable.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 8a8dc82a941d..9b6d05172ed4 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -191,12 +191,22 @@ struct pcmcia_device_id {
 	/* for pseude multi-function devices */
 	__u8  		device_no;
 
-	const char *	prod_id[4];
 	__u32 		prod_id_hash[4];
 
+	/* not matched against in kernelspace*/
+#ifdef __KERNEL__
+	const char *	prod_id[4];
+#else
+	kernel_ulong_t	prod_id[4];
+#endif
+
 	/* not matched against */
 	kernel_ulong_t	driver_info;
+#ifdef __KERNEL__
 	char *		cisfile;
+#else
+	kernel_ulong_t	cisfile;
+#endif
 };
 
 #define PCMCIA_DEV_ID_MATCH_MANF_ID	0x0001
-- 
cgit v1.2.3-59-g8ed1b


From a5fe736eaf9bae1b45317313de04b564441b94f2 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jgarzik@pobox.com>
Date: Mon, 27 Jun 2005 22:47:18 -0400
Subject: Update is_multicast_ether_addr() definition; net/ieee80211.h
 cleanups.

---
 include/linux/etherdevice.h |  2 +-
 include/net/ieee80211.h     | 48 ++++++++++-----------------------------------
 2 files changed, 11 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index a1478258d002..8a2df4dfbc59 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -65,7 +65,7 @@ static inline int is_zero_ether_addr(const u8 *addr)
  */
 static inline int is_multicast_ether_addr(const u8 *addr)
 {
-	return addr[0] & 0x01;
+	return ((addr[0] != 0xff) && (0x01 & addr[0]));
 }
 
 /**
diff --git a/include/net/ieee80211.h b/include/net/ieee80211.h
index 7fe57f957a51..151c4f203559 100644
--- a/include/net/ieee80211.h
+++ b/include/net/ieee80211.h
@@ -94,6 +94,8 @@ struct eapol {
 	u16 length;
 } __attribute__ ((packed));
 
+#define IEEE80211_1ADDR_LEN 10
+#define IEEE80211_2ADDR_LEN 16
 #define IEEE80211_3ADDR_LEN 24
 #define IEEE80211_4ADDR_LEN 30
 #define IEEE80211_FCS_LEN    4
@@ -300,23 +302,6 @@ struct ieee80211_snap_hdr {
 #define WLAN_REASON_STA_REQ_ASSOC_WITHOUT_AUTH 9
 
 
-/* Information Element IDs */
-#define WLAN_EID_SSID 0
-#define WLAN_EID_SUPP_RATES 1
-#define WLAN_EID_FH_PARAMS 2
-#define WLAN_EID_DS_PARAMS 3
-#define WLAN_EID_CF_PARAMS 4
-#define WLAN_EID_TIM 5
-#define WLAN_EID_IBSS_PARAMS 6
-#define WLAN_EID_CHALLENGE 16
-#define WLAN_EID_RSN 48
-#define WLAN_EID_GENERIC 221
-
-#define IEEE80211_MGMT_HDR_LEN 24
-#define IEEE80211_DATA_HDR3_LEN 24
-#define IEEE80211_DATA_HDR4_LEN 30
-
-
 #define IEEE80211_STATMASK_SIGNAL (1<<0)
 #define IEEE80211_STATMASK_RSSI (1<<1)
 #define IEEE80211_STATMASK_NOISE (1<<2)
@@ -441,6 +426,8 @@ struct ieee80211_stats {
 
 struct ieee80211_device;
 
+#include "ieee80211_crypt.h"
+
 #define SEC_KEY_1         (1<<0)
 #define SEC_KEY_2         (1<<1)
 #define SEC_KEY_3         (1<<2)
@@ -488,15 +475,6 @@ Total: 28-2340 bytes
 
 */
 
-struct ieee80211_header_data {
-	u16 frame_ctl;
-	u16 duration_id;
-	u8 addr1[6];
-	u8 addr2[6];
-	u8 addr3[6];
-	u16 seq_ctrl;
-};
-
 #define BEACON_PROBE_SSID_ID_POSITION 12
 
 /* Management Frame Information Element Types */
@@ -541,7 +519,7 @@ struct ieee80211_info_element {
 */
 
 struct ieee80211_authentication {
-	struct ieee80211_header_data header;
+	struct ieee80211_hdr_3addr header;
 	u16 algorithm;
 	u16 transaction;
 	u16 status;
@@ -550,7 +528,7 @@ struct ieee80211_authentication {
 
 
 struct ieee80211_probe_response {
-	struct ieee80211_header_data header;
+	struct ieee80211_hdr_3addr header;
 	u32 time_stamp[2];
 	u16 beacon_interval;
 	u16 capability;
@@ -648,12 +626,6 @@ enum ieee80211_state {
 #define MAC_ARG(x) ((u8*)(x))[0],((u8*)(x))[1],((u8*)(x))[2],((u8*)(x))[3],((u8*)(x))[4],((u8*)(x))[5]
 
 
-extern inline int is_broadcast_ether_addr(const u8 *addr)
-{
-	return ((addr[0] == 0xff) && (addr[1] == 0xff) && (addr[2] == 0xff) &&   \
-		(addr[3] == 0xff) && (addr[4] == 0xff) && (addr[5] == 0xff));
-}
-
 #define CFG_IEEE80211_RESERVE_FCS (1<<0)
 #define CFG_IEEE80211_COMPUTE_FCS (1<<1)
 
@@ -787,21 +759,21 @@ extern inline int ieee80211_is_valid_mode(struct ieee80211_device *ieee, int mod
 
 extern inline int ieee80211_get_hdrlen(u16 fc)
 {
-	int hdrlen = 24;
+	int hdrlen = IEEE80211_3ADDR_LEN;
 
 	switch (WLAN_FC_GET_TYPE(fc)) {
 	case IEEE80211_FTYPE_DATA:
 		if ((fc & IEEE80211_FCTL_FROMDS) && (fc & IEEE80211_FCTL_TODS))
-			hdrlen = 30; /* Addr4 */
+			hdrlen = IEEE80211_4ADDR_LEN;
 		break;
 	case IEEE80211_FTYPE_CTL:
 		switch (WLAN_FC_GET_STYPE(fc)) {
 		case IEEE80211_STYPE_CTS:
 		case IEEE80211_STYPE_ACK:
-			hdrlen = 10;
+			hdrlen = IEEE80211_1ADDR_LEN;
 			break;
 		default:
-			hdrlen = 16;
+			hdrlen = IEEE80211_2ADDR_LEN;
 			break;
 		}
 		break;
-- 
cgit v1.2.3-59-g8ed1b


From c431ada45d65b305a6aab4557067e564b23ce5a5 Mon Sep 17 00:00:00 2001
From: Rajesh Shah <rajesh.shah@intel.com>
Date: Thu, 28 Apr 2005 00:25:45 -0700
Subject: [PATCH] acpi bridge hotadd: ACPI based root bridge hot-add

When you hot-plug a (root) bridge hierarchy, it may have p2p bridges and
devices attached to it that have not been configured by firmware.  In this
case, we need to configure the devices before starting them.  This patch
separates device start from device scan so that we can introduce the
configuration step in the middle.

I kept the existing semantics for pci_scan_bus() since there are a huge number
of callers to that function.

Also, I have no way of testing the changes I made to the parisc files, so this
needs review by those folks.  Sorry for the massive cross-post, this touches
files in many different places.

Signed-off-by: Rajesh Shah <rajesh.shah@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/i386/pci/common.c   |  2 +-
 arch/i386/pci/legacy.c   |  2 ++
 arch/i386/pci/numa.c     |  2 ++
 arch/ia64/pci/pci.c      |  2 +-
 drivers/acpi/pci_bind.c  | 16 +++++++++++++++-
 drivers/acpi/pci_root.c  | 24 +++++++++++++++++++++++-
 drivers/parisc/dino.c    |  1 +
 drivers/parisc/lba_pci.c |  2 ++
 drivers/pci/probe.c      |  2 --
 include/linux/pci.h      |  8 ++++++--
 10 files changed, 53 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c
index 2a2e79fbfef8..87325263cd4f 100644
--- a/arch/i386/pci/common.c
+++ b/arch/i386/pci/common.c
@@ -134,7 +134,7 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
 
 	printk("PCI: Probing PCI hardware (bus %02x)\n", busnum);
 
-	return pci_scan_bus(busnum, &pci_root_ops, NULL);
+	return pci_scan_bus_parented(NULL, busnum, &pci_root_ops, NULL);
 }
 
 extern u8 pci_cache_line_size;
diff --git a/arch/i386/pci/legacy.c b/arch/i386/pci/legacy.c
index 1492e3753869..149a9588c256 100644
--- a/arch/i386/pci/legacy.c
+++ b/arch/i386/pci/legacy.c
@@ -45,6 +45,8 @@ static int __init pci_legacy_init(void)
 
 	printk("PCI: Probing PCI hardware\n");
 	pci_root_bus = pcibios_scan_root(0);
+	if (pci_root_bus)
+		pci_bus_add_devices(pci_root_bus);
 
 	pcibios_fixup_peer_bridges();
 
diff --git a/arch/i386/pci/numa.c b/arch/i386/pci/numa.c
index 9e3695461899..adbe17a38f6f 100644
--- a/arch/i386/pci/numa.c
+++ b/arch/i386/pci/numa.c
@@ -115,6 +115,8 @@ static int __init pci_numa_init(void)
 		return 0;
 
 	pci_root_bus = pcibios_scan_root(0);
+	if (pci_root_bus)
+		pci_bus_add_devices(pci_root_bus);
 	if (num_online_nodes() > 1)
 		for_each_online_node(quad) {
 			if (quad == 0)
diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
index e3fc4edea113..c0661d3382e4 100644
--- a/arch/ia64/pci/pci.c
+++ b/arch/ia64/pci/pci.c
@@ -312,7 +312,7 @@ pci_acpi_scan_root(struct acpi_device *device, int domain, int bus)
 	acpi_walk_resources(device->handle, METHOD_NAME__CRS, add_window,
 			&info);
 
-	pbus = pci_scan_bus(bus, &pci_root_ops, controller);
+	pbus = pci_scan_bus_parented(NULL, bus, &pci_root_ops, controller);
 	if (pbus)
 		pcibios_setup_root_windows(pbus, controller);
 
diff --git a/drivers/acpi/pci_bind.c b/drivers/acpi/pci_bind.c
index 5d19b39e9e2b..7753df1f9fb8 100644
--- a/drivers/acpi/pci_bind.c
+++ b/drivers/acpi/pci_bind.c
@@ -129,6 +129,8 @@ acpi_pci_bind (
 	char			*pathname = NULL;
 	struct acpi_buffer	buffer = {0, NULL};
 	acpi_handle		handle = NULL;
+	struct pci_dev		*dev;
+	struct pci_bus 		*bus;
 
 	ACPI_FUNCTION_TRACE("acpi_pci_bind");
 
@@ -193,8 +195,20 @@ acpi_pci_bind (
 	 * Locate matching device in PCI namespace.  If it doesn't exist
 	 * this typically means that the device isn't currently inserted
 	 * (e.g. docking station, port replicator, etc.).
+	 * We cannot simply search the global pci device list, since
+	 * PCI devices are added to the global pci list when the root
+	 * bridge start ops are run, which may not have happened yet.
 	 */
-	data->dev = pci_find_slot(data->id.bus, PCI_DEVFN(data->id.device, data->id.function));
+	bus = pci_find_bus(data->id.segment, data->id.bus);
+	if (bus) {
+		list_for_each_entry(dev, &bus->devices, bus_list) {
+			if (dev->devfn == PCI_DEVFN(data->id.device,
+						data->id.function)) {
+				data->dev = dev;
+				break;
+			}
+		}
+	}
 	if (!data->dev) {
 		ACPI_DEBUG_PRINT((ACPI_DB_INFO, 
 			"Device %02x:%02x:%02x.%02x not present in PCI namespace\n",
diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index 7e6b8e3b2ed4..5d2f77fcd50c 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -46,6 +46,7 @@ ACPI_MODULE_NAME		("pci_root")
 
 static int acpi_pci_root_add (struct acpi_device *device);
 static int acpi_pci_root_remove (struct acpi_device *device, int type);
+static int acpi_pci_root_start (struct acpi_device *device);
 
 static struct acpi_driver acpi_pci_root_driver = {
 	.name =		ACPI_PCI_ROOT_DRIVER_NAME,
@@ -54,6 +55,7 @@ static struct acpi_driver acpi_pci_root_driver = {
 	.ops =		{
 				.add =    acpi_pci_root_add,
 				.remove = acpi_pci_root_remove,
+				.start =  acpi_pci_root_start,
 			},
 };
 
@@ -169,6 +171,7 @@ acpi_pci_root_add (
 	if (!root)
 		return_VALUE(-ENOMEM);
 	memset(root, 0, sizeof(struct acpi_pci_root));
+	INIT_LIST_HEAD(&root->node);
 
 	root->handle = device->handle;
 	strcpy(acpi_device_name(device), ACPI_PCI_ROOT_DEVICE_NAME);
@@ -298,12 +301,31 @@ acpi_pci_root_add (
 			root->id.bus);
 
 end:
-	if (result)
+	if (result) {
+		if (!list_empty(&root->node))
+			list_del(&root->node);
 		kfree(root);
+	}
 
 	return_VALUE(result);
 }
 
+static int
+acpi_pci_root_start (
+	struct acpi_device	*device)
+{
+	struct acpi_pci_root	*root;
+
+	ACPI_FUNCTION_TRACE("acpi_pci_root_start");
+
+	list_for_each_entry(root, &acpi_pci_roots, node) {
+		if (root->handle == device->handle) {
+			pci_bus_add_devices(root->bus);
+			return_VALUE(0);
+		}
+	}
+	return_VALUE(-ENODEV);
+}
 
 static int
 acpi_pci_root_remove (
diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c
index b0d2a73d1d47..2f2dbef2c3b7 100644
--- a/drivers/parisc/dino.c
+++ b/drivers/parisc/dino.c
@@ -993,6 +993,7 @@ dino_driver_callback(struct parisc_device *dev)
 	bus = pci_scan_bus_parented(&dev->dev, dino_current_bus,
 				    &dino_cfg_ops, NULL);
 	if(bus) {
+		pci_bus_add_devices(bus);
 		/* This code *depends* on scanning being single threaded
 		 * if it isn't, this global bus number count will fail
 		 */
diff --git a/drivers/parisc/lba_pci.c b/drivers/parisc/lba_pci.c
index dc838804c0dd..7fdd80b7eb47 100644
--- a/drivers/parisc/lba_pci.c
+++ b/drivers/parisc/lba_pci.c
@@ -1570,6 +1570,8 @@ lba_driver_probe(struct parisc_device *dev)
 	lba_bus = lba_dev->hba.hba_bus =
 		pci_scan_bus_parented(&dev->dev, lba_dev->hba.bus_num.start,
 				cfg_ops, NULL);
+	if (lba_bus)
+		pci_bus_add_devices(lba_bus);
 
 	/* This is in lieu of calling pci_assign_unassigned_resources() */
 	if (is_pdc_pat()) {
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index fd48b201eb53..3dc00f0ca8a0 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -911,8 +911,6 @@ struct pci_bus * __devinit pci_scan_bus_parented(struct device *parent, int bus,
 
 	b->subordinate = pci_scan_child_bus(b);
 
-	pci_bus_add_devices(b);
-
 	return b;
 
 sys_create_link_err:
diff --git a/include/linux/pci.h b/include/linux/pci.h
index b5238bd18830..0e9844929fe3 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -734,16 +734,20 @@ void pcibios_update_irq(struct pci_dev *, int irq);
 /* Generic PCI functions used internally */
 
 extern struct pci_bus *pci_find_bus(int domain, int busnr);
+void pci_bus_add_devices(struct pci_bus *bus);
 struct pci_bus *pci_scan_bus_parented(struct device *parent, int bus, struct pci_ops *ops, void *sysdata);
 static inline struct pci_bus *pci_scan_bus(int bus, struct pci_ops *ops, void *sysdata)
 {
-	return pci_scan_bus_parented(NULL, bus, ops, sysdata);
+	struct pci_bus *root_bus;
+	root_bus = pci_scan_bus_parented(NULL, bus, ops, sysdata);
+	if (root_bus)
+		pci_bus_add_devices(root_bus);
+	return root_bus;
 }
 int pci_scan_slot(struct pci_bus *bus, int devfn);
 struct pci_dev * pci_scan_single_device(struct pci_bus *bus, int devfn);
 unsigned int pci_scan_child_bus(struct pci_bus *bus);
 void pci_bus_add_device(struct pci_dev *dev);
-void pci_bus_add_devices(struct pci_bus *bus);
 void pci_name_device(struct pci_dev *dev);
 char *pci_class_name(u32 class);
 void pci_read_bridge_bases(struct pci_bus *child);
-- 
cgit v1.2.3-59-g8ed1b


From b1bb248a5d2230a3d8ef42199c742194a8580b15 Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Thu, 28 Apr 2005 00:25:58 -0700
Subject: [PATCH] ACPI based I/O APIC hot-plug: add interfaces

This patch adds the following new interfaces for I/O xAPIC
hotplug. The implementation of these interfaces depends on each
architecture.

    o int acpi_register_ioapic(acpi_handle handle, u64 phys_addr,
			       u32 gsi_base);

        This new interface adds a new I/O xAPIC specified by the
        phys_addr and gsi_base pair. phys_addr is the physical address
        to which the I/O xAPIC is mapped and gsi_base is the global system
        interrupt base of the I/O xAPIC. acpi_register_ioapic returns
        0 on success, or a negative value on error.

    o int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base);

        This new interface removes an I/O xAPIC specified by
        gsi_base. acpi_unregister_ioapic returns 0 on success, or
        a negative value on error.

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/i386/kernel/acpi/boot.c | 16 ++++++++++++++++
 arch/ia64/kernel/acpi.c      | 17 +++++++++++++++++
 include/linux/acpi.h         |  3 +++
 3 files changed, 36 insertions(+)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index 9f63ae0f404b..f5360d88bcf4 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -507,6 +507,22 @@ acpi_unmap_lsapic(int cpu)
 EXPORT_SYMBOL(acpi_unmap_lsapic);
 #endif /* CONFIG_ACPI_HOTPLUG_CPU */
 
+int
+acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
+{
+	/* TBD */
+	return -EINVAL;
+}
+EXPORT_SYMBOL(acpi_register_ioapic);
+
+int
+acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
+{
+	/* TBD */
+	return -EINVAL;
+}
+EXPORT_SYMBOL(acpi_unregister_ioapic);
+
 static unsigned long __init
 acpi_scan_rsdp (
 	unsigned long		start,
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 72dfd9e7de0f..ab798867acdf 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -825,4 +825,21 @@ acpi_map_iosapic (acpi_handle handle, u32 depth, void *context, void **ret)
 	return AE_OK;
 }
 #endif /* CONFIG_NUMA */
+
+int
+acpi_register_ioapic (acpi_handle handle, u64 phys_addr, u32 gsi_base)
+{
+	/* TBD */
+	return -EINVAL;
+}
+EXPORT_SYMBOL(acpi_register_ioapic);
+
+int
+acpi_unregister_ioapic (acpi_handle handle, u32 gsi_base)
+{
+	/* TBD */
+	return -EINVAL;
+}
+EXPORT_SYMBOL(acpi_unregister_ioapic);
+
 #endif /* CONFIG_ACPI_BOOT */
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index b123cc08773d..f5bc298707e1 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -407,6 +407,9 @@ int acpi_map_lsapic(acpi_handle handle, int *pcpu);
 int acpi_unmap_lsapic(int cpu);
 #endif /* CONFIG_ACPI_HOTPLUG_CPU */
 
+int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base);
+int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base);
+
 extern int acpi_mp_config;
 
 extern u32 pci_mmcfg_base_addr;
-- 
cgit v1.2.3-59-g8ed1b


From a0d399a808916d22c1c222c6b5ca4e8edd6d91a9 Mon Sep 17 00:00:00 2001
From: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Date: Thu, 28 Apr 2005 00:25:59 -0700
Subject: [PATCH] ACPI based I/O APIC hot-plug: acpiphp support

This patch adds PCI based I/O xAPIC hot-add support to the ACPIPHP
driver. When a PCI root bridge is hot-added, all PCI based I/O xAPICs
under the root bridge are hot-added by this patch. Hot-remove support
is TBD.

Signed-off-by: Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/pci/hotplug/acpiphp_glue.c | 127 +++++++++++++++++++++++++++++++++++++
 include/linux/pci_ids.h            |   2 +
 2 files changed, 129 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index b4a921236252..424e7de181ae 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -552,6 +552,132 @@ static void remove_bridge(acpi_handle handle)
 	}
 }
 
+static struct pci_dev * get_apic_pci_info(acpi_handle handle)
+{
+	struct acpi_pci_id id;
+	struct pci_bus *bus;
+	struct pci_dev *dev;
+
+	if (ACPI_FAILURE(acpi_get_pci_id(handle, &id)))
+		return NULL;
+
+	bus = pci_find_bus(id.segment, id.bus);
+	if (!bus)
+		return NULL;
+
+	dev = pci_get_slot(bus, PCI_DEVFN(id.device, id.function));
+	if (!dev)
+		return NULL;
+
+	if ((dev->class != PCI_CLASS_SYSTEM_PIC_IOAPIC) &&
+	    (dev->class != PCI_CLASS_SYSTEM_PIC_IOXAPIC))
+	{
+		pci_dev_put(dev);
+		return NULL;
+	}
+
+	return dev;
+}
+
+static int get_gsi_base(acpi_handle handle, u32 *gsi_base)
+{
+	acpi_status status;
+	int result = -1;
+	unsigned long gsb;
+	struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
+	union acpi_object *obj;
+	void *table;
+
+	status = acpi_evaluate_integer(handle, "_GSB", NULL, &gsb);
+	if (ACPI_SUCCESS(status)) {
+		*gsi_base = (u32)gsb;
+		return 0;
+	}
+
+	status = acpi_evaluate_object(handle, "_MAT", NULL, &buffer);
+	if (ACPI_FAILURE(status) || !buffer.length || !buffer.pointer)
+		return -1;
+
+	obj = buffer.pointer;
+	if (obj->type != ACPI_TYPE_BUFFER)
+		goto out;
+
+	table = obj->buffer.pointer;
+	switch (((acpi_table_entry_header *)table)->type) {
+	case ACPI_MADT_IOSAPIC:
+		*gsi_base = ((struct acpi_table_iosapic *)table)->global_irq_base;
+		result = 0;
+		break;
+	case ACPI_MADT_IOAPIC:
+		*gsi_base = ((struct acpi_table_ioapic *)table)->global_irq_base;
+		result = 0;
+		break;
+	default:
+		break;
+	}
+ out:
+	acpi_os_free(buffer.pointer);
+	return result;
+}
+
+static acpi_status
+ioapic_add(acpi_handle handle, u32 lvl, void *context, void **rv)
+{
+	acpi_status status;
+	unsigned long sta;
+	acpi_handle tmp;
+	struct pci_dev *pdev;
+	u32 gsi_base;
+	u64 phys_addr;
+
+	/* Evaluate _STA if present */
+	status = acpi_evaluate_integer(handle, "_STA", NULL, &sta);
+	if (ACPI_SUCCESS(status) && sta != ACPI_STA_ALL)
+		return AE_CTRL_DEPTH;
+
+	/* Scan only PCI bus scope */
+	status = acpi_get_handle(handle, "_HID", &tmp);
+	if (ACPI_SUCCESS(status))
+		return AE_CTRL_DEPTH;
+
+	if (get_gsi_base(handle, &gsi_base))
+		return AE_OK;
+
+	pdev = get_apic_pci_info(handle);
+	if (!pdev)
+		return AE_OK;
+
+	if (pci_enable_device(pdev)) {
+		pci_dev_put(pdev);
+		return AE_OK;
+	}
+
+	pci_set_master(pdev);
+
+	if (pci_request_region(pdev, 0, "I/O APIC(acpiphp)")) {
+		pci_disable_device(pdev);
+		pci_dev_put(pdev);
+		return AE_OK;
+	}
+
+	phys_addr = pci_resource_start(pdev, 0);
+	if (acpi_register_ioapic(handle, phys_addr, gsi_base)) {
+		pci_release_region(pdev, 0);
+		pci_disable_device(pdev);
+		pci_dev_put(pdev);
+		return AE_OK;
+	}
+
+	return AE_OK;
+}
+
+static int acpiphp_configure_ioapics(acpi_handle handle)
+{
+	acpi_walk_namespace(ACPI_TYPE_DEVICE, handle,
+			    ACPI_UINT32_MAX, ioapic_add, NULL, NULL);
+	return 0;
+}
+
 static int power_on_slot(struct acpiphp_slot *slot)
 {
 	acpi_status status;
@@ -942,6 +1068,7 @@ static int acpiphp_configure_bridge (acpi_handle handle)
 	acpiphp_sanitize_bus(bus);
 	acpiphp_set_hpp_values(handle, bus);
 	pci_enable_bridges(bus);
+	acpiphp_configure_ioapics(handle);
 	return 0;
 }
 
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index bf608808a60c..810bbbcee404 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -62,6 +62,8 @@
 
 #define PCI_BASE_CLASS_SYSTEM		0x08
 #define PCI_CLASS_SYSTEM_PIC		0x0800
+#define PCI_CLASS_SYSTEM_PIC_IOAPIC	0x080010
+#define PCI_CLASS_SYSTEM_PIC_IOXAPIC	0x080020
 #define PCI_CLASS_SYSTEM_DMA		0x0801
 #define PCI_CLASS_SYSTEM_TIMER		0x0802
 #define PCI_CLASS_SYSTEM_RTC		0x0803
-- 
cgit v1.2.3-59-g8ed1b


From 2311b1f2bbd36fa5f366a7448c718b2556e0f02c Mon Sep 17 00:00:00 2001
From: Michael Ellerman <michael@ellerman.id.au>
Date: Fri, 13 May 2005 17:44:10 +1000
Subject: [PATCH] PCI: fix-pci-mmap-on-ppc-and-ppc64.patch

This is an updated version of Ben's fix-pci-mmap-on-ppc-and-ppc64.patch
which is in 2.6.12-rc4-mm1.

It fixes the patch to work on PPC iSeries, removes some debug printks
at Ben's request, and incorporates your
fix-pci-mmap-on-ppc-and-ppc64-fix.patch also.

Originally from Benjamin Herrenschmidt <benh@kernel.crashing.org>

This patch was discussed at length on linux-pci and so far, the last
iteration of it didn't raise any comment.  Its effect is a nop on
architectures that don't define the new pci_resource_to_user() callback
anyway.  It allows architectures like ppc that put weird things inside of
PCI resource structures to convert to some different value for user
visible ones.  It also fixes mmap'ing of IO space on those archs.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <michael@ellerman.id.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/ppc/kernel/pci.c   | 21 +++++++++++++++++++--
 arch/ppc64/kernel/pci.c | 22 ++++++++++++++++++++--
 drivers/pci/pci-sysfs.c | 26 +++++++++++++++++++++-----
 drivers/pci/proc.c      | 14 ++++++++++----
 include/asm-ppc/pci.h   |  6 ++++++
 include/asm-ppc64/pci.h |  7 +++++++
 include/linux/pci.h     | 14 ++++++++++++++
 7 files changed, 97 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ppc/kernel/pci.c b/arch/ppc/kernel/pci.c
index 6d7b92d72458..70cfb6ffd877 100644
--- a/arch/ppc/kernel/pci.c
+++ b/arch/ppc/kernel/pci.c
@@ -1495,7 +1495,7 @@ static struct resource *__pci_mmap_make_offset(struct pci_dev *dev,
 		*offset += hose->pci_mem_offset;
 		res_bit = IORESOURCE_MEM;
 	} else {
-		io_offset = (unsigned long)hose->io_base_virt;
+		io_offset = hose->io_base_virt - ___IO_BASE;
 		*offset += io_offset;
 		res_bit = IORESOURCE_IO;
 	}
@@ -1522,7 +1522,7 @@ static struct resource *__pci_mmap_make_offset(struct pci_dev *dev,
 
 		/* found it! construct the final physical address */
 		if (mmap_state == pci_mmap_io)
-			*offset += hose->io_base_phys - _IO_BASE;
+			*offset += hose->io_base_phys - io_offset;
 		return rp;
 	}
 
@@ -1739,6 +1739,23 @@ long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn)
 	return result;
 }
 
+void pci_resource_to_user(const struct pci_dev *dev, int bar,
+			  const struct resource *rsrc,
+			  u64 *start, u64 *end)
+{
+	struct pci_controller *hose = pci_bus_to_hose(dev->bus->number);
+	unsigned long offset = 0;
+
+	if (hose == NULL)
+		return;
+
+	if (rsrc->flags & IORESOURCE_IO)
+		offset = ___IO_BASE - hose->io_base_virt + hose->io_base_phys;
+
+	*start = rsrc->start + offset;
+	*end = rsrc->end + offset;
+}
+
 void __init
 pci_init_resource(struct resource *res, unsigned long start, unsigned long end,
 		  int flags, char *name)
diff --git a/arch/ppc64/kernel/pci.c b/arch/ppc64/kernel/pci.c
index 580676f87d23..ae6f579d3fa0 100644
--- a/arch/ppc64/kernel/pci.c
+++ b/arch/ppc64/kernel/pci.c
@@ -351,7 +351,7 @@ static struct resource *__pci_mmap_make_offset(struct pci_dev *dev,
 		*offset += hose->pci_mem_offset;
 		res_bit = IORESOURCE_MEM;
 	} else {
-		io_offset = (unsigned long)hose->io_base_virt;
+		io_offset = (unsigned long)hose->io_base_virt - pci_io_base;
 		*offset += io_offset;
 		res_bit = IORESOURCE_IO;
 	}
@@ -378,7 +378,7 @@ static struct resource *__pci_mmap_make_offset(struct pci_dev *dev,
 
 		/* found it! construct the final physical address */
 		if (mmap_state == pci_mmap_io)
-			*offset += hose->io_base_phys - io_offset;
+		       	*offset += hose->io_base_phys - io_offset;
 		return rp;
 	}
 
@@ -944,4 +944,22 @@ int pci_read_irq_line(struct pci_dev *pci_dev)
 }
 EXPORT_SYMBOL(pci_read_irq_line);
 
+void pci_resource_to_user(const struct pci_dev *dev, int bar,
+			  const struct resource *rsrc,
+			  u64 *start, u64 *end)
+{
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
+	unsigned long offset = 0;
+
+	if (hose == NULL)
+		return;
+
+	if (rsrc->flags & IORESOURCE_IO)
+		offset = pci_io_base - (unsigned long)hose->io_base_virt +
+			hose->io_base_phys;
+
+	*start = rsrc->start + offset;
+	*end = rsrc->end + offset;
+}
+
 #endif /* CONFIG_PPC_MULTIPLATFORM */
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index a15f94072a6f..cc9d65388e62 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -60,15 +60,18 @@ resource_show(struct device * dev, struct device_attribute *attr, char * buf)
 	char * str = buf;
 	int i;
 	int max = 7;
+	u64 start, end;
 
 	if (pci_dev->subordinate)
 		max = DEVICE_COUNT_RESOURCE;
 
 	for (i = 0; i < max; i++) {
-		str += sprintf(str,"0x%016lx 0x%016lx 0x%016lx\n",
-			       pci_resource_start(pci_dev,i),
-			       pci_resource_end(pci_dev,i),
-			       pci_resource_flags(pci_dev,i));
+		struct resource *res =  &pci_dev->resource[i];
+		pci_resource_to_user(pci_dev, i, res, &start, &end);
+		str += sprintf(str,"0x%016llx 0x%016llx 0x%016llx\n",
+			       (unsigned long long)start,
+			       (unsigned long long)end,
+			       (unsigned long long)res->flags);
 	}
 	return (str - buf);
 }
@@ -313,8 +316,21 @@ pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr,
 						       struct device, kobj));
 	struct resource *res = (struct resource *)attr->private;
 	enum pci_mmap_state mmap_type;
+	u64 start, end;
+	int i;
 
-	vma->vm_pgoff += res->start >> PAGE_SHIFT;
+	for (i = 0; i < PCI_ROM_RESOURCE; i++)
+		if (res == &pdev->resource[i])
+			break;
+	if (i >= PCI_ROM_RESOURCE)
+		return -ENODEV;
+
+	/* pci_mmap_page_range() expects the same kind of entry as coming
+	 * from /proc/bus/pci/ which is a "user visible" value. If this is
+	 * different from the resource itself, arch will do necessary fixup.
+	 */
+	pci_resource_to_user(pdev, i, res, &start, &end);
+	vma->vm_pgoff += start >> PAGE_SHIFT;
 	mmap_type = res->flags & IORESOURCE_MEM ? pci_mmap_mem : pci_mmap_io;
 
 	return pci_mmap_page_range(pdev, vma, mmap_type, 0);
diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c
index e68bbfb1e7c3..7988fc8df3fd 100644
--- a/drivers/pci/proc.c
+++ b/drivers/pci/proc.c
@@ -355,14 +355,20 @@ static int show_device(struct seq_file *m, void *v)
 			dev->device,
 			dev->irq);
 	/* Here should be 7 and not PCI_NUM_RESOURCES as we need to preserve compatibility */
-	for(i=0; i<7; i++)
+	for (i=0; i<7; i++) {
+		u64 start, end;
+		pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
 		seq_printf(m, LONG_FORMAT,
-			dev->resource[i].start |
+			((unsigned long)start) |
 			(dev->resource[i].flags & PCI_REGION_FLAG_MASK));
-	for(i=0; i<7; i++)
+	}
+	for (i=0; i<7; i++) {
+		u64 start, end;
+		pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
 		seq_printf(m, LONG_FORMAT,
 			dev->resource[i].start < dev->resource[i].end ?
-			dev->resource[i].end - dev->resource[i].start + 1 : 0);
+			(unsigned long)(end - start) + 1 : 0);
+	}
 	seq_putc(m, '\t');
 	if (drv)
 		seq_printf(m, "%s", drv->name);
diff --git a/include/asm-ppc/pci.h b/include/asm-ppc/pci.h
index ce5ae6d048f5..002e7b305777 100644
--- a/include/asm-ppc/pci.h
+++ b/include/asm-ppc/pci.h
@@ -103,6 +103,12 @@ extern pgprot_t	pci_phys_mem_access_prot(struct file *file,
 					 unsigned long size,
 					 pgprot_t prot);
 
+#define HAVE_ARCH_PCI_RESOURCE_TO_USER
+extern void pci_resource_to_user(const struct pci_dev *dev, int bar,
+				 const struct resource *rsrc,
+				 u64 *start, u64 *end);
+
+
 #endif	/* __KERNEL__ */
 
 #endif /* __PPC_PCI_H */
diff --git a/include/asm-ppc64/pci.h b/include/asm-ppc64/pci.h
index 6cd593f660a0..411bf5dee394 100644
--- a/include/asm-ppc64/pci.h
+++ b/include/asm-ppc64/pci.h
@@ -136,6 +136,13 @@ extern pgprot_t	pci_phys_mem_access_prot(struct file *file,
 					 unsigned long size,
 					 pgprot_t prot);
 
+#ifdef CONFIG_PPC_MULTIPLATFORM
+#define HAVE_ARCH_PCI_RESOURCE_TO_USER
+extern void pci_resource_to_user(const struct pci_dev *dev, int bar,
+				 const struct resource *rsrc,
+				 u64 *start, u64 *end);
+#endif /* CONFIG_PPC_MULTIPLATFORM */
+
 
 #endif	/* __KERNEL__ */
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 0e9844929fe3..cfa1455848f4 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1020,6 +1020,20 @@ static inline char *pci_name(struct pci_dev *pdev)
 #define pci_pretty_name(dev) ""
 #endif
 
+
+/* Some archs don't want to expose struct resource to userland as-is
+ * in sysfs and /proc
+ */
+#ifndef HAVE_ARCH_PCI_RESOURCE_TO_USER
+static inline void pci_resource_to_user(const struct pci_dev *dev, int bar,
+                const struct resource *rsrc, u64 *start, u64 *end)
+{
+	*start = rsrc->start;
+	*end = rsrc->end;
+}
+#endif /* HAVE_ARCH_PCI_RESOURCE_TO_USER */
+
+
 /*
  *  The world is not perfect and supplies us with broken PCI devices.
  *  For at least a part of these bugs we need a work-around, so both
-- 
cgit v1.2.3-59-g8ed1b


From e24c2d963a604d9eaa560c90371fa387d3eec8f1 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 2 Jun 2005 12:55:50 -0700
Subject: [PATCH] PCI: DMA bursting advice

After seeing, at best, "guesses" as to the following kind
of information in several drivers, I decided that we really
need a way for platforms to specifically give advice in this
area for what works best with their PCI controller implementation.

Basically, this new interface gives DMA bursting advice on
PCI.  There are three forms of the advice:

1) Burst as much as possible, it is not necessary to end bursts
   on some particular boundary for best performance.

2) Burst on some byte count multiple.  A DMA burst to some multiple of
   number of bytes may be done, but it is important to end the burst
   on an exact multiple for best performance.

   The best example of this I am aware of is the PPC64 PCI
   controllers, where if you end a burst mid-cacheline then the
   chip has to refetch the data and the IOMMU translations,
   which hurts performance a lot.

3) Burst on a single byte count multiple.  Bursts shall end
   exactly on the next multiple boundary for best performance.

   Sparc64 and Alpha's PCI controllers operate this way.  They
   disconnect any device which tries to burst across a cacheline
   boundary.

   Actually, newer sparc64 PCI controllers do not have this behavior.
   That is why the "pdev" is passed into the interface, so I can
   add code later to check which PCI controller the system is using
   and give advice accordingly.

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/asm-alpha/pci.h   | 17 +++++++++++++++++
 include/asm-arm/pci.h     |  8 ++++++++
 include/asm-frv/pci.h     |  8 ++++++++
 include/asm-i386/pci.h    |  8 ++++++++
 include/asm-ia64/pci.h    | 17 +++++++++++++++++
 include/asm-mips/pci.h    |  8 ++++++++
 include/asm-parisc/pci.h  | 17 +++++++++++++++++
 include/asm-ppc/pci.h     |  8 ++++++++
 include/asm-ppc64/pci.h   | 17 +++++++++++++++++
 include/asm-sh/pci.h      |  8 ++++++++
 include/asm-sh64/pci.h    |  8 ++++++++
 include/asm-sparc/pci.h   |  8 ++++++++
 include/asm-sparc64/pci.h | 17 +++++++++++++++++
 include/asm-v850/pci.h    |  8 ++++++++
 include/asm-x86_64/pci.h  |  8 ++++++++
 include/linux/pci.h       |  9 +++++++++
 16 files changed, 174 insertions(+)

(limited to 'include/linux')

diff --git a/include/asm-alpha/pci.h b/include/asm-alpha/pci.h
index 0c7b57bc043a..6c71dc1ad4ca 100644
--- a/include/asm-alpha/pci.h
+++ b/include/asm-alpha/pci.h
@@ -223,6 +223,23 @@ pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr,
 	/* Nothing to do. */
 }
 
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+					enum pci_dma_burst_strategy *strat,
+					unsigned long *strategy_parameter)
+{
+	unsigned long cacheline_size;
+	u8 byte;
+
+	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
+	if (byte == 0)
+		cacheline_size = 1024;
+	else
+		cacheline_size = (int) byte * 4;
+
+	*strat = PCI_DMA_BURST_BOUNDARY;
+	*strategy_parameter = cacheline_size;
+}
+
 /* TODO: integrate with include/asm-generic/pci.h ? */
 static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
 {
diff --git a/include/asm-arm/pci.h b/include/asm-arm/pci.h
index 40ffaefbeb1a..bc2ec425aca5 100644
--- a/include/asm-arm/pci.h
+++ b/include/asm-arm/pci.h
@@ -42,6 +42,14 @@ static inline void pcibios_penalize_isa_irq(int irq)
 #define pci_unmap_len(PTR, LEN_NAME)		((PTR)->LEN_NAME)
 #define pci_unmap_len_set(PTR, LEN_NAME, VAL)	(((PTR)->LEN_NAME) = (VAL))
 
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+					enum pci_dma_burst_strategy *strat,
+					unsigned long *strategy_parameter)
+{
+	*strat = PCI_DMA_BURST_INFINITY;
+	*strategy_parameter = ~0UL;
+}
+
 #define HAVE_PCI_MMAP
 extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
                                enum pci_mmap_state mmap_state, int write_combine);
diff --git a/include/asm-frv/pci.h b/include/asm-frv/pci.h
index a6a469231f62..13427240664f 100644
--- a/include/asm-frv/pci.h
+++ b/include/asm-frv/pci.h
@@ -57,6 +57,14 @@ extern void pci_free_consistent(struct pci_dev *hwdev, size_t size,
  */
 #define PCI_DMA_BUS_IS_PHYS	(1)
 
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+					enum pci_dma_burst_strategy *strat,
+					unsigned long *strategy_parameter)
+{
+	*strat = PCI_DMA_BURST_INFINITY;
+	*strategy_parameter = ~0UL;
+}
+
 /*
  *	These are pretty much arbitary with the CoMEM implementation.
  *	We have the whole address space to ourselves.
diff --git a/include/asm-i386/pci.h b/include/asm-i386/pci.h
index fb749b85a739..bf07b3af85e3 100644
--- a/include/asm-i386/pci.h
+++ b/include/asm-i386/pci.h
@@ -99,6 +99,14 @@ static inline void pcibios_add_platform_entries(struct pci_dev *dev)
 {
 }
 
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+					enum pci_dma_burst_strategy *strat,
+					unsigned long *strategy_parameter)
+{
+	*strat = PCI_DMA_BURST_INFINITY;
+	*strategy_parameter = ~0UL;
+}
+
 #endif /* __KERNEL__ */
 
 /* implement the pci_ DMA API in terms of the generic device dma_ one */
diff --git a/include/asm-ia64/pci.h b/include/asm-ia64/pci.h
index a8314ee4e7d2..c9f1ab4e477d 100644
--- a/include/asm-ia64/pci.h
+++ b/include/asm-ia64/pci.h
@@ -82,6 +82,23 @@ extern int pcibios_prep_mwi (struct pci_dev *);
 #define sg_dma_len(sg)		((sg)->dma_length)
 #define sg_dma_address(sg)	((sg)->dma_address)
 
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+					enum pci_dma_burst_strategy *strat,
+					unsigned long *strategy_parameter)
+{
+	unsigned long cacheline_size;
+	u8 byte;
+
+	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
+	if (byte == 0)
+		cacheline_size = 1024;
+	else
+		cacheline_size = (int) byte * 4;
+
+	*strat = PCI_DMA_BURST_MULTIPLE;
+	*strategy_parameter = cacheline_size;
+}
+
 #define HAVE_PCI_MMAP
 extern int pci_mmap_page_range (struct pci_dev *dev, struct vm_area_struct *vma,
 				enum pci_mmap_state mmap_state, int write_combine);
diff --git a/include/asm-mips/pci.h b/include/asm-mips/pci.h
index c9c576b48556..20b93bfa4565 100644
--- a/include/asm-mips/pci.h
+++ b/include/asm-mips/pci.h
@@ -130,6 +130,14 @@ extern void pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev,
 extern void pci_dac_dma_sync_single_for_device(struct pci_dev *pdev,
 	dma64_addr_t dma_addr, size_t len, int direction);
 
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+					enum pci_dma_burst_strategy *strat,
+					unsigned long *strategy_parameter)
+{
+	*strat = PCI_DMA_BURST_INFINITY;
+	*strategy_parameter = ~0UL;
+}
+
 extern void pcibios_resource_to_bus(struct pci_dev *dev,
 	struct pci_bus_region *region, struct resource *res);
 
diff --git a/include/asm-parisc/pci.h b/include/asm-parisc/pci.h
index 0763c2982fb0..f9f5bf90111d 100644
--- a/include/asm-parisc/pci.h
+++ b/include/asm-parisc/pci.h
@@ -230,6 +230,23 @@ extern inline void pcibios_register_hba(struct pci_hba_data *x)
 /* export the pci_ DMA API in terms of the dma_ one */
 #include <asm-generic/pci-dma-compat.h>
 
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+					enum pci_dma_burst_strategy *strat,
+					unsigned long *strategy_parameter)
+{
+	unsigned long cacheline_size;
+	u8 byte;
+
+	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
+	if (byte == 0)
+		cacheline_size = 1024;
+	else
+		cacheline_size = (int) byte * 4;
+
+	*strat = PCI_DMA_BURST_MULTIPLE;
+	*strategy_parameter = cacheline_size;
+}
+
 extern void
 pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
 			 struct resource *res);
diff --git a/include/asm-ppc/pci.h b/include/asm-ppc/pci.h
index 002e7b305777..669e9de7a525 100644
--- a/include/asm-ppc/pci.h
+++ b/include/asm-ppc/pci.h
@@ -69,6 +69,14 @@ extern unsigned long pci_bus_to_phys(unsigned int ba, int busnr);
 #define pci_unmap_len(PTR, LEN_NAME)		(0)
 #define pci_unmap_len_set(PTR, LEN_NAME, VAL)	do { } while (0)
 
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+					enum pci_dma_burst_strategy *strat,
+					unsigned long *strategy_parameter)
+{
+	*strat = PCI_DMA_BURST_INFINITY;
+	*strategy_parameter = ~0UL;
+}
+
 /*
  * At present there are very few 32-bit PPC machines that can have
  * memory above the 4GB point, and we don't support that.
diff --git a/include/asm-ppc64/pci.h b/include/asm-ppc64/pci.h
index 411bf5dee394..20beb10c0902 100644
--- a/include/asm-ppc64/pci.h
+++ b/include/asm-ppc64/pci.h
@@ -78,6 +78,23 @@ static inline int pci_dac_dma_supported(struct pci_dev *hwdev,u64 mask)
 	return 0;
 }
 
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+					enum pci_dma_burst_strategy *strat,
+					unsigned long *strategy_parameter)
+{
+	unsigned long cacheline_size;
+	u8 byte;
+
+	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
+	if (byte == 0)
+		cacheline_size = 1024;
+	else
+		cacheline_size = (int) byte * 4;
+
+	*strat = PCI_DMA_BURST_MULTIPLE;
+	*strategy_parameter = cacheline_size;
+}
+
 extern int pci_domain_nr(struct pci_bus *bus);
 
 /* Decide whether to display the domain number in /proc */
diff --git a/include/asm-sh/pci.h b/include/asm-sh/pci.h
index 9c3b63d0105e..7237bc6a7280 100644
--- a/include/asm-sh/pci.h
+++ b/include/asm-sh/pci.h
@@ -96,6 +96,14 @@ static inline void pcibios_penalize_isa_irq(int irq)
 #define sg_dma_address(sg)	(virt_to_bus((sg)->dma_address))
 #define sg_dma_len(sg)		((sg)->length)
 
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+					enum pci_dma_burst_strategy *strat,
+					unsigned long *strategy_parameter)
+{
+	*strat = PCI_DMA_BURST_INFINITY;
+	*strategy_parameter = ~0UL;
+}
+
 /* Board-specific fixup routines. */
 extern void pcibios_fixup(void);
 extern void pcibios_fixup_irqs(void);
diff --git a/include/asm-sh64/pci.h b/include/asm-sh64/pci.h
index 8cc14e139750..0ac15ab01cce 100644
--- a/include/asm-sh64/pci.h
+++ b/include/asm-sh64/pci.h
@@ -86,6 +86,14 @@ static inline void pcibios_penalize_isa_irq(int irq)
 #define sg_dma_address(sg)	((sg)->dma_address)
 #define sg_dma_len(sg)		((sg)->length)
 
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+					enum pci_dma_burst_strategy *strat,
+					unsigned long *strategy_parameter)
+{
+	*strat = PCI_DMA_BURST_INFINITY;
+	*strategy_parameter = ~0UL;
+}
+
 /* Board-specific fixup routines. */
 extern void pcibios_fixup(void);
 extern void pcibios_fixup_irqs(void);
diff --git a/include/asm-sparc/pci.h b/include/asm-sparc/pci.h
index d200a25a7373..2fd65db95e92 100644
--- a/include/asm-sparc/pci.h
+++ b/include/asm-sparc/pci.h
@@ -144,6 +144,14 @@ extern inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask)
 
 #define pci_dac_dma_supported(dev, mask)	(0)
 
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+					enum pci_dma_burst_strategy *strat,
+					unsigned long *strategy_parameter)
+{
+	*strat = PCI_DMA_BURST_INFINITY;
+	*strategy_parameter = ~0UL;
+}
+
 static inline void pcibios_add_platform_entries(struct pci_dev *dev)
 {
 }
diff --git a/include/asm-sparc64/pci.h b/include/asm-sparc64/pci.h
index 2a0c85cd1c11..402667300d01 100644
--- a/include/asm-sparc64/pci.h
+++ b/include/asm-sparc64/pci.h
@@ -220,6 +220,23 @@ static inline int pci_dma_mapping_error(dma_addr_t dma_addr)
 	return (dma_addr == PCI_DMA_ERROR_CODE);
 }
 
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+					enum pci_dma_burst_strategy *strat,
+					unsigned long *strategy_parameter)
+{
+	unsigned long cacheline_size;
+	u8 byte;
+
+	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
+	if (byte == 0)
+		cacheline_size = 1024;
+	else
+		cacheline_size = (int) byte * 4;
+
+	*strat = PCI_DMA_BURST_BOUNDARY;
+	*strategy_parameter = cacheline_size;
+}
+
 /* Return the index of the PCI controller for device PDEV. */
 
 extern int pci_domain_nr(struct pci_bus *bus);
diff --git a/include/asm-v850/pci.h b/include/asm-v850/pci.h
index e41941447b49..d26eb8d67311 100644
--- a/include/asm-v850/pci.h
+++ b/include/asm-v850/pci.h
@@ -81,6 +81,14 @@ extern void
 pci_free_consistent (struct pci_dev *pdev, size_t size, void *cpu_addr,
 		     dma_addr_t dma_addr);
 
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+					enum pci_dma_burst_strategy *strat,
+					unsigned long *strategy_parameter)
+{
+	*strat = PCI_DMA_BURST_INFINITY;
+	*strategy_parameter = ~0UL;
+}
+
 static inline void pcibios_add_platform_entries(struct pci_dev *dev)
 {
 }
diff --git a/include/asm-x86_64/pci.h b/include/asm-x86_64/pci.h
index 8712520ca47f..8461d6af102e 100644
--- a/include/asm-x86_64/pci.h
+++ b/include/asm-x86_64/pci.h
@@ -123,6 +123,14 @@ pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr,
 	flush_write_buffers();
 }
 
+static inline void pci_dma_burst_advice(struct pci_dev *pdev,
+					enum pci_dma_burst_strategy *strat,
+					unsigned long *strategy_parameter)
+{
+	*strat = PCI_DMA_BURST_INFINITY;
+	*strategy_parameter = ~0UL;
+}
+
 #define HAVE_PCI_MMAP
 extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 			       enum pci_mmap_state mmap_state, int write_combine);
diff --git a/include/linux/pci.h b/include/linux/pci.h
index cfa1455848f4..9ce4f1be093f 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -874,6 +874,15 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev * dev, int max, int pass
 #define	pci_pool_alloc(pool, flags, handle) dma_pool_alloc(pool, flags, handle)
 #define	pci_pool_free(pool, vaddr, addr) dma_pool_free(pool, vaddr, addr)
 
+enum pci_dma_burst_strategy {
+	PCI_DMA_BURST_INFINITY,	/* make bursts as large as possible,
+				   strategy_parameter is N/A */
+	PCI_DMA_BURST_BOUNDARY, /* disconnect at every strategy_parameter
+				   byte boundaries */
+	PCI_DMA_BURST_MULTIPLE, /* disconnect at some multiple of
+				   strategy_parameter byte boundaries */
+};
+
 #if defined(CONFIG_ISA) || defined(CONFIG_EISA)
 extern struct pci_dev *isa_bridge;
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From bb4a61b6eaee01707f24deeefc5d7136f25f75c5 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 6 Jun 2005 23:07:46 -0700
Subject: [PATCH] PCI: fix up errors after dma bursting patch and CONFIG_PCI=n

With CONFIG_PCI=n:

In file included from include/linux/pci.h:917,
                 from lib/iomap.c:6:
include/asm/pci.h:104: warning: `enum pci_dma_burst_strategy' declared inside parameter list
include/asm/pci.h:104: warning: its scope is only this definition or declaration, which is probably not what you want.
include/asm/pci.h: In function `pci_dma_burst_advice':
include/asm/pci.h:106: dereferencing pointer to incomplete type
include/asm/pci.h:106: `PCI_DMA_BURST_INFINITY' undeclared (first use in this function)
include/asm/pci.h:106: (Each undeclared identifier is reported only once
include/asm/pci.h:106: for each function it appears in.)
make[1]: *** [lib/iomap.o] Error 1

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/asm-alpha/pci.h   | 2 ++
 include/asm-arm/pci.h     | 2 ++
 include/asm-frv/pci.h     | 2 ++
 include/asm-i386/pci.h    | 2 ++
 include/asm-ia64/pci.h    | 2 ++
 include/asm-mips/pci.h    | 2 ++
 include/asm-parisc/pci.h  | 2 ++
 include/asm-ppc/pci.h     | 2 ++
 include/asm-ppc64/pci.h   | 2 ++
 include/asm-sh/pci.h      | 2 ++
 include/asm-sh64/pci.h    | 2 ++
 include/asm-sparc/pci.h   | 2 ++
 include/asm-sparc64/pci.h | 2 ++
 include/asm-v850/pci.h    | 2 ++
 include/asm-x86_64/pci.h  | 2 ++
 include/linux/pci.h       | 2 ++
 16 files changed, 32 insertions(+)

(limited to 'include/linux')

diff --git a/include/asm-alpha/pci.h b/include/asm-alpha/pci.h
index 6c71dc1ad4ca..b7806aa3785c 100644
--- a/include/asm-alpha/pci.h
+++ b/include/asm-alpha/pci.h
@@ -223,6 +223,7 @@ pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr,
 	/* Nothing to do. */
 }
 
+#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -239,6 +240,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_BOUNDARY;
 	*strategy_parameter = cacheline_size;
 }
+#endif
 
 /* TODO: integrate with include/asm-generic/pci.h ? */
 static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
diff --git a/include/asm-arm/pci.h b/include/asm-arm/pci.h
index bc2ec425aca5..e300646fe650 100644
--- a/include/asm-arm/pci.h
+++ b/include/asm-arm/pci.h
@@ -42,6 +42,7 @@ static inline void pcibios_penalize_isa_irq(int irq)
 #define pci_unmap_len(PTR, LEN_NAME)		((PTR)->LEN_NAME)
 #define pci_unmap_len_set(PTR, LEN_NAME, VAL)	(((PTR)->LEN_NAME) = (VAL))
 
+#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -49,6 +50,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_INFINITY;
 	*strategy_parameter = ~0UL;
 }
+#endif
 
 #define HAVE_PCI_MMAP
 extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
diff --git a/include/asm-frv/pci.h b/include/asm-frv/pci.h
index 13427240664f..b4efe5e3591a 100644
--- a/include/asm-frv/pci.h
+++ b/include/asm-frv/pci.h
@@ -57,6 +57,7 @@ extern void pci_free_consistent(struct pci_dev *hwdev, size_t size,
  */
 #define PCI_DMA_BUS_IS_PHYS	(1)
 
+#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -64,6 +65,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_INFINITY;
 	*strategy_parameter = ~0UL;
 }
+#endif
 
 /*
  *	These are pretty much arbitary with the CoMEM implementation.
diff --git a/include/asm-i386/pci.h b/include/asm-i386/pci.h
index bf07b3af85e3..3561899eb826 100644
--- a/include/asm-i386/pci.h
+++ b/include/asm-i386/pci.h
@@ -99,6 +99,7 @@ static inline void pcibios_add_platform_entries(struct pci_dev *dev)
 {
 }
 
+#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -106,6 +107,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_INFINITY;
 	*strategy_parameter = ~0UL;
 }
+#endif
 
 #endif /* __KERNEL__ */
 
diff --git a/include/asm-ia64/pci.h b/include/asm-ia64/pci.h
index c9f1ab4e477d..0c4c5d801d3f 100644
--- a/include/asm-ia64/pci.h
+++ b/include/asm-ia64/pci.h
@@ -82,6 +82,7 @@ extern int pcibios_prep_mwi (struct pci_dev *);
 #define sg_dma_len(sg)		((sg)->dma_length)
 #define sg_dma_address(sg)	((sg)->dma_address)
 
+#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -98,6 +99,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_MULTIPLE;
 	*strategy_parameter = cacheline_size;
 }
+#endif
 
 #define HAVE_PCI_MMAP
 extern int pci_mmap_page_range (struct pci_dev *dev, struct vm_area_struct *vma,
diff --git a/include/asm-mips/pci.h b/include/asm-mips/pci.h
index 20b93bfa4565..2d323b6e147d 100644
--- a/include/asm-mips/pci.h
+++ b/include/asm-mips/pci.h
@@ -130,6 +130,7 @@ extern void pci_dac_dma_sync_single_for_cpu(struct pci_dev *pdev,
 extern void pci_dac_dma_sync_single_for_device(struct pci_dev *pdev,
 	dma64_addr_t dma_addr, size_t len, int direction);
 
+#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -137,6 +138,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_INFINITY;
 	*strategy_parameter = ~0UL;
 }
+#endif
 
 extern void pcibios_resource_to_bus(struct pci_dev *dev,
 	struct pci_bus_region *region, struct resource *res);
diff --git a/include/asm-parisc/pci.h b/include/asm-parisc/pci.h
index f9f5bf90111d..ee741c150176 100644
--- a/include/asm-parisc/pci.h
+++ b/include/asm-parisc/pci.h
@@ -230,6 +230,7 @@ extern inline void pcibios_register_hba(struct pci_hba_data *x)
 /* export the pci_ DMA API in terms of the dma_ one */
 #include <asm-generic/pci-dma-compat.h>
 
+#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -246,6 +247,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_MULTIPLE;
 	*strategy_parameter = cacheline_size;
 }
+#endif
 
 extern void
 pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
diff --git a/include/asm-ppc/pci.h b/include/asm-ppc/pci.h
index 669e9de7a525..db0a2a0ec74d 100644
--- a/include/asm-ppc/pci.h
+++ b/include/asm-ppc/pci.h
@@ -69,6 +69,7 @@ extern unsigned long pci_bus_to_phys(unsigned int ba, int busnr);
 #define pci_unmap_len(PTR, LEN_NAME)		(0)
 #define pci_unmap_len_set(PTR, LEN_NAME, VAL)	do { } while (0)
 
+#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -76,6 +77,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_INFINITY;
 	*strategy_parameter = ~0UL;
 }
+#endif
 
 /*
  * At present there are very few 32-bit PPC machines that can have
diff --git a/include/asm-ppc64/pci.h b/include/asm-ppc64/pci.h
index 20beb10c0902..d12dfce21e20 100644
--- a/include/asm-ppc64/pci.h
+++ b/include/asm-ppc64/pci.h
@@ -78,6 +78,7 @@ static inline int pci_dac_dma_supported(struct pci_dev *hwdev,u64 mask)
 	return 0;
 }
 
+#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -94,6 +95,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_MULTIPLE;
 	*strategy_parameter = cacheline_size;
 }
+#endif
 
 extern int pci_domain_nr(struct pci_bus *bus);
 
diff --git a/include/asm-sh/pci.h b/include/asm-sh/pci.h
index 7237bc6a7280..26044889c770 100644
--- a/include/asm-sh/pci.h
+++ b/include/asm-sh/pci.h
@@ -96,6 +96,7 @@ static inline void pcibios_penalize_isa_irq(int irq)
 #define sg_dma_address(sg)	(virt_to_bus((sg)->dma_address))
 #define sg_dma_len(sg)		((sg)->length)
 
+#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -103,6 +104,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_INFINITY;
 	*strategy_parameter = ~0UL;
 }
+#endif
 
 /* Board-specific fixup routines. */
 extern void pcibios_fixup(void);
diff --git a/include/asm-sh64/pci.h b/include/asm-sh64/pci.h
index 0ac15ab01cce..c68870e02d91 100644
--- a/include/asm-sh64/pci.h
+++ b/include/asm-sh64/pci.h
@@ -86,6 +86,7 @@ static inline void pcibios_penalize_isa_irq(int irq)
 #define sg_dma_address(sg)	((sg)->dma_address)
 #define sg_dma_len(sg)		((sg)->length)
 
+#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -93,6 +94,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_INFINITY;
 	*strategy_parameter = ~0UL;
 }
+#endif
 
 /* Board-specific fixup routines. */
 extern void pcibios_fixup(void);
diff --git a/include/asm-sparc/pci.h b/include/asm-sparc/pci.h
index 2fd65db95e92..44bb38758c96 100644
--- a/include/asm-sparc/pci.h
+++ b/include/asm-sparc/pci.h
@@ -144,6 +144,7 @@ extern inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask)
 
 #define pci_dac_dma_supported(dev, mask)	(0)
 
+#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -151,6 +152,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_INFINITY;
 	*strategy_parameter = ~0UL;
 }
+#endif
 
 static inline void pcibios_add_platform_entries(struct pci_dev *dev)
 {
diff --git a/include/asm-sparc64/pci.h b/include/asm-sparc64/pci.h
index 402667300d01..84e41c1ef3f8 100644
--- a/include/asm-sparc64/pci.h
+++ b/include/asm-sparc64/pci.h
@@ -220,6 +220,7 @@ static inline int pci_dma_mapping_error(dma_addr_t dma_addr)
 	return (dma_addr == PCI_DMA_ERROR_CODE);
 }
 
+#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -236,6 +237,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_BOUNDARY;
 	*strategy_parameter = cacheline_size;
 }
+#endif
 
 /* Return the index of the PCI controller for device PDEV. */
 
diff --git a/include/asm-v850/pci.h b/include/asm-v850/pci.h
index d26eb8d67311..8e79be0fe99d 100644
--- a/include/asm-v850/pci.h
+++ b/include/asm-v850/pci.h
@@ -81,6 +81,7 @@ extern void
 pci_free_consistent (struct pci_dev *pdev, size_t size, void *cpu_addr,
 		     dma_addr_t dma_addr);
 
+#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -88,6 +89,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_INFINITY;
 	*strategy_parameter = ~0UL;
 }
+#endif
 
 static inline void pcibios_add_platform_entries(struct pci_dev *dev)
 {
diff --git a/include/asm-x86_64/pci.h b/include/asm-x86_64/pci.h
index 8461d6af102e..c1961db88fac 100644
--- a/include/asm-x86_64/pci.h
+++ b/include/asm-x86_64/pci.h
@@ -123,6 +123,7 @@ pci_dac_dma_sync_single_for_device(struct pci_dev *pdev, dma64_addr_t dma_addr,
 	flush_write_buffers();
 }
 
+#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -130,6 +131,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_INFINITY;
 	*strategy_parameter = ~0UL;
 }
+#endif
 
 #define HAVE_PCI_MMAP
 extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 9ce4f1be093f..66798b46f308 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -985,6 +985,8 @@ static inline int pci_proc_domain(struct pci_bus *bus)
 }
 #endif
 
+#define pci_dma_burst_advice(pdev, strat, strategy_parameter) do { } while (0)
+
 #endif /* !CONFIG_PCI */
 
 /* these helpers provide future and backwards compatibility
-- 
cgit v1.2.3-59-g8ed1b


From 545493917dc90298e1c38f018ad893f5518928e7 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 23 Jun 2005 17:35:56 -0700
Subject: [PATCH] PCI: add proper MCFG table parsing to ACPI core.

This patch is the first step in properly handling the MCFG PCI table.
It defines the structures properly, and saves off the table so that the
pci mmconfig code can access it.  It moves the parsing of the table a
little later in the boot process, but still before the information is
needed.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/i386/kernel/acpi/boot.c | 41 +++++++++++++++++++++++++++++++++--------
 arch/i386/pci/mmconfig.c     | 12 +++++++-----
 arch/x86_64/pci/mmconfig.c   | 16 +++++++++-------
 include/linux/acpi.h         | 16 +++++++++++++---
 4 files changed, 62 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index f5360d88bcf4..b7808a89d945 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -159,9 +159,15 @@ char *__acpi_map_table(unsigned long phys, unsigned long size)
 #endif
 
 #ifdef CONFIG_PCI_MMCONFIG
-static int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size)
+/* The physical address of the MMCONFIG aperture.  Set from ACPI tables. */
+struct acpi_table_mcfg_config *pci_mmcfg_config;
+int pci_mmcfg_config_num;
+
+int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size)
 {
 	struct acpi_table_mcfg *mcfg;
+	unsigned long i;
+	int config_size;
 
 	if (!phys_addr || !size)
 		return -EINVAL;
@@ -172,18 +178,38 @@ static int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size)
 		return -ENODEV;
 	}
 
-	if (mcfg->base_reserved) {
-		printk(KERN_ERR PREFIX "MMCONFIG not in low 4GB of memory\n");
+	/* how many config structures do we have */
+	pci_mmcfg_config_num = 0;
+	i = size - sizeof(struct acpi_table_mcfg);
+	while (i >= sizeof(struct acpi_table_mcfg_config)) {
+		++pci_mmcfg_config_num;
+		i -= sizeof(struct acpi_table_mcfg_config);
+	};
+	if (pci_mmcfg_config_num == 0) {
+		printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
 		return -ENODEV;
 	}
 
-	pci_mmcfg_base_addr = mcfg->base_address;
+	config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config);
+	pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL);
+	if (!pci_mmcfg_config) {
+		printk(KERN_WARNING PREFIX
+		       "No memory for MCFG config tables\n");
+		return -ENOMEM;
+	}
+
+	memcpy(pci_mmcfg_config, &mcfg->config, config_size);
+	for (i = 0; i < pci_mmcfg_config_num; ++i) {
+		if (mcfg->config[i].base_reserved) {
+			printk(KERN_ERR PREFIX
+			       "MMCONFIG not in low 4GB of memory\n");
+			return -ENODEV;
+		}
+	}
 
 	return 0;
 }
-#else
-#define	acpi_parse_mcfg NULL
-#endif /* !CONFIG_PCI_MMCONFIG */
+#endif /* CONFIG_PCI_MMCONFIG */
 
 #ifdef CONFIG_X86_LOCAL_APIC
 static int __init
@@ -1139,7 +1165,6 @@ int __init acpi_boot_init(void)
 	acpi_process_madt();
 
 	acpi_table_parse(ACPI_HPET, acpi_parse_hpet);
-	acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg);
 
 	return 0;
 }
diff --git a/arch/i386/pci/mmconfig.c b/arch/i386/pci/mmconfig.c
index 021a50aa51f4..5fbaa9132258 100644
--- a/arch/i386/pci/mmconfig.c
+++ b/arch/i386/pci/mmconfig.c
@@ -11,11 +11,9 @@
 
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/acpi.h>
 #include "pci.h"
 
-/* The physical address of the MMCONFIG aperture.  Set from ACPI tables. */
-u32 pci_mmcfg_base_addr;
-
 #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
 
 /* The base address of the last MMCONFIG device accessed */
@@ -27,7 +25,7 @@ static u32 mmcfg_last_accessed_device;
 
 static inline void pci_exp_set_dev_base(int bus, int devfn)
 {
-	u32 dev_base = pci_mmcfg_base_addr | (bus << 20) | (devfn << 12);
+	u32 dev_base = pci_mmcfg_config[0].base_address | (bus << 20) | (devfn << 12);
 	if (dev_base != mmcfg_last_accessed_device) {
 		mmcfg_last_accessed_device = dev_base;
 		set_fixmap_nocache(FIX_PCIE_MCFG, dev_base);
@@ -101,7 +99,11 @@ static int __init pci_mmcfg_init(void)
 {
 	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
 		goto out;
-	if (!pci_mmcfg_base_addr)
+
+	acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg);
+	if ((pci_mmcfg_config_num == 0) ||
+	    (pci_mmcfg_config == NULL) ||
+	    (pci_mmcfg_config[0].base_address == 0))
 		goto out;
 
 	/* Kludge for now. Don't use mmconfig on AMD systems because
diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c
index b693c232fd07..09cfcc1234b9 100644
--- a/arch/x86_64/pci/mmconfig.c
+++ b/arch/x86_64/pci/mmconfig.c
@@ -7,15 +7,13 @@
 
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/acpi.h>
 #include "pci.h"
 
 #define MMCONFIG_APER_SIZE (256*1024*1024)
 
-/* The physical address of the MMCONFIG aperture.  Set from ACPI tables. */
-u32 pci_mmcfg_base_addr;
-
 /* Static virtual mapping of the MMCONFIG aperture */
-char *pci_mmcfg_virt;
+static char *pci_mmcfg_virt;
 
 static inline char *pci_dev_base(unsigned int bus, unsigned int devfn)
 {
@@ -77,7 +75,11 @@ static int __init pci_mmcfg_init(void)
 {
 	if ((pci_probe & PCI_PROBE_MMCONF) == 0)
 		return 0;
-	if (!pci_mmcfg_base_addr)
+
+	acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg);
+	if ((pci_mmcfg_config_num == 0) ||
+	    (pci_mmcfg_config == NULL) ||
+	    (pci_mmcfg_config[0].base_address == 0))
 		return 0;
 
 	/* Kludge for now. Don't use mmconfig on AMD systems because
@@ -88,13 +90,13 @@ static int __init pci_mmcfg_init(void)
 		return 0; 
 
 	/* RED-PEN i386 doesn't do _nocache right now */
-	pci_mmcfg_virt = ioremap_nocache(pci_mmcfg_base_addr, MMCONFIG_APER_SIZE);
+	pci_mmcfg_virt = ioremap_nocache(pci_mmcfg_config[0].base_address, MMCONFIG_APER_SIZE);
 	if (!pci_mmcfg_virt) { 
 		printk("PCI: Cannot map mmconfig aperture\n");
 		return 0;
 	}	
 
-	printk(KERN_INFO "PCI: Using MMCONFIG at %x\n", pci_mmcfg_base_addr);
+	printk(KERN_INFO "PCI: Using MMCONFIG at %x\n", pci_mmcfg_config[0].base_address);
 	raw_pci_ops = &pci_mmcfg;
 	pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index f5bc298707e1..ef8483673aa3 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -342,11 +342,19 @@ struct acpi_table_ecdt {
 
 /* PCI MMCONFIG */
 
+/* Defined in PCI Firmware Specification 3.0 */
+struct acpi_table_mcfg_config {
+	u32				base_address;
+	u32				base_reserved;
+	u16				pci_segment_group_number;
+	u8				start_bus_number;
+	u8				end_bus_number;
+	u8				reserved[4];
+} __attribute__ ((packed));
 struct acpi_table_mcfg {
 	struct acpi_table_header	header;
 	u8				reserved[8];
-	u32				base_address;
-	u32				base_reserved;
+	struct acpi_table_mcfg_config	config[0];
 } __attribute__ ((packed));
 
 /* Table Handlers */
@@ -391,6 +399,7 @@ int acpi_table_parse (enum acpi_table_id id, acpi_table_handler handler);
 int acpi_get_table_header_early (enum acpi_table_id id, struct acpi_table_header **header);
 int acpi_table_parse_madt (enum acpi_madt_entry_id id, acpi_madt_entry_handler handler, unsigned int max_entries);
 int acpi_table_parse_srat (enum acpi_srat_entry_id id, acpi_madt_entry_handler handler, unsigned int max_entries);
+int acpi_parse_mcfg (unsigned long phys_addr, unsigned long size);
 void acpi_table_print (struct acpi_table_header *header, unsigned long phys_addr);
 void acpi_table_print_madt_entry (acpi_table_entry_header *madt);
 void acpi_table_print_srat_entry (acpi_table_entry_header *srat);
@@ -412,7 +421,8 @@ int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base);
 
 extern int acpi_mp_config;
 
-extern u32 pci_mmcfg_base_addr;
+extern struct acpi_table_mcfg_config *pci_mmcfg_config;
+extern int pci_mmcfg_config_num;
 
 extern int sbf_port ;
 
-- 
cgit v1.2.3-59-g8ed1b


From b3563c4fbff906991a1b4ef4609f99cca2a0de6a Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 28 Jun 2005 12:54:43 -0700
Subject: [NETLINK]: Clear padding in netlink messages

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h   | 1 +
 include/linux/rtnetlink.h | 5 ++++-
 net/core/rtnetlink.c      | 1 +
 3 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 3029cad63a01..27e4d164a108 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -168,6 +168,7 @@ __nlmsg_put(struct sk_buff *skb, u32 pid, u32 seq, int type, int len, int flags)
 	nlh->nlmsg_flags = flags;
 	nlh->nlmsg_pid = pid;
 	nlh->nlmsg_seq = seq;
+	memset(NLMSG_DATA(nlh) + len, 0, NLMSG_ALIGN(size) - size);
 	return nlh;
 }
 
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index d021888b58f1..dc26e82ba0fd 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -898,7 +898,9 @@ extern void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const voi
 	memcpy(skb_put(skb, attrlen), data, attrlen); })
 
 #define RTA_PUT_NOHDR(skb, attrlen, data) \
-	RTA_APPEND(skb, RTA_ALIGN(attrlen), data)
+({	RTA_APPEND(skb, RTA_ALIGN(attrlen), data); \
+	memset(skb->tail - (RTA_ALIGN(attrlen) - attrlen), 0, \
+	       RTA_ALIGN(attrlen) - attrlen); })
 
 #define RTA_PUT_U8(skb, attrtype, value) \
 ({	u8 _tmp = (value); \
@@ -978,6 +980,7 @@ __rta_reserve(struct sk_buff *skb, int attrtype, int attrlen)
 	rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size));
 	rta->rta_type = attrtype;
 	rta->rta_len = size;
+	memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
 	return rta;
 }
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e013d836a7ab..879237c378f8 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -126,6 +126,7 @@ void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data
 	rta->rta_type = attrtype;
 	rta->rta_len = size;
 	memcpy(RTA_DATA(rta), data, attrlen);
+	memset(RTA_DATA(rta) + attrlen, 0, RTA_ALIGN(size) - size);
 }
 
 size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size)
-- 
cgit v1.2.3-59-g8ed1b


From 8a47077a0b5aa2649751c46e7a27884e6686ccbf Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 28 Jun 2005 12:56:45 -0700
Subject: [NETLINK]: Missing padding fields in dumped structures

Plug holes with padding fields and initialize them to zero.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pkt_cls.h   | 1 +
 include/linux/pkt_sched.h | 9 ++++++---
 include/linux/rtnetlink.h | 5 +++++
 net/ipv6/addrconf.c       | 3 +++
 net/sched/cls_rsvp.h      | 1 +
 net/sched/sch_cbq.c       | 1 +
 6 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index 25d2d67c1faf..bd2c5a2bbbf5 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -276,6 +276,7 @@ struct tc_rsvp_pinfo
 	__u8	protocol;
 	__u8	tunnelid;
 	__u8	tunnelhdr;
+	__u8	pad;
 };
 
 /* ROUTE filter */
diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 1d9da36eb9db..60ffcb9c5791 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -221,9 +221,11 @@ struct tc_gred_qopt
 /* gred setup */
 struct tc_gred_sopt
 {
-       __u32           DPs;
-       __u32           def_DP;
-       __u8            grio;
+       __u32		DPs;
+       __u32		def_DP;
+       __u8		grio;
+       __u8		pad1;
+       __u16		pad2;
 };
 
 /* HTB section */
@@ -351,6 +353,7 @@ struct tc_cbq_ovl
 #define	TC_CBQ_OVL_DROP		3
 #define	TC_CBQ_OVL_RCLASSIC	4
 	unsigned char	priority2;
+	__u16		pad;
 	__u32		penalty;
 };
 
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index dc26e82ba0fd..657c05ab8f9e 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -363,6 +363,8 @@ enum
 struct rta_session
 {
 	__u8	proto;
+	__u8	pad1;
+	__u16	pad2;
 
 	union {
 		struct {
@@ -635,10 +637,13 @@ struct ifinfomsg
 struct prefixmsg
 {
 	unsigned char	prefix_family;
+	unsigned char	prefix_pad1;
+	unsigned short	prefix_pad2;
 	int		prefix_ifindex;
 	unsigned char	prefix_type;
 	unsigned char	prefix_len;
 	unsigned char	prefix_flags;
+	unsigned char	prefix_pad3;
 };
 
 enum 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8140bed78a26..1b2902d8eb98 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3031,9 +3031,12 @@ static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
 	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*pmsg), flags);
 	pmsg = NLMSG_DATA(nlh);
 	pmsg->prefix_family = AF_INET6;
+	pmsg->prefix_pad1 = 0;
+	pmsg->prefix_pad2 = 0;
 	pmsg->prefix_ifindex = idev->dev->ifindex;
 	pmsg->prefix_len = pinfo->prefix_len;
 	pmsg->prefix_type = pinfo->type;
+	pmsg->prefix_pad3 = 0;
 	
 	pmsg->prefix_flags = 0;
 	if (pinfo->onlink)
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 232fb9196810..006168d69376 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -618,6 +618,7 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
 	pinfo.protocol = s->protocol;
 	pinfo.tunnelid = s->tunnelid;
 	pinfo.tunnelhdr = f->tunnelhdr;
+	pinfo.pad = 0;
 	RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);
 	if (f->res.classid)
 		RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid);
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index baeb3111f75e..09453f997d8c 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1528,6 +1528,7 @@ static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
 
 	opt.strategy = cl->ovl_strategy;
 	opt.priority2 = cl->priority2+1;
+	opt.pad = 0;
 	opt.penalty = (cl->penalty*1000)/HZ;
 	RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
 	return skb->len;
-- 
cgit v1.2.3-59-g8ed1b


From 2f85a42964dd43fed3a339701db046bee5a8b903 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vladislav.yasevich@hp.com>
Date: Tue, 28 Jun 2005 13:24:23 -0700
Subject: [SCTP] Make init & delayed sack timeouts configurable by user.

Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
Signed-off-by: Sridhar Samudrala <sri@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/sysctl.h       |  1 +
 include/net/sctp/constants.h | 18 +++---------------
 include/net/sctp/structs.h   |  4 ++++
 net/sctp/endpointola.c       | 13 +++++--------
 net/sctp/protocol.c          |  5 ++++-
 net/sctp/sysctl.c            | 13 +++++++++++++
 net/sctp/transport.c         |  1 -
 7 files changed, 30 insertions(+), 25 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index ebfe1250f0a4..5b5f434ac9a0 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -641,6 +641,7 @@ enum {
 	NET_SCTP_ADDIP_ENABLE		 = 13,
 	NET_SCTP_PRSCTP_ENABLE		 = 14,
 	NET_SCTP_SNDBUF_POLICY		 = 15,
+	NET_SCTP_SACK_TIMEOUT		 = 16,
 };
 
 /* /proc/sys/net/bridge */
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 4868c7f7749d..5999e5684bbf 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -263,23 +263,11 @@ enum { SCTP_MIN_PMTU = 576 };
 enum { SCTP_MAX_DUP_TSNS = 16 };
 enum { SCTP_MAX_GABS = 16 };
 
-/* Here we define the default timers.  */
+/* Heartbeat interval - 30 secs */
+#define SCTP_DEFAULT_TIMEOUT_HEARTBEAT	(30 * HZ)
 
-/* cookie timer def = ? seconds */
-#define SCTP_DEFAULT_TIMEOUT_T1_COOKIE	(3 * HZ)
-
-/* init timer def = 3 seconds  */
-#define SCTP_DEFAULT_TIMEOUT_T1_INIT	(3 * HZ)
-
-/* shutdown timer def = 300 ms */
-#define SCTP_DEFAULT_TIMEOUT_T2_SHUTDOWN ((300 * HZ) / 1000)
-
-/* 0 seconds + RTO */
-#define SCTP_DEFAULT_TIMEOUT_HEARTBEAT	(10 * HZ)
-
-/* recv timer def = 200ms (in usec) */
+/* Delayed sack timer - 200ms */
 #define SCTP_DEFAULT_TIMEOUT_SACK	((200 * HZ) / 1000)
-#define SCTP_DEFAULT_TIMEOUT_SACK_MAX	((500 * HZ) / 1000) /* 500 ms */
 
 /* RTO.Initial              - 3  seconds
  * RTO.Min                  - 1  second
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index dfad4d3c581c..47727c7cc628 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -161,6 +161,9 @@ extern struct sctp_globals {
 	 */
 	int sndbuf_policy;
 
+	/* Delayed SACK timeout  200ms default*/
+	int sack_timeout;
+
 	/* HB.interval		    - 30 seconds  */
 	int hb_interval;
 
@@ -217,6 +220,7 @@ extern struct sctp_globals {
 #define sctp_sndbuf_policy	 	(sctp_globals.sndbuf_policy)
 #define sctp_max_retrans_path		(sctp_globals.max_retrans_path)
 #define sctp_max_retrans_init		(sctp_globals.max_retrans_init)
+#define sctp_sack_timeout		(sctp_globals.sack_timeout)
 #define sctp_hb_interval		(sctp_globals.hb_interval)
 #define sctp_max_instreams		(sctp_globals.max_instreams)
 #define sctp_max_outstreams		(sctp_globals.max_outstreams)
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 2ec0320fac3b..c44bf4165c6e 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -102,9 +102,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
 	/* Set up the base timeout information.  */
 	ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0;
 	ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
-		SCTP_DEFAULT_TIMEOUT_T1_COOKIE;
+		msecs_to_jiffies(sp->rtoinfo.srto_initial);
 	ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] =
-		SCTP_DEFAULT_TIMEOUT_T1_INIT;
+		msecs_to_jiffies(sp->rtoinfo.srto_initial);
 	ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] =
 		msecs_to_jiffies(sp->rtoinfo.srto_initial);
 	ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0;
@@ -117,12 +117,9 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
         ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]
 		= 5 * msecs_to_jiffies(sp->rtoinfo.srto_max);
 
-	ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] =
-		SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
-	ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] =
-		SCTP_DEFAULT_TIMEOUT_SACK;
-	ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] =
-		sp->autoclose * HZ;
+	ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0;
+	ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = sctp_sack_timeout;
+	ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ;
 
 	/* Use SCTP specific send buffer space queues.  */
 	ep->sndbuf_policy = sctp_sndbuf_policy;
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 5135e1a25d25..e7f37faba7c0 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1050,7 +1050,10 @@ SCTP_STATIC __init int sctp_init(void)
 	sctp_sndbuf_policy		= 0;
 
 	/* HB.interval              - 30 seconds */
-	sctp_hb_interval		= 30 * HZ;
+	sctp_hb_interval		= SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
+
+	/* delayed SACK timeout */
+	sctp_sack_timeout		= SCTP_DEFAULT_TIMEOUT_SACK;
 
 	/* Implementation specific variables. */
 
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 7fc31849312b..dc4893474f18 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -47,6 +47,8 @@
 static ctl_handler sctp_sysctl_jiffies_ms;
 static long rto_timer_min = 1;
 static long rto_timer_max = 86400000; /* One day */
+static long sack_timer_min = 1;
+static long sack_timer_max = 500;
 
 static ctl_table sctp_table[] = {
 	{
@@ -187,6 +189,17 @@ static ctl_table sctp_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+	{
+		.ctl_name	= NET_SCTP_SACK_TIMEOUT,
+		.procname	= "sack_timeout",
+		.data		= &sctp_sack_timeout,
+		.maxlen		= sizeof(long),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_ms_jiffies_minmax,
+		.strategy	= &sctp_sysctl_jiffies_ms,
+		.extra1         = &sack_timer_min,
+		.extra2         = &sack_timer_max,
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 0ec0fde6e6c5..a63b69179607 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -103,7 +103,6 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 
 	/* Set up the heartbeat timer. */
 	init_timer(&peer->hb_timer);
-	peer->hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT;
 	peer->hb_timer.function = sctp_generate_heartbeat_event;
 	peer->hb_timer.data = (unsigned long)peer;
 
-- 
cgit v1.2.3-59-g8ed1b


From 7fe40f73d7591b38f129fe6a9c0fa46e0b192d09 Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Tue, 28 Jun 2005 15:46:24 -0700
Subject: [IPV6]: remove more unused IPV6_AUTHHDR things.

Remove two more unused IPV6_AUTHHDR option things,
which I failed to remove last time,
plus, mark IPV6_AUTHHDR obsolete.

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/in6.h      | 2 +-
 include/net/ipv6.h       | 1 -
 net/ipv6/ip6_flowlabel.c | 1 -
 3 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/in6.h b/include/linux/in6.h
index f8256c582845..dcf5720ffcbb 100644
--- a/include/linux/in6.h
+++ b/include/linux/in6.h
@@ -156,7 +156,7 @@ struct in6_flowlabel_req
 #define IPV6_CHECKSUM		7
 #define IPV6_HOPLIMIT		8
 #define IPV6_NEXTHOP		9
-#define IPV6_AUTHHDR		10
+#define IPV6_AUTHHDR		10	/* obsolete */
 #define IPV6_FLOWINFO		11
 
 #define IPV6_UNICAST_HOPS	16
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 771b47e30f86..69324465e8b3 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -183,7 +183,6 @@ struct ipv6_txoptions
 	struct ipv6_opt_hdr	*hopopt;
 	struct ipv6_opt_hdr	*dst0opt;
 	struct ipv6_rt_hdr	*srcrt;	/* Routing Header */
-	struct ipv6_opt_hdr	*auth;
 	struct ipv6_opt_hdr	*dst1opt;
 
 	/* Option buffer, as read by IPV6_PKTOPTIONS, starts here. */
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 0e5f7499debb..b6c73da5ff35 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -244,7 +244,6 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space,
 		opt_space->opt_nflen = 0;
 	}
 	opt_space->dst1opt = fopt->dst1opt;
-	opt_space->auth = fopt->auth;
 	opt_space->opt_flen = fopt->opt_flen;
 	return opt_space;
 }
-- 
cgit v1.2.3-59-g8ed1b


From bcd61272db5e643b6d9c01c9d5085b914d9f19df Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arndb@de.ibm.com>
Date: Tue, 28 Jun 2005 15:58:50 -0700
Subject: [NET]: Add missing include to linux/netdevice.h

linux/etherdevice.h can't be included standalone at the moment, which
is required in order to sort the header files in the recommended
alphabetic order. This patch fixes that and is needed to build spider_net.

Signed-off-by: Arnd Bergmann <arndb@de.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/etherdevice.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 8a2df4dfbc59..cf3847edc50f 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -25,6 +25,7 @@
 #define _LINUX_ETHERDEVICE_H
 
 #include <linux/if_ether.h>
+#include <linux/netdevice.h>
 #include <linux/random.h>
 
 #ifdef __KERNEL__
-- 
cgit v1.2.3-59-g8ed1b


From 05133fc498e788e1c1ca4e906f9e05d9779fd63b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 28 Jun 2005 20:44:54 -0700
Subject: [PATCH] swabb.h warning fixes

In file included from drivers/media/dvb/ttpci/av7110_hw.c:38:
include/linux/byteorder/swabb.h:96: warning: type qualifiers ignored on function return type
include/linux/byteorder/swabb.h:110: warning: type qualifiers ignored on function return type
In file included from drivers/media/dvb/ttpci/av7110_v4l.c:36:
include/linux/byteorder/swabb.h:96: warning: type qualifiers ignored on function return type
include/linux/byteorder/swabb.h:110: warning: type qualifiers ignored on function return type
In file included from drivers/media/dvb/ttpci/av7110_av.c:37:
include/linux/byteorder/swabb.h:96: warning: type qualifiers ignored on function return type
include/linux/byteorder/swabb.h:110: warning: type qualifiers ignored on function return type
drivers/isdn/icn/icn.c:719:4: warning: #warning TODO test headroom or use skb->nb to flag ACK
In file included from drivers/media/dvb/ttpci/av7110_ca.c:39:
include/linux/byteorder/swabb.h:96: warning: type qualifiers ignored on function return type
include/linux/byteorder/swabb.h:110: warning: type qualifiers ignored on function return type
In file included from drivers/media/dvb/ttpci/av7110.c:41:
include/linux/byteorder/swabb.h:96: warning: type qualifiers ignored on function return type
include/linux/byteorder/swabb.h:110: warning: type qualifiers ignored on function return type

Does declaring a function to return a const value actually mean something to
gcc?

Dunno.  Kill it and replace some `__inline__'s with `inline' too.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/byteorder/swabb.h | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/byteorder/swabb.h b/include/linux/byteorder/swabb.h
index d28d9a804d3b..d5f2a3205109 100644
--- a/include/linux/byteorder/swabb.h
+++ b/include/linux/byteorder/swabb.h
@@ -92,29 +92,32 @@
 #endif /* OPTIMIZE */
 
 
-static __inline__ __const__ __u32 __fswahw32(__u32 x)
+static inline __u32 __fswahw32(__u32 x)
 {
 	return __arch__swahw32(x);
 }
-static __inline__ __u32 __swahw32p(__u32 *x)
+
+static inline __u32 __swahw32p(__u32 *x)
 {
 	return __arch__swahw32p(x);
 }
-static __inline__ void __swahw32s(__u32 *addr)
+
+static inline void __swahw32s(__u32 *addr)
 {
 	__arch__swahw32s(addr);
 }
 
-
-static __inline__ __const__ __u32 __fswahb32(__u32 x)
+static inline __u32 __fswahb32(__u32 x)
 {
 	return __arch__swahb32(x);
 }
-static __inline__ __u32 __swahb32p(__u32 *x)
+
+static inline __u32 __swahb32p(__u32 *x)
 {
 	return __arch__swahb32p(x);
 }
-static __inline__ void __swahb32s(__u32 *addr)
+
+static inline void __swahb32s(__u32 *addr)
 {
 	__arch__swahb32s(addr);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 687a21cee17000177b1935896b9b475acf136678 Mon Sep 17 00:00:00 2001
From: Pekka J Enberg <penberg@cs.Helsinki.FI>
Date: Tue, 28 Jun 2005 20:44:55 -0700
Subject: [PATCH] rename wakeup_bdflush to wakeup_pdflush

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c               | 4 ++--
 include/linux/writeback.h | 2 +-
 mm/page-writeback.c       | 2 +-
 mm/vmscan.c               | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 13e5938a64f6..561e63a14966 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -278,7 +278,7 @@ EXPORT_SYMBOL(thaw_bdev);
  */
 static void do_sync(unsigned long wait)
 {
-	wakeup_bdflush(0);
+	wakeup_pdflush(0);
 	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
 	DQUOT_SYNC(NULL);
 	sync_supers();		/* Write the superblocks */
@@ -497,7 +497,7 @@ static void free_more_memory(void)
 	struct zone **zones;
 	pg_data_t *pgdat;
 
-	wakeup_bdflush(1024);
+	wakeup_pdflush(1024);
 	yield();
 
 	for_each_pgdat(pgdat) {
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d5c3fe1bf33d..542dbaee6512 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -85,7 +85,7 @@ static inline void wait_on_inode(struct inode *inode)
 /*
  * mm/page-writeback.c
  */
-int wakeup_bdflush(long nr_pages);
+int wakeup_pdflush(long nr_pages);
 void laptop_io_completion(void);
 void laptop_sync_completion(void);
 void throttle_vm_writeout(void);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 613b99a55917..a6329fa8f862 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -354,7 +354,7 @@ static void background_writeout(unsigned long _min_pages)
  * the whole world.  Returns 0 if a pdflush thread was dispatched.  Returns
  * -1 if all pdflush threads were busy.
  */
-int wakeup_bdflush(long nr_pages)
+int wakeup_pdflush(long nr_pages)
 {
 	if (nr_pages == 0) {
 		struct writeback_state wbs;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1fa312a8db77..cfffe5098d53 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -972,7 +972,7 @@ int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
 		 * writeout.  So in laptop mode, write out the whole world.
 		 */
 		if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) {
-			wakeup_bdflush(laptop_mode ? 0 : total_scanned);
+			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
 			sc.may_writepage = 1;
 		}
 
-- 
cgit v1.2.3-59-g8ed1b


From 334a13ec3d01a1a4b4f2249735b793105cb4a519 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 28 Jun 2005 20:44:58 -0700
Subject: [PATCH] really remove xattr_acl.h

Looks like it sneaked back with the NFS ACL merge..

Signed-off-by: Christoph Hellwig <hch@lst.de>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfs/nfs3acl.c          | 14 ++++++-------
 fs/nfsd/vfs.c             | 13 ++++++------
 include/linux/xattr_acl.h | 50 -----------------------------------------------
 3 files changed, 13 insertions(+), 64 deletions(-)
 delete mode 100644 include/linux/xattr_acl.h

(limited to 'include/linux')

diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index ee3536fc84a3..1b7a3ef2f813 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -2,7 +2,7 @@
 #include <linux/nfs.h>
 #include <linux/nfs3.h>
 #include <linux/nfs_fs.h>
-#include <linux/xattr_acl.h>
+#include <linux/posix_acl_xattr.h>
 #include <linux/nfsacl.h>
 
 #define NFSDBG_FACILITY	NFSDBG_PROC
@@ -53,9 +53,9 @@ ssize_t nfs3_getxattr(struct dentry *dentry, const char *name,
 	struct posix_acl *acl;
 	int type, error = 0;
 
-	if (strcmp(name, XATTR_NAME_ACL_ACCESS) == 0)
+	if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0)
 		type = ACL_TYPE_ACCESS;
-	else if (strcmp(name, XATTR_NAME_ACL_DEFAULT) == 0)
+	else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0)
 		type = ACL_TYPE_DEFAULT;
 	else
 		return -EOPNOTSUPP;
@@ -82,9 +82,9 @@ int nfs3_setxattr(struct dentry *dentry, const char *name,
 	struct posix_acl *acl;
 	int type, error;
 
-	if (strcmp(name, XATTR_NAME_ACL_ACCESS) == 0)
+	if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0)
 		type = ACL_TYPE_ACCESS;
-	else if (strcmp(name, XATTR_NAME_ACL_DEFAULT) == 0)
+	else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0)
 		type = ACL_TYPE_DEFAULT;
 	else
 		return -EOPNOTSUPP;
@@ -103,9 +103,9 @@ int nfs3_removexattr(struct dentry *dentry, const char *name)
 	struct inode *inode = dentry->d_inode;
 	int type;
 
-	if (strcmp(name, XATTR_NAME_ACL_ACCESS) == 0)
+	if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0)
 		type = ACL_TYPE_ACCESS;
-	else if (strcmp(name, XATTR_NAME_ACL_DEFAULT) == 0)
+	else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0)
 		type = ACL_TYPE_DEFAULT;
 	else
 		return -EOPNOTSUPP;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index de340ffd33c3..be24ead89d94 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -46,10 +46,9 @@
 #include <linux/nfsd/nfsfh.h>
 #include <linux/quotaops.h>
 #include <linux/dnotify.h>
-#include <linux/xattr_acl.h>
 #include <linux/posix_acl.h>
-#ifdef CONFIG_NFSD_V4
 #include <linux/posix_acl_xattr.h>
+#ifdef CONFIG_NFSD_V4
 #include <linux/xattr.h>
 #include <linux/nfs4.h>
 #include <linux/nfs4_acl.h>
@@ -1872,10 +1871,10 @@ nfsd_get_posix_acl(struct svc_fh *fhp, int type)
 		return ERR_PTR(-EOPNOTSUPP);
 	switch(type) {
 		case ACL_TYPE_ACCESS:
-			name = XATTR_NAME_ACL_ACCESS;
+			name = POSIX_ACL_XATTR_ACCESS;
 			break;
 		case ACL_TYPE_DEFAULT:
-			name = XATTR_NAME_ACL_DEFAULT;
+			name = POSIX_ACL_XATTR_DEFAULT;
 			break;
 		default:
 			return ERR_PTR(-EOPNOTSUPP);
@@ -1919,17 +1918,17 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
 		return -EOPNOTSUPP;
 	switch(type) {
 		case ACL_TYPE_ACCESS:
-			name = XATTR_NAME_ACL_ACCESS;
+			name = POSIX_ACL_XATTR_ACCESS;
 			break;
 		case ACL_TYPE_DEFAULT:
-			name = XATTR_NAME_ACL_DEFAULT;
+			name = POSIX_ACL_XATTR_DEFAULT;
 			break;
 		default:
 			return -EOPNOTSUPP;
 	}
 
 	if (acl && acl->a_count) {
-		size = xattr_acl_size(acl->a_count);
+		size = posix_acl_xattr_size(acl->a_count);
 		value = kmalloc(size, GFP_KERNEL);
 		if (!value)
 			return -ENOMEM;
diff --git a/include/linux/xattr_acl.h b/include/linux/xattr_acl.h
deleted file mode 100644
index 7a1f9b93a45f..000000000000
--- a/include/linux/xattr_acl.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
-  File: linux/xattr_acl.h
-
-  (extended attribute representation of access control lists)
-
-  (C) 2000 Andreas Gruenbacher, <a.gruenbacher@computer.org>
-*/
-
-#ifndef _LINUX_XATTR_ACL_H
-#define _LINUX_XATTR_ACL_H
-
-#include <linux/posix_acl.h>
-
-#define XATTR_NAME_ACL_ACCESS	"system.posix_acl_access"
-#define XATTR_NAME_ACL_DEFAULT	"system.posix_acl_default"
-
-#define XATTR_ACL_VERSION	0x0002
-
-typedef struct {
-	__u16		e_tag;
-	__u16		e_perm;
-	__u32		e_id;
-} xattr_acl_entry;
-
-typedef struct {
-	__u32		a_version;
-	xattr_acl_entry	a_entries[0];
-} xattr_acl_header;
-
-static inline size_t xattr_acl_size(int count)
-{
-	return sizeof(xattr_acl_header) + count * sizeof(xattr_acl_entry);
-}
-
-static inline int xattr_acl_count(size_t size)
-{
-	if (size < sizeof(xattr_acl_header))
-		return -1;
-	size -= sizeof(xattr_acl_header);
-	if (size % sizeof(xattr_acl_entry))
-		return -1;
-	return size / sizeof(xattr_acl_entry);
-}
-
-struct posix_acl * posix_acl_from_xattr(const void *value, size_t size);
-int posix_acl_to_xattr(const struct posix_acl *acl, void *buffer, size_t size);
-
-
-
-#endif /* _LINUX_XATTR_ACL_H */
-- 
cgit v1.2.3-59-g8ed1b


From 3607d1dfc80dcfbd3a6f236c70aa0d8eb7292278 Mon Sep 17 00:00:00 2001
From: GOTO Masanori <gotom@debian.or.jp>
Date: Tue, 28 Jun 2005 20:45:04 -0700
Subject: [PATCH] headers: include linux/compiler.h for __user

This patch lets i2c-dev.h include linux/compiler.h so that __user is defined.

Signed-off-by: GOTO Masanori <gotom@debian.or.jp>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/i2c-dev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/i2c-dev.h b/include/linux/i2c-dev.h
index d228230ffe5d..541695679762 100644
--- a/include/linux/i2c-dev.h
+++ b/include/linux/i2c-dev.h
@@ -25,6 +25,7 @@
 #define _LINUX_I2C_DEV_H
 
 #include <linux/types.h>
+#include <linux/compiler.h>
 
 /* Some IOCTL commands are defined in <linux/i2c.h> */
 /* Note: 10-bit addresses are NOT supported! */
-- 
cgit v1.2.3-59-g8ed1b


From 4cceb4d13abaedbd52e54053367c793ed4aedb6b Mon Sep 17 00:00:00 2001
From: GOTO Masanori <gotom@debian.or.jp>
Date: Tue, 28 Jun 2005 20:45:05 -0700
Subject: [PATCH] headers: include linux/types.h for usb_ch9.h

This patch for usb_ch9.h includes linux/types.h instead of asm/types.h so that
__le16 and so on are explicitly defined.  It also cleans up non-standard //
comments.

Signed-off-by: GOTO Masanori <gotom@debian.or.jp>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/usb_ch9.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/usb_ch9.h b/include/linux/usb_ch9.h
index 39e7ff4ffd28..ee21e6bf3867 100644
--- a/include/linux/usb_ch9.h
+++ b/include/linux/usb_ch9.h
@@ -19,7 +19,7 @@
 #ifndef __LINUX_USB_CH9_H
 #define __LINUX_USB_CH9_H
 
-#include <asm/types.h>		/* __u8 etc */
+#include <linux/types.h>	/* __u8 etc */
 
 /*-------------------------------------------------------------------------*/
 
@@ -294,8 +294,8 @@ struct usb_endpoint_descriptor {
 	__le16 wMaxPacketSize;
 	__u8  bInterval;
 
-	// NOTE:  these two are _only_ in audio endpoints.
-	// use USB_DT_ENDPOINT*_SIZE in bLength, not sizeof.
+	/* NOTE:  these two are _only_ in audio endpoints. */
+	/* use USB_DT_ENDPOINT*_SIZE in bLength, not sizeof. */
 	__u8  bRefresh;
 	__u8  bSynchAddress;
 } __attribute__ ((packed));
-- 
cgit v1.2.3-59-g8ed1b


From fb3cc4320e1fd87143683b540e459a2e20fdc9bb Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Tue, 28 Jun 2005 20:45:15 -0700
Subject: [PATCH] blk: light iocontext ops

get_io_context needlessly turned off interrupts and checked for racing io
context creations.  Both of which aren't needed, because the io context can
only be created while in process context of the current process.

Also, split the function in 2.  A light version, current_io_context does not
elevate the reference count specifically, but can be used when in process
context, because the process holds a reference itself.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/ll_rw_blk.c | 56 +++++++++++++++++++++--------------------------
 include/linux/blkdev.h    |  1 +
 2 files changed, 26 insertions(+), 31 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 5caebe2cf0a1..1197462bb6ba 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -1876,7 +1876,7 @@ static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
 {
 	struct request *rq = NULL;
 	struct request_list *rl = &q->rq;
-	struct io_context *ioc = get_io_context(GFP_ATOMIC);
+	struct io_context *ioc = current_io_context(GFP_ATOMIC);
 
 	if (unlikely(test_bit(QUEUE_FLAG_DRAIN, &q->queue_flags)))
 		goto out;
@@ -1959,7 +1959,6 @@ rq_starved:
 	rq_init(q, rq);
 	rq->rl = rl;
 out:
-	put_io_context(ioc);
 	return rq;
 }
 
@@ -1997,9 +1996,8 @@ static struct request *get_request_wait(request_queue_t *q, int rw,
 			 * up to a big batch of them for a small period time.
 			 * See ioc_batching, ioc_set_batching
 			 */
-			ioc = get_io_context(GFP_NOIO);
+			ioc = current_io_context(GFP_NOIO);
 			ioc_set_batching(q, ioc);
-			put_io_context(ioc);
 
 			spin_lock_irq(q->queue_lock);
 		}
@@ -3282,24 +3280,20 @@ void exit_io_context(void)
 
 /*
  * If the current task has no IO context then create one and initialise it.
- * If it does have a context, take a ref on it.
+ * Otherwise, return its existing IO context.
  *
- * This is always called in the context of the task which submitted the I/O.
- * But weird things happen, so we disable local interrupts to ensure exclusive
- * access to *current.
+ * This returned IO context doesn't have a specifically elevated refcount,
+ * but since the current task itself holds a reference, the context can be
+ * used in general code, so long as it stays within `current` context.
  */
-struct io_context *get_io_context(int gfp_flags)
+struct io_context *current_io_context(int gfp_flags)
 {
 	struct task_struct *tsk = current;
-	unsigned long flags;
 	struct io_context *ret;
 
-	local_irq_save(flags);
 	ret = tsk->io_context;
-	if (ret)
-		goto out;
-
-	local_irq_restore(flags);
+	if (likely(ret))
+		return ret;
 
 	ret = kmem_cache_alloc(iocontext_cachep, gfp_flags);
 	if (ret) {
@@ -3310,25 +3304,25 @@ struct io_context *get_io_context(int gfp_flags)
 		ret->nr_batch_requests = 0; /* because this is 0 */
 		ret->aic = NULL;
 		ret->cic = NULL;
+		tsk->io_context = ret;
+	}
 
-		local_irq_save(flags);
-
-		/*
-		 * very unlikely, someone raced with us in setting up the task
-		 * io context. free new context and just grab a reference.
-		 */
-		if (!tsk->io_context)
-			tsk->io_context = ret;
-		else {
-			kmem_cache_free(iocontext_cachep, ret);
-			ret = tsk->io_context;
-		}
+	return ret;
+}
+EXPORT_SYMBOL(current_io_context);
 
-out:
+/*
+ * If the current task has no IO context then create one and initialise it.
+ * If it does have a context, take a ref on it.
+ *
+ * This is always called in the context of the task which submitted the I/O.
+ */
+struct io_context *get_io_context(int gfp_flags)
+{
+	struct io_context *ret;
+	ret = current_io_context(gfp_flags);
+	if (likely(ret))
 		atomic_inc(&ret->refcount);
-		local_irq_restore(flags);
-	}
-
 	return ret;
 }
 EXPORT_SYMBOL(get_io_context);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 21a8674cd149..0881b5cdee3d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -96,6 +96,7 @@ struct io_context {
 
 void put_io_context(struct io_context *ioc);
 void exit_io_context(void);
+struct io_context *current_io_context(int gfp_flags);
 struct io_context *get_io_context(int gfp_flags);
 void copy_io_context(struct io_context **pdst, struct io_context **psrc);
 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2);
-- 
cgit v1.2.3-59-g8ed1b


From 200803dfe4ff772740d63db725ab2f1b185ccf92 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Tue, 28 Jun 2005 20:45:18 -0700
Subject: [PATCH] irqpoll

Anyone reporting a stuck IRQ should try these options.  Their effectiveness
varies, we've found in the Fedora case.  Quite a few systems with misdescribed
IRQ routing just work when you use irqpoll.  It also fixes up the VIA systems
although that's now fixed with the VIA quirk (which we could just make default
as it's what Redmond OS does but Linus didn't like it historically).

A small number of systems have jammed IRQ sources or misdescriptions that cause
an IRQ that we have no handler registered anywhere for.  In those cases it
doesn't help.

Signed-off-by: Alan Cox <number6@the-village.bc.nu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/kernel-parameters.txt |  15 +++++
 arch/ppc64/kernel/irq.c             |   2 +-
 include/linux/irq.h                 |   5 +-
 kernel/irq/handle.c                 |   2 +-
 kernel/irq/spurious.c               | 113 +++++++++++++++++++++++++++++++++++-
 5 files changed, 131 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 0f71251f12b9..67e99f144199 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -437,6 +437,10 @@ running once the system is up.
 			Format: {"of[f]" | "sk[ipmbr]"}
 			See comment in arch/i386/boot/edd.S
 
+	edd		[EDD]
+			Format: {"of[f]" | "sk[ipmbr]"}
+			See comment in arch/i386/boot/edd.S
+
 	eicon=		[HW,ISDN] 
 			Format: <id>,<membase>,<irq>
 
@@ -622,6 +626,17 @@ running once the system is up.
 	ips=		[HW,SCSI] Adaptec / IBM ServeRAID controller
 			See header of drivers/scsi/ips.c.
 
+	irqfixup	[HW]
+			When an interrupt is not handled search all handlers
+			for it. Intended to get systems with badly broken
+			firmware running.
+
+	irqpoll		[HW]
+			When an interrupt is not handled search all handlers
+			for it. Also check all handlers each timer
+			interrupt. Intended to get systems with badly broken
+			firmware running.
+
 	isapnp=		[ISAPNP]
 			Format: <RDP>, <reset>, <pci_scan>, <verbosity>
 
diff --git a/arch/ppc64/kernel/irq.c b/arch/ppc64/kernel/irq.c
index 3defc8c33adf..ffe300611f00 100644
--- a/arch/ppc64/kernel/irq.c
+++ b/arch/ppc64/kernel/irq.c
@@ -245,7 +245,7 @@ void ppc_irq_dispatch_handler(struct pt_regs *regs, int irq)
 
 		spin_lock(&desc->lock);
 		if (!noirqdebug)
-			note_interrupt(irq, desc, action_ret);
+			note_interrupt(irq, desc, action_ret, regs);
 		if (likely(!(desc->status & IRQ_PENDING)))
 			break;
 		desc->status &= ~IRQ_PENDING;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 12277799c007..069d3b84d311 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -85,9 +85,10 @@ extern int no_irq_affinity;
 extern int noirqdebug_setup(char *str);
 
 extern fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
-				       struct irqaction *action);
+					struct irqaction *action);
 extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs);
-extern void note_interrupt(unsigned int irq, irq_desc_t *desc, int action_ret);
+extern void note_interrupt(unsigned int irq, irq_desc_t *desc,
+					int action_ret, struct pt_regs *regs);
 extern int can_request_irq(unsigned int irq, unsigned long irqflags);
 
 extern void init_irq_proc(void);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 436c7d93c00a..c29f83c16497 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -172,7 +172,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
 
 		spin_lock(&desc->lock);
 		if (!noirqdebug)
-			note_interrupt(irq, desc, action_ret);
+			note_interrupt(irq, desc, action_ret, regs);
 		if (likely(!(desc->status & IRQ_PENDING)))
 			break;
 		desc->status &= ~IRQ_PENDING;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index ba039e827d58..7df9abd5ec86 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -11,6 +11,83 @@
 #include <linux/kallsyms.h>
 #include <linux/interrupt.h>
 
+static int irqfixup;
+
+/*
+ * Recovery handler for misrouted interrupts.
+ */
+
+static int misrouted_irq(int irq, struct pt_regs *regs)
+{
+	int i;
+	irq_desc_t *desc;
+	int ok = 0;
+	int work = 0;	/* Did we do work for a real IRQ */
+
+	for(i = 1; i < NR_IRQS; i++) {
+		struct irqaction *action;
+
+		if (i == irq)	/* Already tried */
+			continue;
+		desc = &irq_desc[i];
+		spin_lock(&desc->lock);
+		action = desc->action;
+		/* Already running on another processor */
+		if (desc->status & IRQ_INPROGRESS) {
+			/*
+			 * Already running: If it is shared get the other
+			 * CPU to go looking for our mystery interrupt too
+			 */
+			if (desc->action && (desc->action->flags & SA_SHIRQ))
+				desc->status |= IRQ_PENDING;
+			spin_unlock(&desc->lock);
+			continue;
+		}
+		/* Honour the normal IRQ locking */
+		desc->status |= IRQ_INPROGRESS;
+		spin_unlock(&desc->lock);
+		while (action) {
+			/* Only shared IRQ handlers are safe to call */
+			if (action->flags & SA_SHIRQ) {
+				if (action->handler(i, action->dev_id, regs) ==
+						IRQ_HANDLED)
+					ok = 1;
+			}
+			action = action->next;
+		}
+		local_irq_disable();
+		/* Now clean up the flags */
+		spin_lock(&desc->lock);
+		action = desc->action;
+
+		/*
+		 * While we were looking for a fixup someone queued a real
+		 * IRQ clashing with our walk
+		 */
+
+		while ((desc->status & IRQ_PENDING) && action) {
+			/*
+			 * Perform real IRQ processing for the IRQ we deferred
+			 */
+			work = 1;
+			spin_unlock(&desc->lock);
+			handle_IRQ_event(i, regs, action);
+			spin_lock(&desc->lock);
+			desc->status &= ~IRQ_PENDING;
+		}
+		desc->status &= ~IRQ_INPROGRESS;
+		/*
+		 * If we did actual work for the real IRQ line we must let the
+		 * IRQ controller clean up too
+		 */
+		if(work)
+			desc->handler->end(i);
+		spin_unlock(&desc->lock);
+	}
+	/* So the caller can adjust the irq error counts */
+	return ok;
+}
+
 /*
  * If 99,900 of the previous 100,000 interrupts have not been handled
  * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -31,7 +108,8 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
 		printk(KERN_ERR "irq event %d: bogus return value %x\n",
 				irq, action_ret);
 	} else {
-		printk(KERN_ERR "irq %d: nobody cared!\n", irq);
+		printk(KERN_ERR "irq %d: nobody cared (try booting with "
+				"the \"irqpoll\" option)\n", irq);
 	}
 	dump_stack();
 	printk(KERN_ERR "handlers:\n");
@@ -55,7 +133,8 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio
 	}
 }
 
-void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
+			struct pt_regs *regs)
 {
 	if (action_ret != IRQ_HANDLED) {
 		desc->irqs_unhandled++;
@@ -63,6 +142,15 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
 			report_bad_irq(irq, desc, action_ret);
 	}
 
+	if (unlikely(irqfixup)) {
+		/* Don't punish working computers */
+		if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) {
+			int ok = misrouted_irq(irq, regs);
+			if (action_ret == IRQ_NONE)
+				desc->irqs_unhandled -= ok;
+		}
+	}
+
 	desc->irq_count++;
 	if (desc->irq_count < 100000)
 		return;
@@ -94,3 +182,24 @@ int __init noirqdebug_setup(char *str)
 
 __setup("noirqdebug", noirqdebug_setup);
 
+static int __init irqfixup_setup(char *str)
+{
+	irqfixup = 1;
+	printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
+	printk(KERN_WARNING "This may impact system performance.\n");
+	return 1;
+}
+
+__setup("irqfixup", irqfixup_setup);
+
+static int __init irqpoll_setup(char *str)
+{
+	irqfixup = 2;
+	printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
+				"enabled\n");
+	printk(KERN_WARNING "This may significantly impact system "
+				"performance\n");
+	return 1;
+}
+
+__setup("irqpoll", irqpoll_setup);
-- 
cgit v1.2.3-59-g8ed1b


From 115d6f3fd25991f2a7de1ff4d758086209b1ed12 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@brturbo.com.br>
Date: Tue, 28 Jun 2005 20:45:27 -0700
Subject: [PATCH] V4L: API new webcam formats included

Add Philips Webcam format.

Signed-off-by: Mauro Carvalho Chehab <mchehab@brturbo.com.br>
Signed-off-by: Luc Saillard <luc@saillard.org>.
Signed-off-by: Nickolay V Shmyrev <nshmyrev@yandex.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/videodev2.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 4e0edce53760..acbfc525576d 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -221,6 +221,8 @@ struct v4l2_pix_format
 /*  Vendor-specific formats   */
 #define V4L2_PIX_FMT_WNVA     v4l2_fourcc('W','N','V','A') /* Winnov hw compress */
 #define V4L2_PIX_FMT_SN9C10X  v4l2_fourcc('S','9','1','0') /* SN9C10x compression */
+#define V4L2_PIX_FMT_PWC1     v4l2_fourcc('P','W','C','1') /* pwc older webcam */
+#define V4L2_PIX_FMT_PWC2     v4l2_fourcc('P','W','C','2') /* pwc newer webcam */
 
 /*
  *	F O R M A T   E N U M E R A T I O N
-- 
cgit v1.2.3-59-g8ed1b


From 0dfc62465ef92c7ddcb1ba223bf062453566fd0f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 31 May 2005 20:39:20 +0100
Subject: [MTD] NAND: Reorganize chip locking

The code was wrong in several aspects. The locking order was
inconsistent, the device acquire code did not reset a variable
after a wakeup and the wakeup handling was not working for
applications where multiple chips are sharing a single
hardware controller.
When a hardware controller is available the locking is now
reduced to the hardware controller lock and the waitqueue is
moved to the hardware controller structure in order to avoid
a wake_up_all().

The problem was pointed out by Ben Dooks, who also found the
missing variable reset as main cause for his deadlock problem.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/nand/nand_base.c | 57 ++++++++++++++++++++++----------------------
 include/linux/mtd/nand.h     |  5 +++-
 2 files changed, 33 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index f1db0bf9306b..bbe0283433d2 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -59,7 +59,7 @@
  *	The AG-AND chips have nice features for speed improvement,
  *	which are not supported yet. Read / program 4 pages in one go.
  *
- * $Id: nand_base.c,v 1.143 2005/05/19 16:10:22 gleixner Exp $
+ * $Id: nand_base.c,v 1.145 2005/05/31 20:32:53 gleixner Exp $
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -167,17 +167,21 @@ static void nand_release_device (struct mtd_info *mtd)
 
 	/* De-select the NAND device */
 	this->select_chip(mtd, -1);
-	/* Do we have a hardware controller ? */
+
 	if (this->controller) {
+		/* Release the controller and the chip */
 		spin_lock(&this->controller->lock);
 		this->controller->active = NULL;
+		this->state = FL_READY;
+		wake_up(&this->controller->wq);
 		spin_unlock(&this->controller->lock);
+	} else {
+		/* Release the chip */
+		spin_lock(&this->chip_lock);
+		this->state = FL_READY;
+		wake_up(&this->wq);
+		spin_unlock(&this->chip_lock);
 	}
-	/* Release the chip */
-	spin_lock (&this->chip_lock);
-	this->state = FL_READY;
-	wake_up (&this->wq);
-	spin_unlock (&this->chip_lock);
 }
 
 /**
@@ -753,37 +757,34 @@ static void nand_command_lp (struct mtd_info *mtd, unsigned command, int column,
  */
 static void nand_get_device (struct nand_chip *this, struct mtd_info *mtd, int new_state)
 {
-	struct nand_chip *active = this;
-
+	struct nand_chip *active;
+	spinlock_t *lock;
+	wait_queue_head_t *wq;
 	DECLARE_WAITQUEUE (wait, current);
 
-	/* 
-	 * Grab the lock and see if the device is available 
-	*/
+	lock = (this->controller) ? &this->controller->lock : &this->chip_lock;
+	wq = (this->controller) ? &this->controller->wq : &this->wq;
 retry:
+	active = this;
+	spin_lock(lock);
+
 	/* Hardware controller shared among independend devices */
 	if (this->controller) {
-		spin_lock (&this->controller->lock);
 		if (this->controller->active)
 			active = this->controller->active;
 		else
 			this->controller->active = this;
-		spin_unlock (&this->controller->lock);
 	}
-	
-	if (active == this) {
-		spin_lock (&this->chip_lock);
-		if (this->state == FL_READY) {
-			this->state = new_state;
-			spin_unlock (&this->chip_lock);
-			return;
-		}
-	}	
-	set_current_state (TASK_UNINTERRUPTIBLE);
-	add_wait_queue (&active->wq, &wait);
-	spin_unlock (&active->chip_lock);
-	schedule ();
-	remove_wait_queue (&active->wq, &wait);
+	if (active == this && this->state == FL_READY) {
+		this->state = new_state;
+		spin_unlock(lock);
+		return;
+	}
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	add_wait_queue(wq, &wait);
+	spin_unlock(lock);
+	schedule();
+	remove_wait_queue(wq, &wait);
 	goto retry;
 }
 
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index bee78969cb21..9b5b76217584 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -5,7 +5,7 @@
  *                     Steven J. Hill <sjhill@realitydiluted.com>
  *		       Thomas Gleixner <tglx@linutronix.de>
  *
- * $Id: nand.h,v 1.71 2005/02/09 12:12:59 gleixner Exp $
+ * $Id: nand.h,v 1.73 2005/05/31 19:39:17 gleixner Exp $
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -253,10 +253,13 @@ struct nand_chip;
  * struct nand_hw_control - Control structure for hardware controller (e.g ECC generator) shared among independend devices
  * @lock:               protection lock  
  * @active:		the mtd device which holds the controller currently
+ * @wq:			wait queue to sleep on if a NAND operation is in progress
+ *                      used instead of the per chip wait queue when a hw controller is available
  */
 struct nand_hw_control {
 	spinlock_t	 lock;
 	struct nand_chip *active;
+	wait_queue_head_t wq;
 };
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From 02b15e343aeefb49f8cac949be599d78250a568f Mon Sep 17 00:00:00 2001
From: Todd Poynor <tpoynor@mvista.com>
Date: Tue, 7 Jun 2005 00:04:39 +0100
Subject: [MTD] XIP for AMD CFI flash.

Author: Vitaly Wool <vwool@ru.mvista.com>
Signed-off-by: Todd Poynor <tpoynor@mvista.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/chips/Kconfig           |   4 +-
 drivers/mtd/chips/cfi_cmdset_0002.c | 402 ++++++++++++++++++++++++++++--------
 drivers/mtd/chips/fwh_lock.h        |   6 +-
 drivers/mtd/maps/map_funcs.c        |  11 +-
 include/linux/mtd/cfi.h             |  12 +-
 5 files changed, 324 insertions(+), 111 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/Kconfig b/drivers/mtd/chips/Kconfig
index f4eda1e40d51..b5dc59389bb3 100644
--- a/drivers/mtd/chips/Kconfig
+++ b/drivers/mtd/chips/Kconfig
@@ -1,5 +1,5 @@
 # drivers/mtd/chips/Kconfig
-# $Id: Kconfig,v 1.14 2005/02/08 17:11:15 nico Exp $
+# $Id: Kconfig,v 1.15 2005/06/06 23:04:35 tpoynor Exp $
 
 menu "RAM/ROM/Flash chip drivers"
 	depends on MTD!=n
@@ -300,7 +300,7 @@ config MTD_JEDEC
 
 config MTD_XIP
 	bool "XIP aware MTD support"
-	depends on !SMP && MTD_CFI_INTELEXT && EXPERIMENTAL
+	depends on !SMP && (MTD_CFI_INTELEXT || MTD_CFI_AMDSTD) && EXPERIMENTAL
 	default y if XIP_KERNEL
 	help
 	  This allows MTD support to work with flash memory which is also
diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c
index 49cd81207137..e42eefbda0e1 100644
--- a/drivers/mtd/chips/cfi_cmdset_0002.c
+++ b/drivers/mtd/chips/cfi_cmdset_0002.c
@@ -4,16 +4,20 @@
  *
  * Copyright (C) 2000 Crossnet Co. <info@crossnet.co.jp>
  * Copyright (C) 2004 Arcom Control Systems Ltd <linux@arcom.com>
+ * Copyright (C) 2005 MontaVista Software Inc. <source@mvista.com>
  *
  * 2_by_8 routines added by Simon Munton
  *
  * 4_by_16 work by Carolyn J. Smith
  *
+ * XIP support hooks by Vitaly Wool (based on code for Intel flash 
+ * by Nicolas Pitre)
+ * 
  * Occasionally maintained by Thayne Harbaugh tharbaugh at lnxi dot com
  *
  * This code is GPL
  *
- * $Id: cfi_cmdset_0002.c,v 1.116 2005/05/24 13:29:42 gleixner Exp $
+ * $Id: cfi_cmdset_0002.c,v 1.117 2005/06/06 23:04:35 tpoynor Exp $
  *
  */
 
@@ -34,6 +38,7 @@
 #include <linux/mtd/map.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/cfi.h>
+#include <linux/mtd/xip.h>
 
 #define AMD_BOOTLOC_BUG
 #define FORCE_WORD_WRITE 0
@@ -393,7 +398,7 @@ static struct mtd_info *cfi_amdstd_setup(struct mtd_info *mtd)
  * correctly and is therefore not done	(particulary with interleaved chips
  * as each chip must be checked independantly of the others).
  */
-static int chip_ready(struct map_info *map, unsigned long addr)
+static int __xipram chip_ready(struct map_info *map, unsigned long addr)
 {
 	map_word d, t;
 
@@ -418,7 +423,7 @@ static int chip_ready(struct map_info *map, unsigned long addr)
  * as each chip must be checked independantly of the others).
  *
  */
-static int chip_good(struct map_info *map, unsigned long addr, map_word expected)
+static int __xipram chip_good(struct map_info *map, unsigned long addr, map_word expected)
 {
 	map_word oldd, curd;
 
@@ -448,12 +453,12 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr
 
 			if (time_after(jiffies, timeo)) {
 				printk(KERN_ERR "Waiting for chip to be ready timed out.\n");
-				cfi_spin_unlock(chip->mutex);
+				spin_unlock(chip->mutex);
 				return -EIO;
 			}
-			cfi_spin_unlock(chip->mutex);
+			spin_unlock(chip->mutex);
 			cfi_udelay(1);
-			cfi_spin_lock(chip->mutex);
+			spin_lock(chip->mutex);
 			/* Someone else might have been playing with it. */
 			goto retry;
 		}
@@ -501,15 +506,23 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr
 				return -EIO;
 			}
 			
-			cfi_spin_unlock(chip->mutex);
+			spin_unlock(chip->mutex);
 			cfi_udelay(1);
-			cfi_spin_lock(chip->mutex);
+			spin_lock(chip->mutex);
 			/* Nobody will touch it while it's in state FL_ERASE_SUSPENDING.
 			   So we can just loop here. */
 		}
 		chip->state = FL_READY;
 		return 0;
 
+	case FL_XIP_WHILE_ERASING:
+		if (mode != FL_READY && mode != FL_POINT &&
+		    (!cfip || !(cfip->EraseSuspend&2)))
+			goto sleep;
+		chip->oldstate = chip->state;
+		chip->state = FL_READY;
+		return 0;
+
 	case FL_POINT:
 		/* Only if there's no operation suspended... */
 		if (mode == FL_READY && chip->oldstate == FL_READY)
@@ -519,10 +532,10 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr
 	sleep:
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		add_wait_queue(&chip->wq, &wait);
-		cfi_spin_unlock(chip->mutex);
+		spin_unlock(chip->mutex);
 		schedule();
 		remove_wait_queue(&chip->wq, &wait);
-		cfi_spin_lock(chip->mutex);
+		spin_lock(chip->mutex);
 		goto resettime;
 	}
 }
@@ -540,6 +553,11 @@ static void put_chip(struct map_info *map, struct flchip *chip, unsigned long ad
 		chip->state = FL_ERASING;
 		break;
 
+	case FL_XIP_WHILE_ERASING:
+		chip->state = chip->oldstate;
+		chip->oldstate = FL_READY;
+		break;
+
 	case FL_READY:
 	case FL_STATUS:
 		/* We should really make set_vpp() count, rather than doing this */
@@ -551,6 +569,198 @@ static void put_chip(struct map_info *map, struct flchip *chip, unsigned long ad
 	wake_up(&chip->wq);
 }
 
+#ifdef CONFIG_MTD_XIP
+
+/*
+ * No interrupt whatsoever can be serviced while the flash isn't in array
+ * mode.  This is ensured by the xip_disable() and xip_enable() functions
+ * enclosing any code path where the flash is known not to be in array mode.
+ * And within a XIP disabled code path, only functions marked with __xipram
+ * may be called and nothing else (it's a good thing to inspect generated
+ * assembly to make sure inline functions were actually inlined and that gcc
+ * didn't emit calls to its own support functions). Configuring MTD CFI
+ * support to a single buswidth and a single interleave is also recommended.
+ */
+#include <asm/hardware.h>
+static void xip_disable(struct map_info *map, struct flchip *chip,
+			unsigned long adr)
+{
+	/* TODO: chips with no XIP use should ignore and return */
+	(void) map_read(map, adr); /* ensure mmu mapping is up to date */
+	local_irq_disable();
+}
+
+static void __xipram xip_enable(struct map_info *map, struct flchip *chip,
+				unsigned long adr)
+{
+	struct cfi_private *cfi = map->fldrv_priv;
+
+	if (chip->state != FL_POINT && chip->state != FL_READY) {
+		map_write(map, CMD(0xf0), adr);
+		chip->state = FL_READY;
+	}
+	(void) map_read(map, adr);
+	asm volatile (".rep 8; nop; .endr"); /* fill instruction prefetch */
+	local_irq_enable();
+}
+
+/*
+ * When a delay is required for the flash operation to complete, the
+ * xip_udelay() function is polling for both the given timeout and pending
+ * (but still masked) hardware interrupts.  Whenever there is an interrupt
+ * pending then the flash erase operation is suspended, array mode restored 
+ * and interrupts unmasked.  Task scheduling might also happen at that
+ * point.  The CPU eventually returns from the interrupt or the call to
+ * schedule() and the suspended flash operation is resumed for the remainder
+ * of the delay period.
+ *
+ * Warning: this function _will_ fool interrupt latency tracing tools.
+ */
+
+static void __xipram xip_udelay(struct map_info *map, struct flchip *chip,
+				unsigned long adr, int usec)
+{
+	struct cfi_private *cfi = map->fldrv_priv;
+	struct cfi_pri_amdstd *extp = cfi->cmdset_priv;
+	map_word status, OK = CMD(0x80);
+	unsigned long suspended, start = xip_currtime();
+	flstate_t oldstate;
+
+	do {
+		cpu_relax();
+		if (xip_irqpending() && extp &&
+		    ((chip->state == FL_ERASING && (extp->EraseSuspend & 2))) &&
+		    (cfi_interleave_is_1(cfi) || chip->oldstate == FL_READY)) {
+			/*
+			 * Let's suspend the erase operation when supported.  
+			 * Note that we currently don't try to suspend 
+			 * interleaved chips if there is already another 
+			 * operation suspended (imagine what happens
+			 * when one chip was already done with the current
+			 * operation while another chip suspended it, then
+			 * we resume the whole thing at once).  Yes, it
+			 * can happen!
+			 */
+			map_write(map, CMD(0xb0), adr);
+			usec -= xip_elapsed_since(start);
+			suspended = xip_currtime();
+			do {
+				if (xip_elapsed_since(suspended) > 100000) {
+					/*
+					 * The chip doesn't want to suspend
+					 * after waiting for 100 msecs.
+					 * This is a critical error but there
+					 * is not much we can do here.
+					 */
+					return;
+				}
+				status = map_read(map, adr);
+			} while (!map_word_andequal(map, status, OK, OK));
+
+			/* Suspend succeeded */
+			oldstate = chip->state;
+			if (!map_word_bitsset(map, status, CMD(0x40)))
+				break;
+			chip->state = FL_XIP_WHILE_ERASING;
+			chip->erase_suspended = 1;
+			map_write(map, CMD(0xf0), adr);
+			(void) map_read(map, adr);
+			asm volatile (".rep 8; nop; .endr");
+			local_irq_enable();
+			spin_unlock(chip->mutex);
+			asm volatile (".rep 8; nop; .endr");
+			cond_resched();
+
+			/*
+			 * We're back.  However someone else might have
+			 * decided to go write to the chip if we are in
+			 * a suspended erase state.  If so let's wait
+			 * until it's done.
+			 */
+			spin_lock(chip->mutex);
+			while (chip->state != FL_XIP_WHILE_ERASING) {
+				DECLARE_WAITQUEUE(wait, current);
+				set_current_state(TASK_UNINTERRUPTIBLE);
+				add_wait_queue(&chip->wq, &wait);
+				spin_unlock(chip->mutex);
+				schedule();
+				remove_wait_queue(&chip->wq, &wait);
+				spin_lock(chip->mutex);
+			}
+			/* Disallow XIP again */
+			local_irq_disable();
+
+			/* Resume the suspended erase operation */
+			map_write(map, CMD(0x30), adr);
+			chip->state = oldstate;
+			start = xip_currtime();
+		} else if (usec >= 1000000/HZ) {
+			/*
+			 * Try to save on CPU power when waiting delay
+			 * is at least a system timer tick period.
+			 * No need to be extremely accurate here.
+			 */
+			xip_cpu_idle();
+		}
+		status = map_read(map, adr);
+	} while (!map_word_andequal(map, status, OK, OK)
+		 && xip_elapsed_since(start) < usec);
+}
+
+#define UDELAY(map, chip, adr, usec)  xip_udelay(map, chip, adr, usec)
+
+/*
+ * The INVALIDATE_CACHED_RANGE() macro is normally used in parallel while
+ * the flash is actively programming or erasing since we have to poll for
+ * the operation to complete anyway.  We can't do that in a generic way with
+ * a XIP setup so do it before the actual flash operation in this case
+ * and stub it out from INVALIDATE_CACHE_UDELAY.
+ */
+#define XIP_INVAL_CACHED_RANGE(map, from, size)  \
+	INVALIDATE_CACHED_RANGE(map, from, size)
+
+#define INVALIDATE_CACHE_UDELAY(map, chip, adr, len, usec)  \
+	UDELAY(map, chip, adr, usec)
+
+/*
+ * Extra notes:
+ *
+ * Activating this XIP support changes the way the code works a bit.  For
+ * example the code to suspend the current process when concurrent access
+ * happens is never executed because xip_udelay() will always return with the
+ * same chip state as it was entered with.  This is why there is no care for
+ * the presence of add_wait_queue() or schedule() calls from within a couple
+ * xip_disable()'d  areas of code, like in do_erase_oneblock for example.
+ * The queueing and scheduling are always happening within xip_udelay().
+ *
+ * Similarly, get_chip() and put_chip() just happen to always be executed
+ * with chip->state set to FL_READY (or FL_XIP_WHILE_*) where flash state
+ * is in array mode, therefore never executing many cases therein and not
+ * causing any problem with XIP.
+ */
+
+#else
+
+#define xip_disable(map, chip, adr)
+#define xip_enable(map, chip, adr)
+#define XIP_INVAL_CACHED_RANGE(x...)
+
+#define UDELAY(map, chip, adr, usec)  \
+do {  \
+	spin_unlock(chip->mutex);  \
+	cfi_udelay(usec);  \
+	spin_lock(chip->mutex);  \
+} while (0)
+
+#define INVALIDATE_CACHE_UDELAY(map, chip, adr, len, usec)  \
+do {  \
+	spin_unlock(chip->mutex);  \
+	INVALIDATE_CACHED_RANGE(map, adr, len);  \
+	cfi_udelay(usec);  \
+	spin_lock(chip->mutex);  \
+} while (0)
+
+#endif
 
 static inline int do_read_onechip(struct map_info *map, struct flchip *chip, loff_t adr, size_t len, u_char *buf)
 {
@@ -563,10 +773,10 @@ static inline int do_read_onechip(struct map_info *map, struct flchip *chip, lof
 	/* Ensure cmd read/writes are aligned. */ 
 	cmd_addr = adr & ~(map_bankwidth(map)-1); 
 
-	cfi_spin_lock(chip->mutex);
+	spin_lock(chip->mutex);
 	ret = get_chip(map, chip, cmd_addr, FL_READY);
 	if (ret) {
-		cfi_spin_unlock(chip->mutex);
+		spin_unlock(chip->mutex);
 		return ret;
 	}
 
@@ -579,7 +789,7 @@ static inline int do_read_onechip(struct map_info *map, struct flchip *chip, lof
 
 	put_chip(map, chip, cmd_addr);
 
-	cfi_spin_unlock(chip->mutex);
+	spin_unlock(chip->mutex);
 	return 0;
 }
 
@@ -633,7 +843,7 @@ static inline int do_read_secsi_onechip(struct map_info *map, struct flchip *chi
 	struct cfi_private *cfi = map->fldrv_priv;
 
  retry:
-	cfi_spin_lock(chip->mutex);
+	spin_lock(chip->mutex);
 
 	if (chip->state != FL_READY){
 #if 0
@@ -642,7 +852,7 @@ static inline int do_read_secsi_onechip(struct map_info *map, struct flchip *chi
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		add_wait_queue(&chip->wq, &wait);
 		
-		cfi_spin_unlock(chip->mutex);
+		spin_unlock(chip->mutex);
 
 		schedule();
 		remove_wait_queue(&chip->wq, &wait);
@@ -671,7 +881,7 @@ static inline int do_read_secsi_onechip(struct map_info *map, struct flchip *chi
 	cfi_send_gen_cmd(0x00, cfi->addr_unlock1, chip->start, map, cfi, cfi->device_type, NULL);
 	
 	wake_up(&chip->wq);
-	cfi_spin_unlock(chip->mutex);
+	spin_unlock(chip->mutex);
 
 	return 0;
 }
@@ -720,7 +930,7 @@ static int cfi_amdstd_secsi_read (struct mtd_info *mtd, loff_t from, size_t len,
 }
 
 
-static int do_write_oneword(struct map_info *map, struct flchip *chip, unsigned long adr, map_word datum)
+static int __xipram do_write_oneword(struct map_info *map, struct flchip *chip, unsigned long adr, map_word datum)
 {
 	struct cfi_private *cfi = map->fldrv_priv;
 	unsigned long timeo = jiffies + HZ;
@@ -740,10 +950,10 @@ static int do_write_oneword(struct map_info *map, struct flchip *chip, unsigned
 
 	adr += chip->start;
 
-	cfi_spin_lock(chip->mutex);
+	spin_lock(chip->mutex);
 	ret = get_chip(map, chip, adr, FL_WRITING);
 	if (ret) {
-		cfi_spin_unlock(chip->mutex);
+		spin_unlock(chip->mutex);
 		return ret;
 	}
 
@@ -763,7 +973,9 @@ static int do_write_oneword(struct map_info *map, struct flchip *chip, unsigned
 		goto op_done;
 	}
 
+	XIP_INVAL_CACHED_RANGE(map, adr, map_bankwidth(map));
 	ENABLE_VPP(map);
+	xip_disable(map, chip, adr);
  retry:
 	cfi_send_gen_cmd(0xAA, cfi->addr_unlock1, chip->start, map, cfi, cfi->device_type, NULL);
 	cfi_send_gen_cmd(0x55, cfi->addr_unlock2, chip->start, map, cfi, cfi->device_type, NULL);
@@ -771,9 +983,9 @@ static int do_write_oneword(struct map_info *map, struct flchip *chip, unsigned
 	map_write(map, datum, adr);
 	chip->state = FL_WRITING;
 
-	cfi_spin_unlock(chip->mutex);
-	cfi_udelay(chip->word_write_time);
-	cfi_spin_lock(chip->mutex);
+	INVALIDATE_CACHE_UDELAY(map, chip,
+				adr, map_bankwidth(map),
+				chip->word_write_time);
 
 	/* See comment above for timeout value. */
 	timeo = jiffies + uWriteTimeout; 
@@ -784,11 +996,11 @@ static int do_write_oneword(struct map_info *map, struct flchip *chip, unsigned
 
 			set_current_state(TASK_UNINTERRUPTIBLE);
 			add_wait_queue(&chip->wq, &wait);
-			cfi_spin_unlock(chip->mutex);
+			spin_unlock(chip->mutex);
 			schedule();
 			remove_wait_queue(&chip->wq, &wait);
 			timeo = jiffies + (HZ / 2); /* FIXME */
-			cfi_spin_lock(chip->mutex);
+			spin_lock(chip->mutex);
 			continue;
 		}
 
@@ -796,14 +1008,14 @@ static int do_write_oneword(struct map_info *map, struct flchip *chip, unsigned
 			break;
 
 		if (time_after(jiffies, timeo)) {
+			xip_enable(map, chip, adr);
 			printk(KERN_WARNING "MTD %s(): software timeout\n", __func__);
+			xip_disable(map, chip, adr);
                         break;
 		}
 
 		/* Latency issues. Drop the lock, wait a while and retry */
-		cfi_spin_unlock(chip->mutex);
-		cfi_udelay(1);
-		cfi_spin_lock(chip->mutex);
+		UDELAY(map, chip, adr, 1);
 	}
 	/* Did we succeed? */
 	if (!chip_good(map, adr, datum)) {
@@ -816,10 +1028,11 @@ static int do_write_oneword(struct map_info *map, struct flchip *chip, unsigned
 
 		ret = -EIO;
 	}
+	xip_enable(map, chip, adr);
  op_done:
 	chip->state = FL_READY;
 	put_chip(map, chip, adr);
-	cfi_spin_unlock(chip->mutex);
+	spin_unlock(chip->mutex);
 
 	return ret;
 }
@@ -851,7 +1064,7 @@ static int cfi_amdstd_write_words(struct mtd_info *mtd, loff_t to, size_t len,
 		map_word tmp_buf;
 
  retry:
-		cfi_spin_lock(cfi->chips[chipnum].mutex);
+		spin_lock(cfi->chips[chipnum].mutex);
 
 		if (cfi->chips[chipnum].state != FL_READY) {
 #if 0
@@ -860,7 +1073,7 @@ static int cfi_amdstd_write_words(struct mtd_info *mtd, loff_t to, size_t len,
 			set_current_state(TASK_UNINTERRUPTIBLE);
 			add_wait_queue(&cfi->chips[chipnum].wq, &wait);
 
-			cfi_spin_unlock(cfi->chips[chipnum].mutex);
+			spin_unlock(cfi->chips[chipnum].mutex);
 
 			schedule();
 			remove_wait_queue(&cfi->chips[chipnum].wq, &wait);
@@ -874,7 +1087,7 @@ static int cfi_amdstd_write_words(struct mtd_info *mtd, loff_t to, size_t len,
 		/* Load 'tmp_buf' with old contents of flash */
 		tmp_buf = map_read(map, bus_ofs+chipstart);
 
-		cfi_spin_unlock(cfi->chips[chipnum].mutex);
+		spin_unlock(cfi->chips[chipnum].mutex);
 
 		/* Number of bytes to copy from buffer */
 		n = min_t(int, len, map_bankwidth(map)-i);
@@ -929,7 +1142,7 @@ static int cfi_amdstd_write_words(struct mtd_info *mtd, loff_t to, size_t len,
 		map_word tmp_buf;
 
  retry1:
-		cfi_spin_lock(cfi->chips[chipnum].mutex);
+		spin_lock(cfi->chips[chipnum].mutex);
 
 		if (cfi->chips[chipnum].state != FL_READY) {
 #if 0
@@ -938,7 +1151,7 @@ static int cfi_amdstd_write_words(struct mtd_info *mtd, loff_t to, size_t len,
 			set_current_state(TASK_UNINTERRUPTIBLE);
 			add_wait_queue(&cfi->chips[chipnum].wq, &wait);
 
-			cfi_spin_unlock(cfi->chips[chipnum].mutex);
+			spin_unlock(cfi->chips[chipnum].mutex);
 
 			schedule();
 			remove_wait_queue(&cfi->chips[chipnum].wq, &wait);
@@ -951,7 +1164,7 @@ static int cfi_amdstd_write_words(struct mtd_info *mtd, loff_t to, size_t len,
 
 		tmp_buf = map_read(map, ofs + chipstart);
 
-		cfi_spin_unlock(cfi->chips[chipnum].mutex);
+		spin_unlock(cfi->chips[chipnum].mutex);
 
 		tmp_buf = map_word_load_partial(map, tmp_buf, buf, 0, len);
 	
@@ -970,8 +1183,9 @@ static int cfi_amdstd_write_words(struct mtd_info *mtd, loff_t to, size_t len,
 /*
  * FIXME: interleaved mode not tested, and probably not supported!
  */
-static inline int do_write_buffer(struct map_info *map, struct flchip *chip, 
-				  unsigned long adr, const u_char *buf, int len)
+static int __xipram do_write_buffer(struct map_info *map, struct flchip *chip,
+				    unsigned long adr, const u_char *buf, 
+				    int len)
 {
 	struct cfi_private *cfi = map->fldrv_priv;
 	unsigned long timeo = jiffies + HZ;
@@ -985,10 +1199,10 @@ static inline int do_write_buffer(struct map_info *map, struct flchip *chip,
 	adr += chip->start;
 	cmd_adr = adr;
 
-	cfi_spin_lock(chip->mutex);
+	spin_lock(chip->mutex);
 	ret = get_chip(map, chip, adr, FL_WRITING);
 	if (ret) {
-		cfi_spin_unlock(chip->mutex);
+		spin_unlock(chip->mutex);
 		return ret;
 	}
 
@@ -997,7 +1211,10 @@ static inline int do_write_buffer(struct map_info *map, struct flchip *chip,
 	DEBUG( MTD_DEBUG_LEVEL3, "MTD %s(): WRITE 0x%.8lx(0x%.8lx)\n",
 	       __func__, adr, datum.x[0] );
 
+	XIP_INVAL_CACHED_RANGE(map, adr, len);
 	ENABLE_VPP(map);
+	xip_disable(map, chip, cmd_adr);
+	
 	cfi_send_gen_cmd(0xAA, cfi->addr_unlock1, chip->start, map, cfi, cfi->device_type, NULL);
 	cfi_send_gen_cmd(0x55, cfi->addr_unlock2, chip->start, map, cfi, cfi->device_type, NULL);
 	//cfi_send_gen_cmd(0xA0, cfi->addr_unlock1, chip->start, map, cfi, cfi->device_type, NULL);
@@ -1027,9 +1244,9 @@ static inline int do_write_buffer(struct map_info *map, struct flchip *chip,
 	map_write(map, CMD(0x29), cmd_adr);
 	chip->state = FL_WRITING;
 
-	cfi_spin_unlock(chip->mutex);
-	cfi_udelay(chip->buffer_write_time);
-	cfi_spin_lock(chip->mutex);
+	INVALIDATE_CACHE_UDELAY(map, chip,
+				adr, map_bankwidth(map),
+				chip->word_write_time);
 
 	timeo = jiffies + uWriteTimeout; 
 		
@@ -1040,38 +1257,39 @@ static inline int do_write_buffer(struct map_info *map, struct flchip *chip,
 
 			set_current_state(TASK_UNINTERRUPTIBLE);
 			add_wait_queue(&chip->wq, &wait);
-			cfi_spin_unlock(chip->mutex);
+			spin_unlock(chip->mutex);
 			schedule();
 			remove_wait_queue(&chip->wq, &wait);
 			timeo = jiffies + (HZ / 2); /* FIXME */
-			cfi_spin_lock(chip->mutex);
+			spin_lock(chip->mutex);
 			continue;
 		}
 
-		if (chip_ready(map, adr))
+		if (chip_ready(map, adr)) {
+			xip_enable(map, chip, adr);
 			goto op_done;
+		}
 		    
 		if( time_after(jiffies, timeo))
 			break;
 
 		/* Latency issues. Drop the lock, wait a while and retry */
-		cfi_spin_unlock(chip->mutex);
-		cfi_udelay(1);
-		cfi_spin_lock(chip->mutex);
+		UDELAY(map, chip, adr, 1);
 	}
 
-	printk(KERN_WARNING "MTD %s(): software timeout\n",
-	       __func__ );
-
 	/* reset on all failures. */
 	map_write( map, CMD(0xF0), chip->start );
+	xip_enable(map, chip, adr);
 	/* FIXME - should have reset delay before continuing */
 
+	printk(KERN_WARNING "MTD %s(): software timeout\n",
+	       __func__ );
+
 	ret = -EIO;
  op_done:
 	chip->state = FL_READY;
 	put_chip(map, chip, adr);
-	cfi_spin_unlock(chip->mutex);
+	spin_unlock(chip->mutex);
 
 	return ret;
 }
@@ -1161,7 +1379,7 @@ static int cfi_amdstd_write_buffers(struct mtd_info *mtd, loff_t to, size_t len,
  * Handle devices with one erase region, that only implement
  * the chip erase command.
  */
-static inline int do_erase_chip(struct map_info *map, struct flchip *chip)
+static int __xipram do_erase_chip(struct map_info *map, struct flchip *chip)
 {
 	struct cfi_private *cfi = map->fldrv_priv;
 	unsigned long timeo = jiffies + HZ;
@@ -1171,17 +1389,20 @@ static inline int do_erase_chip(struct map_info *map, struct flchip *chip)
 
 	adr = cfi->addr_unlock1;
 
-	cfi_spin_lock(chip->mutex);
+	spin_lock(chip->mutex);
 	ret = get_chip(map, chip, adr, FL_WRITING);
 	if (ret) {
-		cfi_spin_unlock(chip->mutex);
+		spin_unlock(chip->mutex);
 		return ret;
 	}
 
 	DEBUG( MTD_DEBUG_LEVEL3, "MTD %s(): ERASE 0x%.8lx\n",
 	       __func__, chip->start );
 
+	XIP_INVAL_CACHED_RANGE(map, adr, map->size);
 	ENABLE_VPP(map);
+	xip_disable(map, chip, adr);
+
 	cfi_send_gen_cmd(0xAA, cfi->addr_unlock1, chip->start, map, cfi, cfi->device_type, NULL);
 	cfi_send_gen_cmd(0x55, cfi->addr_unlock2, chip->start, map, cfi, cfi->device_type, NULL);
 	cfi_send_gen_cmd(0x80, cfi->addr_unlock1, chip->start, map, cfi, cfi->device_type, NULL);
@@ -1193,9 +1414,9 @@ static inline int do_erase_chip(struct map_info *map, struct flchip *chip)
 	chip->erase_suspended = 0;
 	chip->in_progress_block_addr = adr;
 
-	cfi_spin_unlock(chip->mutex);
-	msleep(chip->erase_time/2);
-	cfi_spin_lock(chip->mutex);
+	INVALIDATE_CACHE_UDELAY(map, chip,
+				adr, map->size,
+				chip->erase_time*500);
 
 	timeo = jiffies + (HZ*20);
 
@@ -1204,10 +1425,10 @@ static inline int do_erase_chip(struct map_info *map, struct flchip *chip)
 			/* Someone's suspended the erase. Sleep */
 			set_current_state(TASK_UNINTERRUPTIBLE);
 			add_wait_queue(&chip->wq, &wait);
-			cfi_spin_unlock(chip->mutex);
+			spin_unlock(chip->mutex);
 			schedule();
 			remove_wait_queue(&chip->wq, &wait);
-			cfi_spin_lock(chip->mutex);
+			spin_lock(chip->mutex);
 			continue;
 		}
 		if (chip->erase_suspended) {
@@ -1227,10 +1448,7 @@ static inline int do_erase_chip(struct map_info *map, struct flchip *chip)
 		}
 
 		/* Latency issues. Drop the lock, wait a while and retry */
-		cfi_spin_unlock(chip->mutex);
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		schedule_timeout(1);
-		cfi_spin_lock(chip->mutex);
+		UDELAY(map, chip, adr, 1000000/HZ);
 	}
 	/* Did we succeed? */
 	if (!chip_good(map, adr, map_word_ff(map))) {
@@ -1242,14 +1460,15 @@ static inline int do_erase_chip(struct map_info *map, struct flchip *chip)
 	}
 
 	chip->state = FL_READY;
+	xip_enable(map, chip, adr);
 	put_chip(map, chip, adr);
-	cfi_spin_unlock(chip->mutex);
+	spin_unlock(chip->mutex);
 
 	return ret;
 }
 
 
-static inline int do_erase_oneblock(struct map_info *map, struct flchip *chip, unsigned long adr, int len, void *thunk)
+static int __xipram do_erase_oneblock(struct map_info *map, struct flchip *chip, unsigned long adr, int len, void *thunk)
 {
 	struct cfi_private *cfi = map->fldrv_priv;
 	unsigned long timeo = jiffies + HZ;
@@ -1258,17 +1477,20 @@ static inline int do_erase_oneblock(struct map_info *map, struct flchip *chip, u
 
 	adr += chip->start;
 
-	cfi_spin_lock(chip->mutex);
+	spin_lock(chip->mutex);
 	ret = get_chip(map, chip, adr, FL_ERASING);
 	if (ret) {
-		cfi_spin_unlock(chip->mutex);
+		spin_unlock(chip->mutex);
 		return ret;
 	}
 
 	DEBUG( MTD_DEBUG_LEVEL3, "MTD %s(): ERASE 0x%.8lx\n",
 	       __func__, adr );
 
+	XIP_INVAL_CACHED_RANGE(map, adr, len);
 	ENABLE_VPP(map);
+	xip_disable(map, chip, adr);
+
 	cfi_send_gen_cmd(0xAA, cfi->addr_unlock1, chip->start, map, cfi, cfi->device_type, NULL);
 	cfi_send_gen_cmd(0x55, cfi->addr_unlock2, chip->start, map, cfi, cfi->device_type, NULL);
 	cfi_send_gen_cmd(0x80, cfi->addr_unlock1, chip->start, map, cfi, cfi->device_type, NULL);
@@ -1279,10 +1501,10 @@ static inline int do_erase_oneblock(struct map_info *map, struct flchip *chip, u
 	chip->state = FL_ERASING;
 	chip->erase_suspended = 0;
 	chip->in_progress_block_addr = adr;
-	
-	cfi_spin_unlock(chip->mutex);
-	msleep(chip->erase_time/2);
-	cfi_spin_lock(chip->mutex);
+
+	INVALIDATE_CACHE_UDELAY(map, chip,
+				adr, len,
+				chip->erase_time*500);
 
 	timeo = jiffies + (HZ*20);
 
@@ -1291,10 +1513,10 @@ static inline int do_erase_oneblock(struct map_info *map, struct flchip *chip, u
 			/* Someone's suspended the erase. Sleep */
 			set_current_state(TASK_UNINTERRUPTIBLE);
 			add_wait_queue(&chip->wq, &wait);
-			cfi_spin_unlock(chip->mutex);
+			spin_unlock(chip->mutex);
 			schedule();
 			remove_wait_queue(&chip->wq, &wait);
-			cfi_spin_lock(chip->mutex);
+			spin_lock(chip->mutex);
 			continue;
 		}
 		if (chip->erase_suspended) {
@@ -1304,20 +1526,20 @@ static inline int do_erase_oneblock(struct map_info *map, struct flchip *chip, u
 			chip->erase_suspended = 0;
 		}
 
-		if (chip_ready(map, adr))
+		if (chip_ready(map, adr)) {
+			xip_enable(map, chip, adr);
 			break;
+		}
 
 		if (time_after(jiffies, timeo)) {
+			xip_enable(map, chip, adr);
 			printk(KERN_WARNING "MTD %s(): software timeout\n",
 				__func__ );
 			break;
 		}
 
 		/* Latency issues. Drop the lock, wait a while and retry */
-		cfi_spin_unlock(chip->mutex);
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		schedule_timeout(1);
-		cfi_spin_lock(chip->mutex);
+		UDELAY(map, chip, adr, 1000000/HZ);
 	}
 	/* Did we succeed? */
 	if (!chip_good(map, adr, map_word_ff(map))) {
@@ -1330,7 +1552,7 @@ static inline int do_erase_oneblock(struct map_info *map, struct flchip *chip, u
 
 	chip->state = FL_READY;
 	put_chip(map, chip, adr);
-	cfi_spin_unlock(chip->mutex);
+	spin_unlock(chip->mutex);
 	return ret;
 }
 
@@ -1390,7 +1612,7 @@ static void cfi_amdstd_sync (struct mtd_info *mtd)
 		chip = &cfi->chips[i];
 
 	retry:
-		cfi_spin_lock(chip->mutex);
+		spin_lock(chip->mutex);
 
 		switch(chip->state) {
 		case FL_READY:
@@ -1404,14 +1626,14 @@ static void cfi_amdstd_sync (struct mtd_info *mtd)
 			 * with the chip now anyway.
 			 */
 		case FL_SYNCING:
-			cfi_spin_unlock(chip->mutex);
+			spin_unlock(chip->mutex);
 			break;
 
 		default:
 			/* Not an idle state */
 			add_wait_queue(&chip->wq, &wait);
 			
-			cfi_spin_unlock(chip->mutex);
+			spin_unlock(chip->mutex);
 
 			schedule();
 
@@ -1426,13 +1648,13 @@ static void cfi_amdstd_sync (struct mtd_info *mtd)
 	for (i--; i >=0; i--) {
 		chip = &cfi->chips[i];
 
-		cfi_spin_lock(chip->mutex);
+		spin_lock(chip->mutex);
 		
 		if (chip->state == FL_SYNCING) {
 			chip->state = chip->oldstate;
 			wake_up(&chip->wq);
 		}
-		cfi_spin_unlock(chip->mutex);
+		spin_unlock(chip->mutex);
 	}
 }
 
@@ -1448,7 +1670,7 @@ static int cfi_amdstd_suspend(struct mtd_info *mtd)
 	for (i=0; !ret && i<cfi->numchips; i++) {
 		chip = &cfi->chips[i];
 
-		cfi_spin_lock(chip->mutex);
+		spin_lock(chip->mutex);
 
 		switch(chip->state) {
 		case FL_READY:
@@ -1468,7 +1690,7 @@ static int cfi_amdstd_suspend(struct mtd_info *mtd)
 			ret = -EAGAIN;
 			break;
 		}
-		cfi_spin_unlock(chip->mutex);
+		spin_unlock(chip->mutex);
 	}
 
 	/* Unlock the chips again */
@@ -1477,13 +1699,13 @@ static int cfi_amdstd_suspend(struct mtd_info *mtd)
 		for (i--; i >=0; i--) {
 			chip = &cfi->chips[i];
 
-			cfi_spin_lock(chip->mutex);
+			spin_lock(chip->mutex);
 		
 			if (chip->state == FL_PM_SUSPENDED) {
 				chip->state = chip->oldstate;
 				wake_up(&chip->wq);
 			}
-			cfi_spin_unlock(chip->mutex);
+			spin_unlock(chip->mutex);
 		}
 	}
 	
@@ -1502,7 +1724,7 @@ static void cfi_amdstd_resume(struct mtd_info *mtd)
 	
 		chip = &cfi->chips[i];
 
-		cfi_spin_lock(chip->mutex);
+		spin_lock(chip->mutex);
 		
 		if (chip->state == FL_PM_SUSPENDED) {
 			chip->state = FL_READY;
@@ -1512,7 +1734,7 @@ static void cfi_amdstd_resume(struct mtd_info *mtd)
 		else
 			printk(KERN_ERR "Argh. Chip not in PM_SUSPENDED state upon resume()\n");
 
-		cfi_spin_unlock(chip->mutex);
+		spin_unlock(chip->mutex);
 	}
 }
 
diff --git a/drivers/mtd/chips/fwh_lock.h b/drivers/mtd/chips/fwh_lock.h
index fbf44708a861..e1a5b76596c5 100644
--- a/drivers/mtd/chips/fwh_lock.h
+++ b/drivers/mtd/chips/fwh_lock.h
@@ -58,10 +58,10 @@ static int fwh_xxlock_oneblock(struct map_info *map, struct flchip *chip,
 	 * to flash memory - that means that we don't have to check status
 	 * and timeout.
 	 */
-	cfi_spin_lock(chip->mutex);
+	spin_lock(chip->mutex);
 	ret = get_chip(map, chip, adr, FL_LOCKING);
 	if (ret) {
-		cfi_spin_unlock(chip->mutex);
+		spin_unlock(chip->mutex);
 		return ret;
 	}
 
@@ -71,7 +71,7 @@ static int fwh_xxlock_oneblock(struct map_info *map, struct flchip *chip,
 	/* Done and happy. */
 	chip->state = FL_READY;
 	put_chip(map, chip, adr);
-	cfi_spin_unlock(chip->mutex);
+	spin_unlock(chip->mutex);
 	return 0;
 }
 
diff --git a/drivers/mtd/maps/map_funcs.c b/drivers/mtd/maps/map_funcs.c
index 38f6a7af53f8..9105e6ca0aa6 100644
--- a/drivers/mtd/maps/map_funcs.c
+++ b/drivers/mtd/maps/map_funcs.c
@@ -1,5 +1,5 @@
 /*
- * $Id: map_funcs.c,v 1.9 2004/07/13 22:33:15 dwmw2 Exp $
+ * $Id: map_funcs.c,v 1.10 2005/06/06 23:04:36 tpoynor Exp $
  *
  * Out-of-line map I/O functions for simple maps when CONFIG_COMPLEX_MAPPINGS
  * is enabled.
@@ -9,23 +9,24 @@
 #include <linux/module.h>
 
 #include <linux/mtd/map.h>
+#include <linux/mtd/xip.h>
 
-static map_word simple_map_read(struct map_info *map, unsigned long ofs)
+static map_word __xipram simple_map_read(struct map_info *map, unsigned long ofs)
 {
 	return inline_map_read(map, ofs);
 }
 
-static void simple_map_write(struct map_info *map, const map_word datum, unsigned long ofs)
+static void __xipram simple_map_write(struct map_info *map, const map_word datum, unsigned long ofs)
 {
 	inline_map_write(map, datum, ofs);
 }
 
-static void simple_map_copy_from(struct map_info *map, void *to, unsigned long from, ssize_t len)
+static void __xipram simple_map_copy_from(struct map_info *map, void *to, unsigned long from, ssize_t len)
 {
 	inline_map_copy_from(map, to, from, len);
 }
 
-static void simple_map_copy_to(struct map_info *map, unsigned long to, const void *from, ssize_t len)
+static void __xipram simple_map_copy_to(struct map_info *map, unsigned long to, const void *from, ssize_t len)
 {
 	inline_map_copy_to(map, to, from, len);
 }
diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h
index 66e0a32efbac..e6b6a1c66bd5 100644
--- a/include/linux/mtd/cfi.h
+++ b/include/linux/mtd/cfi.h
@@ -1,7 +1,7 @@
 
 /* Common Flash Interface structures 
  * See http://support.intel.com/design/flash/technote/index.htm
- * $Id: cfi.h,v 1.53 2005/03/15 19:03:13 gleixner Exp $
+ * $Id: cfi.h,v 1.54 2005/06/06 23:04:36 tpoynor Exp $
  */
 
 #ifndef __MTD_CFI_H__
@@ -428,16 +428,6 @@ static inline void cfi_udelay(int us)
 	}
 }
 
-static inline void cfi_spin_lock(spinlock_t *mutex)
-{
-	spin_lock_bh(mutex);
-}
-
-static inline void cfi_spin_unlock(spinlock_t *mutex)
-{
-	spin_unlock_bh(mutex);
-}
-
 struct cfi_extquery *cfi_read_pri(struct map_info *map, uint16_t adr, uint16_t size,
 			     const char* name);
 struct cfi_fixup {
-- 
cgit v1.2.3-59-g8ed1b


From bfabb98688e7089381baa0974f7ff6786ce2a2d0 Mon Sep 17 00:00:00 2001
From: Sean Young <sean@mess.org>
Date: Mon, 13 Jun 2005 14:08:48 +0100
Subject: [MTD] Use correct major number for INFTL

INFTL was assigned the new major number 96; 94 is in use by dasd. See:
http://www.ussg.iu.edu/hypermail/linux/kernel/0409.2/1220.html

Signed-off-by: Sean Young <sean@mess.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/mtd/inftl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mtd/inftl.h b/include/linux/mtd/inftl.h
index b52c8cbd235c..0268125a6271 100644
--- a/include/linux/mtd/inftl.h
+++ b/include/linux/mtd/inftl.h
@@ -3,7 +3,7 @@
  *
  *	(C) Copyright 2002, Greg Ungerer (gerg@snapgear.com)
  *
- *	$Id: inftl.h,v 1.6 2004/06/30 14:49:00 dbrown Exp $
+ *	$Id: inftl.h,v 1.7 2005/06/13 13:08:45 sean Exp $
  */
 
 #ifndef __MTD_INFTL_H__
@@ -20,7 +20,7 @@
 #include <mtd/inftl-user.h>
 
 #ifndef INFTL_MAJOR
-#define INFTL_MAJOR 94
+#define INFTL_MAJOR 96
 #endif
 #define INFTL_PARTN_BITS 4
 
-- 
cgit v1.2.3-59-g8ed1b


From 0edb586049e57c56e625536476931117a57671e9 Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cohuck@de.ibm.com>
Date: Wed, 22 Jun 2005 16:59:51 +0200
Subject: [PATCH] driver core: add bus_find_device & driver_find_device
 functions

Add bus_find_device() and driver_find_device(), which search the bus's or
the driver's klist, respectively, for a device and take a reference on it.

Signed-off-by: Cornelia Huck <cohuck@de.ibm.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/bus.c     | 34 ++++++++++++++++++++++++++++++++++
 drivers/base/driver.c  | 35 +++++++++++++++++++++++++++++++++++
 include/linux/device.h |  5 +++++
 3 files changed, 74 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index c3fac7fd555e..2c64b792d074 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -177,6 +177,39 @@ int bus_for_each_dev(struct bus_type * bus, struct device * start,
 	return error;
 }
 
+/**
+ * bus_find_device - device iterator for locating a particular device.
+ * @bus: bus type
+ * @start: Device to begin with
+ * @data: Data to pass to match function
+ * @match: Callback function to check device
+ *
+ * This is similar to the bus_for_each_dev() function above, but it
+ * returns a reference to a device that is 'found' for later use, as
+ * determined by the @match callback.
+ *
+ * The callback should return 0 if the device doesn't match and non-zero
+ * if it does.  If the callback returns non-zero, this function will
+ * return to the caller and not iterate over any more devices.
+ */
+struct device * bus_find_device(struct bus_type *bus,
+				struct device *start, void *data,
+				int (*match)(struct device *, void *))
+{
+	struct klist_iter i;
+	struct device *dev;
+
+	if (!bus)
+		return NULL;
+
+	klist_iter_init_node(&bus->klist_devices, &i,
+			     (start ? &start->knode_bus : NULL));
+	while ((dev = next_device(&i)))
+		if (match(dev, data) && get_device(dev))
+			break;
+	klist_iter_exit(&i);
+	return dev;
+}
 
 
 static struct device_driver * next_driver(struct klist_iter * i)
@@ -557,6 +590,7 @@ int __init buses_init(void)
 
 
 EXPORT_SYMBOL_GPL(bus_for_each_dev);
+EXPORT_SYMBOL_GPL(bus_find_device);
 EXPORT_SYMBOL_GPL(bus_for_each_drv);
 
 EXPORT_SYMBOL_GPL(bus_add_device);
diff --git a/drivers/base/driver.c b/drivers/base/driver.c
index 1b645886e9eb..291c5954a3af 100644
--- a/drivers/base/driver.c
+++ b/drivers/base/driver.c
@@ -55,6 +55,41 @@ int driver_for_each_device(struct device_driver * drv, struct device * start,
 EXPORT_SYMBOL_GPL(driver_for_each_device);
 
 
+/**
+ * driver_find_device - device iterator for locating a particular device.
+ * @driver: The device's driver
+ * @start: Device to begin with
+ * @data: Data to pass to match function
+ * @match: Callback function to check device
+ *
+ * This is similar to the driver_for_each_device() function above, but
+ * it returns a reference to a device that is 'found' for later use, as
+ * determined by the @match callback.
+ *
+ * The callback should return 0 if the device doesn't match and non-zero
+ * if it does.  If the callback returns non-zero, this function will
+ * return to the caller and not iterate over any more devices.
+ */
+struct device * driver_find_device(struct device_driver *drv,
+				   struct device * start, void * data,
+				   int (*match)(struct device *, void *))
+{
+	struct klist_iter i;
+	struct device *dev;
+
+	if (!drv)
+		return NULL;
+
+	klist_iter_init_node(&drv->klist_devices, &i,
+			     (start ? &start->knode_driver : NULL));
+	while ((dev = next_device(&i)))
+		if (match(dev, data) && get_device(dev))
+			break;
+	klist_iter_exit(&i);
+	return dev;
+}
+EXPORT_SYMBOL_GPL(driver_find_device);
+
 /**
  *	driver_create_file - create sysfs file for driver.
  *	@drv:	driver.
diff --git a/include/linux/device.h b/include/linux/device.h
index 7b781a72b293..07222c531d37 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -80,6 +80,8 @@ extern struct bus_type * find_bus(char * name);
 
 int bus_for_each_dev(struct bus_type * bus, struct device * start, void * data,
 		     int (*fn)(struct device *, void *));
+struct device * bus_find_device(struct bus_type *bus, struct device *start,
+				void *data, int (*match)(struct device *, void *));
 
 int bus_for_each_drv(struct bus_type * bus, struct device_driver * start, 
 		     void * data, int (*fn)(struct device_driver *, void *));
@@ -142,6 +144,9 @@ extern void driver_remove_file(struct device_driver *, struct driver_attribute *
 
 extern int driver_for_each_device(struct device_driver * drv, struct device * start,
 				  void * data, int (*fn)(struct device *, void *));
+struct device * driver_find_device(struct device_driver *drv,
+				   struct device *start, void *data,
+				   int (*match)(struct device *, void *));
 
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 23d3d602cb96addd3c1158424fb01a49ea5e81b1 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Wed, 22 Jun 2005 16:09:05 -0700
Subject: [PATCH] driver core: change bus_rescan_devices to return void

No one was looking at the return value of bus_rescan_devices(), and it
really wasn't anything that anyone in the kernel would ever care about.
So change it to return void, which also allows some counting code to be
removed.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/base/bus.c     | 27 +++++++++------------------
 include/linux/device.h |  2 +-
 2 files changed, 10 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 7e17488271a8..96fe2f956754 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -483,31 +483,22 @@ void bus_remove_driver(struct device_driver * drv)
 /* Helper for bus_rescan_devices's iter */
 static int bus_rescan_devices_helper(struct device *dev, void *data)
 {
-	int *count = data;
-
-	if (!dev->driver && (device_attach(dev) > 0))
-		(*count)++;
-
+	if (!dev->driver)
+		device_attach(dev);
 	return 0;
 }
 
-
 /**
- *	bus_rescan_devices - rescan devices on the bus for possible drivers
- *	@bus:	the bus to scan.
+ * bus_rescan_devices - rescan devices on the bus for possible drivers
+ * @bus: the bus to scan.
  *
- *	This function will look for devices on the bus with no driver
- *	attached and rescan it against existing drivers to see if it
- *	matches any. Calls device_attach(). Returns the number of devices
- *	that were sucessfully bound to a driver.
+ * This function will look for devices on the bus with no driver
+ * attached and rescan it against existing drivers to see if it matches
+ * any by calling device_attach() for the unbound devices.
  */
-int bus_rescan_devices(struct bus_type * bus)
+void bus_rescan_devices(struct bus_type * bus)
 {
-	int count = 0;
-
-	bus_for_each_dev(bus, NULL, &count, bus_rescan_devices_helper);
-
-	return count;
+	bus_for_each_dev(bus, NULL, NULL, bus_rescan_devices_helper);
 }
 
 
diff --git a/include/linux/device.h b/include/linux/device.h
index 07222c531d37..f378c846e6d5 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -69,7 +69,7 @@ struct bus_type {
 extern int bus_register(struct bus_type * bus);
 extern void bus_unregister(struct bus_type * bus);
 
-extern int bus_rescan_devices(struct bus_type * bus);
+extern void bus_rescan_devices(struct bus_type * bus);
 
 extern struct bus_type * get_bus(struct bus_type * bus);
 extern void put_bus(struct bus_type * bus);
-- 
cgit v1.2.3-59-g8ed1b


From 0048e6030d41453c2f5ce0e9aead910d46cfd448 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor_core@ameritech.net>
Date: Thu, 30 Jun 2005 00:48:14 -0500
Subject: Input: uinput - use completions instead of events and manual
 wakeups in force feedback code.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/input/misc/uinput.c | 81 ++++++++++++++++++++++++---------------------
 include/linux/uinput.h      |  5 ++-
 2 files changed, 45 insertions(+), 41 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c
index 9c3d20073ae3..c3eebf593ab6 100644
--- a/drivers/input/misc/uinput.c
+++ b/drivers/input/misc/uinput.c
@@ -53,24 +53,23 @@ static int uinput_dev_event(struct input_dev *dev, unsigned int type, unsigned i
 	return 0;
 }
 
-static int uinput_request_alloc_id(struct input_dev *dev, struct uinput_request *request)
+static int uinput_request_alloc_id(struct uinput_device *udev, struct uinput_request *request)
 {
 	/* Atomically allocate an ID for the given request. Returns 0 on success. */
-	struct uinput_device *udev = dev->private;
 	int id;
 	int err = -1;
 
-	down(&udev->requests_sem);
+	spin_lock(&udev->requests_lock);
 
 	for (id = 0; id < UINPUT_NUM_REQUESTS; id++)
 		if (!udev->requests[id]) {
-			udev->requests[id] = request;
 			request->id = id;
+			udev->requests[id] = request;
 			err = 0;
 			break;
 		}
 
-	up(&udev->requests_sem);
+	spin_unlock(&udev->requests_lock);
 	return err;
 }
 
@@ -79,70 +78,78 @@ static struct uinput_request* uinput_request_find(struct uinput_device *udev, in
 	/* Find an input request, by ID. Returns NULL if the ID isn't valid. */
 	if (id >= UINPUT_NUM_REQUESTS || id < 0)
 		return NULL;
-	if (udev->requests[id]->completed)
-		return NULL;
 	return udev->requests[id];
 }
 
-static void uinput_request_init(struct input_dev *dev, struct uinput_request *request, int code)
+static inline int uinput_request_reserve_slot(struct uinput_device *udev, struct uinput_request *request)
 {
-	struct uinput_device *udev = dev->private;
+	/* Allocate slot. If none are available right away, wait. */
+	return wait_event_interruptible(udev->requests_waitq,
+					!uinput_request_alloc_id(udev, request));
+}
 
-	memset(request, 0, sizeof(struct uinput_request));
-	request->code = code;
-	init_waitqueue_head(&request->waitq);
+static void uinput_request_done(struct uinput_device *udev, struct uinput_request *request)
+{
+	complete(&request->done);
 
-	/* Allocate an ID. If none are available right away, wait. */
-	request->retval = wait_event_interruptible(udev->requests_waitq,
-					!uinput_request_alloc_id(dev, request));
+	/* Mark slot as available */
+	udev->requests[request->id] = NULL;
+	wake_up_interruptible(&udev->requests_waitq);
 }
 
-static void uinput_request_submit(struct input_dev *dev, struct uinput_request *request)
+static int uinput_request_submit(struct input_dev *dev, struct uinput_request *request)
 {
-	struct uinput_device *udev = dev->private;
 	int retval;
 
 	/* Tell our userspace app about this new request by queueing an input event */
 	uinput_dev_event(dev, EV_UINPUT, request->code, request->id);
 
 	/* Wait for the request to complete */
-	retval = wait_event_interruptible(request->waitq, request->completed);
-	if (retval)
-		request->retval = retval;
+	retval = wait_for_completion_interruptible(&request->done);
+	if (!retval)
+		retval = request->retval;
 
-	/* Release this request's ID, let others know it's available */
-	udev->requests[request->id] = NULL;
-	wake_up_interruptible(&udev->requests_waitq);
+	return retval;
 }
 
 static int uinput_dev_upload_effect(struct input_dev *dev, struct ff_effect *effect)
 {
 	struct uinput_request request;
+	int retval;
 
 	if (!test_bit(EV_FF, dev->evbit))
 		return -ENOSYS;
 
-	uinput_request_init(dev, &request, UI_FF_UPLOAD);
-	if (request.retval)
-		return request.retval;
+	request.id = -1;
+	init_completion(&request.done);
+	request.code = UI_FF_UPLOAD;
 	request.u.effect = effect;
-	uinput_request_submit(dev, &request);
-	return request.retval;
+
+	retval = uinput_request_reserve_slot(dev->private, &request);
+	if (!retval)
+		retval = uinput_request_submit(dev, &request);
+
+	return retval;
 }
 
 static int uinput_dev_erase_effect(struct input_dev *dev, int effect_id)
 {
 	struct uinput_request request;
+	int retval;
 
 	if (!test_bit(EV_FF, dev->evbit))
 		return -ENOSYS;
 
-	uinput_request_init(dev, &request, UI_FF_ERASE);
-	if (request.retval)
-		return request.retval;
+	request.id = -1;
+	init_completion(&request.done);
+	request.code = UI_FF_ERASE;
 	request.u.effect_id = effect_id;
-	uinput_request_submit(dev, &request);
-	return request.retval;
+
+	retval = uinput_request_reserve_slot(dev->private, &request);
+	if (!retval)
+		retval = uinput_request_submit(dev, &request);
+
+	return retval;
 }
 
 static int uinput_create_device(struct uinput_device *udev)
@@ -189,7 +196,7 @@ static int uinput_open(struct inode *inode, struct file *file)
 	if (!newdev)
 		goto error;
 	memset(newdev, 0, sizeof(struct uinput_device));
-	init_MUTEX(&newdev->requests_sem);
+	spin_lock_init(&newdev->requests_lock);
 	init_waitqueue_head(&newdev->requests_waitq);
 
 	newinput = kmalloc(sizeof(struct input_dev), GFP_KERNEL);
@@ -551,8 +558,7 @@ static int uinput_ioctl(struct inode *inode, struct file *file, unsigned int cmd
 			}
 			req->retval = ff_up.retval;
 			memcpy(req->u.effect, &ff_up.effect, sizeof(struct ff_effect));
-			req->completed = 1;
-			wake_up_interruptible(&req->waitq);
+			uinput_request_done(udev, req);
 			break;
 
 		case UI_END_FF_ERASE:
@@ -566,8 +572,7 @@ static int uinput_ioctl(struct inode *inode, struct file *file, unsigned int cmd
 				break;
 			}
 			req->retval = ff_erase.retval;
-			req->completed = 1;
-			wake_up_interruptible(&req->waitq);
+			uinput_request_done(udev, req);
 			break;
 
 		default:
diff --git a/include/linux/uinput.h b/include/linux/uinput.h
index 4c2c82336d10..84876077027f 100644
--- a/include/linux/uinput.h
+++ b/include/linux/uinput.h
@@ -42,8 +42,7 @@ struct uinput_request {
 	int			code;	/* UI_FF_UPLOAD, UI_FF_ERASE */
 
 	int			retval;
-	wait_queue_head_t	waitq;
-	int			completed;
+	struct completion	done;
 
 	union {
 		int		effect_id;
@@ -62,7 +61,7 @@ struct uinput_device {
 
 	struct uinput_request	*requests[UINPUT_NUM_REQUESTS];
 	wait_queue_head_t	requests_waitq;
-	struct semaphore	requests_sem;
+	spinlock_t		requests_lock;
 };
 #endif	/* __KERNEL__ */
 
-- 
cgit v1.2.3-59-g8ed1b


From 16a334c0de5a94b1d10a1ac9a33f4dedac89a075 Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor_core@ameritech.net>
Date: Thu, 30 Jun 2005 00:49:08 -0500
Subject: Input: introduce usb_to_input_id() to uniformly produce struct
 input_id for USB input devices.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/usb/input/acecad.c      |  6 ++----
 drivers/usb/input/aiptek.c      |  6 ++----
 drivers/usb/input/ati_remote.c  |  8 +++-----
 drivers/usb/input/hid-input.c   |  6 ++----
 drivers/usb/input/itmtouch.c    |  6 ++----
 drivers/usb/input/kbtab.c       |  6 ++----
 drivers/usb/input/mtouchusb.c   |  6 ++----
 drivers/usb/input/powermate.c   |  6 ++----
 drivers/usb/input/touchkitusb.c |  7 ++-----
 drivers/usb/input/usbkbd.c      |  6 ++----
 drivers/usb/input/usbmouse.c    |  6 ++----
 drivers/usb/input/wacom.c       |  6 ++----
 drivers/usb/input/xpad.c        |  6 ++----
 drivers/usb/media/konicawc.c    |  6 ++----
 include/linux/usb_input.h       | 25 +++++++++++++++++++++++++
 15 files changed, 54 insertions(+), 58 deletions(-)
 create mode 100644 include/linux/usb_input.h

(limited to 'include/linux')

diff --git a/drivers/usb/input/acecad.c b/drivers/usb/input/acecad.c
index 68039b04af3b..13532f3e3efc 100644
--- a/drivers/usb/input/acecad.c
+++ b/drivers/usb/input/acecad.c
@@ -31,6 +31,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/usb.h>
+#include <linux/usb_input.h>
 
 /*
  * Version Information
@@ -212,10 +213,7 @@ static int usb_acecad_probe(struct usb_interface *intf, const struct usb_device_
 
 	acecad->dev.name = acecad->name;
 	acecad->dev.phys = acecad->phys;
-	acecad->dev.id.bustype = BUS_USB;
-	acecad->dev.id.vendor = le16_to_cpu(dev->descriptor.idVendor);
-	acecad->dev.id.product = le16_to_cpu(dev->descriptor.idProduct);
-	acecad->dev.id.version = le16_to_cpu(dev->descriptor.bcdDevice);
+	usb_to_input_id(dev, &acecad->dev.id);
 	acecad->dev.dev = &intf->dev;
 
 	usb_fill_int_urb(acecad->irq, dev, pipe,
diff --git a/drivers/usb/input/aiptek.c b/drivers/usb/input/aiptek.c
index 6bb0f25e8e93..cd0cbfe20723 100644
--- a/drivers/usb/input/aiptek.c
+++ b/drivers/usb/input/aiptek.c
@@ -77,6 +77,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/usb.h>
+#include <linux/usb_input.h>
 #include <linux/sched.h>
 #include <asm/uaccess.h>
 #include <asm/unaligned.h>
@@ -2125,10 +2126,7 @@ aiptek_probe(struct usb_interface *intf, const struct usb_device_id *id)
 	aiptek->inputdev.absflat[ABS_WHEEL] = 0;
 	aiptek->inputdev.name = "Aiptek";
 	aiptek->inputdev.phys = aiptek->features.usbPath;
-	aiptek->inputdev.id.bustype = BUS_USB;
-	aiptek->inputdev.id.vendor = le16_to_cpu(usbdev->descriptor.idVendor);
-	aiptek->inputdev.id.product = le16_to_cpu(usbdev->descriptor.idProduct);
-	aiptek->inputdev.id.version = le16_to_cpu(usbdev->descriptor.bcdDevice);
+	usb_to_input_id(usbdev, &aiptek->inputdev.id);
 	aiptek->inputdev.dev = &intf->dev;
 
 	aiptek->usbdev = usbdev;
diff --git a/drivers/usb/input/ati_remote.c b/drivers/usb/input/ati_remote.c
index 654ac454744d..fd99681ee483 100644
--- a/drivers/usb/input/ati_remote.c
+++ b/drivers/usb/input/ati_remote.c
@@ -94,6 +94,7 @@
 #include <linux/moduleparam.h>
 #include <linux/input.h>
 #include <linux/usb.h>
+#include <linux/usb_input.h>
 #include <linux/wait.h>
 
 /*
@@ -635,11 +636,8 @@ static void ati_remote_input_init(struct ati_remote *ati_remote)
 	idev->name = ati_remote->name;
 	idev->phys = ati_remote->phys;
 
-	idev->id.bustype = BUS_USB;
-	idev->id.vendor = le16_to_cpu(ati_remote->udev->descriptor.idVendor);
-	idev->id.product = le16_to_cpu(ati_remote->udev->descriptor.idProduct);
-	idev->id.version = le16_to_cpu(ati_remote->udev->descriptor.bcdDevice);
-	idev->dev = &(ati_remote->udev->dev);
+	usb_to_input_id(ati_remote->udev, &idev->id);
+	idev->dev = &ati_remote->udev->dev;
 }
 
 static int ati_remote_initialize(struct ati_remote *ati_remote)
diff --git a/drivers/usb/input/hid-input.c b/drivers/usb/input/hid-input.c
index 9ac1e9095334..e071c8eeccee 100644
--- a/drivers/usb/input/hid-input.c
+++ b/drivers/usb/input/hid-input.c
@@ -31,6 +31,7 @@
 #include <linux/kernel.h>
 #include <linux/input.h>
 #include <linux/usb.h>
+#include <linux/usb_input.h>
 
 #undef DEBUG
 
@@ -581,10 +582,7 @@ int hidinput_connect(struct hid_device *hid)
 				hidinput->input.name = hid->name;
 				hidinput->input.phys = hid->phys;
 				hidinput->input.uniq = hid->uniq;
-				hidinput->input.id.bustype = BUS_USB;
-				hidinput->input.id.vendor = le16_to_cpu(dev->descriptor.idVendor);
-				hidinput->input.id.product = le16_to_cpu(dev->descriptor.idProduct);
-				hidinput->input.id.version = le16_to_cpu(dev->descriptor.bcdDevice);
+				usb_to_input_id(dev, &hidinput->input.id);
 				hidinput->input.dev = &hid->intf->dev;
 			}
 
diff --git a/drivers/usb/input/itmtouch.c b/drivers/usb/input/itmtouch.c
index 47dec6a1b344..0dc439f10823 100644
--- a/drivers/usb/input/itmtouch.c
+++ b/drivers/usb/input/itmtouch.c
@@ -53,6 +53,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/usb.h>
+#include <linux/usb_input.h>
 
 /* only an 8 byte buffer necessary for a single packet */
 #define ITM_BUFSIZE			8
@@ -184,10 +185,7 @@ static int itmtouch_probe(struct usb_interface *intf, const struct usb_device_id
 
 	itmtouch->inputdev.name = itmtouch->name;
 	itmtouch->inputdev.phys = itmtouch->phys;
-	itmtouch->inputdev.id.bustype = BUS_USB;
-	itmtouch->inputdev.id.vendor = udev->descriptor.idVendor;
-	itmtouch->inputdev.id.product = udev->descriptor.idProduct;
-	itmtouch->inputdev.id.version = udev->descriptor.bcdDevice;
+	usb_to_input_id(udev, &itmtouch->inputdev.id);
 	itmtouch->inputdev.dev = &intf->dev;
 
 	if (!strlen(itmtouch->name))
diff --git a/drivers/usb/input/kbtab.c b/drivers/usb/input/kbtab.c
index d2f0f90a9bcd..b6f6ac8d9c2f 100644
--- a/drivers/usb/input/kbtab.c
+++ b/drivers/usb/input/kbtab.c
@@ -4,6 +4,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/usb.h>
+#include <linux/usb_input.h>
 #include <asm/unaligned.h>
 #include <asm/byteorder.h>
 
@@ -167,10 +168,7 @@ static int kbtab_probe(struct usb_interface *intf, const struct usb_device_id *i
 
 	kbtab->dev.name = "KB Gear Tablet";
 	kbtab->dev.phys = kbtab->phys;
-	kbtab->dev.id.bustype = BUS_USB;
-	kbtab->dev.id.vendor = le16_to_cpu(dev->descriptor.idVendor);
-	kbtab->dev.id.product = le16_to_cpu(dev->descriptor.idProduct);
-	kbtab->dev.id.version = le16_to_cpu(dev->descriptor.bcdDevice);
+	usb_to_input_id(dev, &kbtab->dev.id);
 	kbtab->dev.dev = &intf->dev;
 	kbtab->usbdev = dev;
 
diff --git a/drivers/usb/input/mtouchusb.c b/drivers/usb/input/mtouchusb.c
index 09b5cc7c66de..ff9275057a18 100644
--- a/drivers/usb/input/mtouchusb.c
+++ b/drivers/usb/input/mtouchusb.c
@@ -53,6 +53,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/usb.h>
+#include <linux/usb_input.h>
 
 #define MTOUCHUSB_MIN_XC                0x0
 #define MTOUCHUSB_MAX_RAW_XC            0x4000
@@ -232,10 +233,7 @@ static int mtouchusb_probe(struct usb_interface *intf, const struct usb_device_i
 
 	mtouch->input.name = mtouch->name;
 	mtouch->input.phys = mtouch->phys;
-	mtouch->input.id.bustype = BUS_USB;
-	mtouch->input.id.vendor = le16_to_cpu(udev->descriptor.idVendor);
-	mtouch->input.id.product = le16_to_cpu(udev->descriptor.idProduct);
-	mtouch->input.id.version = le16_to_cpu(udev->descriptor.bcdDevice);
+	usb_to_input_id(udev, &mtouch->input.id);
 	mtouch->input.dev = &intf->dev;
 
 	mtouch->input.evbit[0] = BIT(EV_KEY) | BIT(EV_ABS);
diff --git a/drivers/usb/input/powermate.c b/drivers/usb/input/powermate.c
index 3975b309d55f..ad4afe7e5897 100644
--- a/drivers/usb/input/powermate.c
+++ b/drivers/usb/input/powermate.c
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/usb.h>
+#include <linux/usb_input.h>
 
 #define POWERMATE_VENDOR	0x077d	/* Griffin Technology, Inc. */
 #define POWERMATE_PRODUCT_NEW	0x0410	/* Griffin PowerMate */
@@ -389,10 +390,7 @@ static int powermate_probe(struct usb_interface *intf, const struct usb_device_i
 	pm->input.keybit[LONG(BTN_0)] = BIT(BTN_0);
 	pm->input.relbit[LONG(REL_DIAL)] = BIT(REL_DIAL);
 	pm->input.mscbit[LONG(MSC_PULSELED)] = BIT(MSC_PULSELED);
-	pm->input.id.bustype = BUS_USB;
-	pm->input.id.vendor = le16_to_cpu(udev->descriptor.idVendor);
-	pm->input.id.product = le16_to_cpu(udev->descriptor.idProduct);
-	pm->input.id.version = le16_to_cpu(udev->descriptor.bcdDevice);
+	usb_to_input_id(udev, &pm->input.id);
 	pm->input.event = powermate_input_event;
 	pm->input.dev = &intf->dev;
 	pm->input.phys = pm->phys;
diff --git a/drivers/usb/input/touchkitusb.c b/drivers/usb/input/touchkitusb.c
index 386595ee21c0..4276c24a5080 100644
--- a/drivers/usb/input/touchkitusb.c
+++ b/drivers/usb/input/touchkitusb.c
@@ -35,7 +35,7 @@
 #define DEBUG
 #endif
 #include <linux/usb.h>
-
+#include <linux/usb_input.h>
 
 #define TOUCHKIT_MIN_XC			0x0
 #define TOUCHKIT_MAX_XC			0x07ff
@@ -202,10 +202,7 @@ static int touchkit_probe(struct usb_interface *intf,
 
 	touchkit->input.name = touchkit->name;
 	touchkit->input.phys = touchkit->phys;
-	touchkit->input.id.bustype = BUS_USB;
-	touchkit->input.id.vendor = le16_to_cpu(udev->descriptor.idVendor);
-	touchkit->input.id.product = le16_to_cpu(udev->descriptor.idProduct);
-	touchkit->input.id.version = le16_to_cpu(udev->descriptor.bcdDevice);
+	usb_to_input_id(udev, &touchkit->input.id);
 	touchkit->input.dev = &intf->dev;
 
 	touchkit->input.evbit[0] = BIT(EV_KEY) | BIT(EV_ABS);
diff --git a/drivers/usb/input/usbkbd.c b/drivers/usb/input/usbkbd.c
index f35db1974c42..28987f15eeee 100644
--- a/drivers/usb/input/usbkbd.c
+++ b/drivers/usb/input/usbkbd.c
@@ -32,6 +32,7 @@
 #include <linux/input.h>
 #include <linux/init.h>
 #include <linux/usb.h>
+#include <linux/usb_input.h>
 
 /*
  * Version Information
@@ -288,10 +289,7 @@ static int usb_kbd_probe(struct usb_interface *iface,
 
 	kbd->dev.name = kbd->name;
 	kbd->dev.phys = kbd->phys;
-	kbd->dev.id.bustype = BUS_USB;
-	kbd->dev.id.vendor = le16_to_cpu(dev->descriptor.idVendor);
-	kbd->dev.id.product = le16_to_cpu(dev->descriptor.idProduct);
-	kbd->dev.id.version = le16_to_cpu(dev->descriptor.bcdDevice);
+	usb_to_input_id(dev, &kbd->dev.id);
 	kbd->dev.dev = &iface->dev;
 
 	if (dev->manufacturer)
diff --git a/drivers/usb/input/usbmouse.c b/drivers/usb/input/usbmouse.c
index 1ec41b5effe6..4104dec847fb 100644
--- a/drivers/usb/input/usbmouse.c
+++ b/drivers/usb/input/usbmouse.c
@@ -32,6 +32,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/usb.h>
+#include <linux/usb_input.h>
 
 /*
  * Version Information
@@ -171,10 +172,7 @@ static int usb_mouse_probe(struct usb_interface * intf, const struct usb_device_
 
 	mouse->dev.name = mouse->name;
 	mouse->dev.phys = mouse->phys;
-	mouse->dev.id.bustype = BUS_USB;
-	mouse->dev.id.vendor = le16_to_cpu(dev->descriptor.idVendor);
-	mouse->dev.id.product = le16_to_cpu(dev->descriptor.idProduct);
-	mouse->dev.id.version = le16_to_cpu(dev->descriptor.bcdDevice);
+	usb_to_input_id(dev, &mouse->dev.id);
 	mouse->dev.dev = &intf->dev;
 
 	if (dev->manufacturer)
diff --git a/drivers/usb/input/wacom.c b/drivers/usb/input/wacom.c
index f6b34af66b3d..02412e31a46b 100644
--- a/drivers/usb/input/wacom.c
+++ b/drivers/usb/input/wacom.c
@@ -69,6 +69,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/usb.h>
+#include <linux/usb_input.h>
 #include <asm/unaligned.h>
 #include <asm/byteorder.h>
 
@@ -823,10 +824,7 @@ static int wacom_probe(struct usb_interface *intf, const struct usb_device_id *i
 
 	wacom->dev.name = wacom->features->name;
 	wacom->dev.phys = wacom->phys;
-	wacom->dev.id.bustype = BUS_USB;
-	wacom->dev.id.vendor = le16_to_cpu(dev->descriptor.idVendor);
-	wacom->dev.id.product = le16_to_cpu(dev->descriptor.idProduct);
-	wacom->dev.id.version = le16_to_cpu(dev->descriptor.bcdDevice);
+	usb_to_input_id(dev, &wacom->dev.id);
 	wacom->dev.dev = &intf->dev;
 	wacom->usbdev = dev;
 
diff --git a/drivers/usb/input/xpad.c b/drivers/usb/input/xpad.c
index a7fa1b17dcfe..18125e0bffa2 100644
--- a/drivers/usb/input/xpad.c
+++ b/drivers/usb/input/xpad.c
@@ -62,6 +62,7 @@
 #include <linux/module.h>
 #include <linux/smp_lock.h>
 #include <linux/usb.h>
+#include <linux/usb_input.h>
 
 #define DRIVER_VERSION "v0.0.5"
 #define DRIVER_AUTHOR "Marko Friedemann <mfr@bmx-chemnitz.de>"
@@ -256,10 +257,7 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id
 
 	xpad->udev = udev;
 
-	xpad->dev.id.bustype = BUS_USB;
-	xpad->dev.id.vendor = le16_to_cpu(udev->descriptor.idVendor);
-	xpad->dev.id.product = le16_to_cpu(udev->descriptor.idProduct);
-	xpad->dev.id.version = le16_to_cpu(udev->descriptor.bcdDevice);
+	usb_to_input_id(udev, &xpad->dev.id);
 	xpad->dev.dev = &intf->dev;
 	xpad->dev.private = xpad;
 	xpad->dev.name = xpad_device[i].name;
diff --git a/drivers/usb/media/konicawc.c b/drivers/usb/media/konicawc.c
index 08521a2b4f3d..20ac9e1069d4 100644
--- a/drivers/usb/media/konicawc.c
+++ b/drivers/usb/media/konicawc.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/input.h>
+#include <linux/usb_input.h>
 
 #include "usbvideo.h"
 
@@ -845,10 +846,7 @@ static int konicawc_probe(struct usb_interface *intf, const struct usb_device_id
 		cam->input.private = cam;
 		cam->input.evbit[0] = BIT(EV_KEY);
 		cam->input.keybit[LONG(BTN_0)] = BIT(BTN_0);
-		cam->input.id.bustype = BUS_USB;
-		cam->input.id.vendor = le16_to_cpu(dev->descriptor.idVendor);
-		cam->input.id.product = le16_to_cpu(dev->descriptor.idProduct);
-		cam->input.id.version = le16_to_cpu(dev->descriptor.bcdDevice);
+		usb_to_input_id(dev, &cam->input.id);
 		input_register_device(&cam->input);
 		
 		usb_make_path(dev, cam->input_physname, 56);
diff --git a/include/linux/usb_input.h b/include/linux/usb_input.h
new file mode 100644
index 000000000000..716e0cc16043
--- /dev/null
+++ b/include/linux/usb_input.h
@@ -0,0 +1,25 @@
+#ifndef __USB_INPUT_H
+#define __USB_INPUT_H
+
+/*
+ * Copyright (C) 2005 Dmitry Torokhov
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/usb.h>
+#include <linux/input.h>
+#include <asm/byteorder.h>
+
+static inline void
+usb_to_input_id(const struct usb_device *dev, struct input_id *id)
+{
+	id->bustype = BUS_USB;
+	id->vendor = le16_to_cpu(dev->descriptor.idVendor);
+	id->product = le16_to_cpu(dev->descriptor.idProduct);
+	id->version = le16_to_cpu(dev->descriptor.bcdDevice);
+}
+
+#endif
-- 
cgit v1.2.3-59-g8ed1b


From 5b6271bda42be8edb77fbd588621cc09199fa7fb Mon Sep 17 00:00:00 2001
From: Dmitry Torokhov <dtor_core@ameritech.net>
Date: Thu, 30 Jun 2005 00:50:38 -0500
Subject: Input: make name, phys and uniq be 'const char *' because once       
 set no one should attempt to change them.

Signed-off-by: Dmitry Torokhov <dtor@mail.ru>
---
 drivers/char/sonypi.c       | 24 ++----------------------
 drivers/input/misc/uinput.c | 23 ++++++++++++-----------
 include/linux/input.h       |  6 +++---
 3 files changed, 17 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/sonypi.c b/drivers/char/sonypi.c
index 983915bf87f1..cefbe985e55c 100644
--- a/drivers/char/sonypi.c
+++ b/drivers/char/sonypi.c
@@ -1228,14 +1228,7 @@ static int __devinit sonypi_probe(void)
 		sonypi_device.input_jog_dev.keybit[LONG(BTN_MOUSE)] =
 			BIT(BTN_MIDDLE);
 		sonypi_device.input_jog_dev.relbit[0] = BIT(REL_WHEEL);
-		sonypi_device.input_jog_dev.name =
-			kmalloc(sizeof(SONYPI_JOG_INPUTNAME), GFP_KERNEL);
-		if (!sonypi_device.input_jog_dev.name) {
-			printk(KERN_ERR "sonypi: kmalloc failed\n");
-			ret = -ENOMEM;
-			goto out_inkmallocinput1;
-		}
-		sprintf(sonypi_device.input_jog_dev.name, SONYPI_JOG_INPUTNAME);
+		sonypi_device.input_jog_dev.name = SONYPI_JOG_INPUTNAME;
 		sonypi_device.input_jog_dev.id.bustype = BUS_ISA;
 		sonypi_device.input_jog_dev.id.vendor = PCI_VENDOR_ID_SONY;
 
@@ -1249,14 +1242,7 @@ static int __devinit sonypi_probe(void)
 			if (sonypi_inputkeys[i].inputev)
 				set_bit(sonypi_inputkeys[i].inputev,
 					sonypi_device.input_key_dev.keybit);
-		sonypi_device.input_key_dev.name =
-			kmalloc(sizeof(SONYPI_KEY_INPUTNAME), GFP_KERNEL);
-		if (!sonypi_device.input_key_dev.name) {
-			printk(KERN_ERR "sonypi: kmalloc failed\n");
-			ret = -ENOMEM;
-			goto out_inkmallocinput2;
-		}
-		sprintf(sonypi_device.input_key_dev.name, SONYPI_KEY_INPUTNAME);
+		sonypi_device.input_key_dev.name = SONYPI_KEY_INPUTNAME;
 		sonypi_device.input_key_dev.id.bustype = BUS_ISA;
 		sonypi_device.input_key_dev.id.vendor = PCI_VENDOR_ID_SONY;
 
@@ -1314,11 +1300,7 @@ out_platformdev:
 	kfifo_free(sonypi_device.input_fifo);
 out_infifo:
 	input_unregister_device(&sonypi_device.input_key_dev);
-	kfree(sonypi_device.input_key_dev.name);
-out_inkmallocinput2:
 	input_unregister_device(&sonypi_device.input_jog_dev);
-	kfree(sonypi_device.input_jog_dev.name);
-out_inkmallocinput1:
 	free_irq(sonypi_device.irq, sonypi_irq);
 out_reqirq:
 	release_region(sonypi_device.ioport1, sonypi_device.region_size);
@@ -1345,9 +1327,7 @@ static void __devexit sonypi_remove(void)
 
 	if (useinput) {
 		input_unregister_device(&sonypi_device.input_key_dev);
-		kfree(sonypi_device.input_key_dev.name);
 		input_unregister_device(&sonypi_device.input_jog_dev);
-		kfree(sonypi_device.input_jog_dev.name);
 		kfifo_free(sonypi_device.input_fifo);
 	}
 
diff --git a/drivers/input/misc/uinput.c b/drivers/input/misc/uinput.c
index c3eebf593ab6..d5c5b32045af 100644
--- a/drivers/input/misc/uinput.c
+++ b/drivers/input/misc/uinput.c
@@ -251,6 +251,7 @@ static int uinput_alloc_device(struct file *file, const char __user *buffer, siz
 	struct uinput_user_dev	*user_dev;
 	struct input_dev	*dev;
 	struct uinput_device	*udev;
+	char			*name;
 	int			size;
 	int			retval;
 
@@ -274,13 +275,13 @@ static int uinput_alloc_device(struct file *file, const char __user *buffer, siz
 		kfree(dev->name);
 
 	size = strnlen(user_dev->name, UINPUT_MAX_NAME_SIZE) + 1;
-	dev->name = kmalloc(size, GFP_KERNEL);
-	if (!dev->name) {
+	dev->name = name = kmalloc(size, GFP_KERNEL);
+	if (!name) {
 		retval = -ENOMEM;
 		goto exit;
 	}
+	strlcpy(name, user_dev->name, size);
 
-	strlcpy(dev->name, user_dev->name, size);
 	dev->id.bustype	= user_dev->id.bustype;
 	dev->id.vendor	= user_dev->id.vendor;
 	dev->id.product	= user_dev->id.product;
@@ -397,6 +398,7 @@ static int uinput_ioctl(struct inode *inode, struct file *file, unsigned int cmd
 	struct uinput_ff_erase  ff_erase;
 	struct uinput_request   *req;
 	int                     length;
+	char			*phys;
 
 	udev = file->private_data;
 
@@ -494,20 +496,19 @@ static int uinput_ioctl(struct inode *inode, struct file *file, unsigned int cmd
 				retval = -EFAULT;
 				break;
 			}
-			if (NULL != udev->dev->phys)
-				kfree(udev->dev->phys);
-			udev->dev->phys = kmalloc(length, GFP_KERNEL);
-			if (!udev->dev->phys) {
+			kfree(udev->dev->phys);
+			udev->dev->phys = phys = kmalloc(length, GFP_KERNEL);
+			if (!phys) {
 				retval = -ENOMEM;
 				break;
 			}
-			if (copy_from_user(udev->dev->phys, p, length)) {
-				retval = -EFAULT;
-				kfree(udev->dev->phys);
+			if (copy_from_user(phys, p, length)) {
 				udev->dev->phys = NULL;
+				kfree(phys);
+				retval = -EFAULT;
 				break;
 			}
-			udev->dev->phys[length - 1] = '\0';
+			phys[length - 1] = '\0';
 			break;
 
 		case UI_BEGIN_FF_UPLOAD:
diff --git a/include/linux/input.h b/include/linux/input.h
index b9cc0ac71f44..bdc53c6cc962 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -811,9 +811,9 @@ struct input_dev {
 
 	void *private;
 
-	char *name;
-	char *phys;
-	char *uniq;
+	const char *name;
+	const char *phys;
+	const char *uniq;
 	struct input_id id;
 
 	unsigned long evbit[NBITS(EV_MAX)];
-- 
cgit v1.2.3-59-g8ed1b


From a03fa955576af50df80bec9127b46ef57e0877c0 Mon Sep 17 00:00:00 2001
From: "rajesh.shah@intel.com" <rajesh.shah@intel.com>
Date: Thu, 2 Jun 2005 15:41:48 -0700
Subject: [PATCH] PCI: Increase the number of PCI bus resources

This patch increases the number of resource pointers in the
pci_bus structure. This is needed to store >4 resource ranges
for host bridges and transparent PCI bridges. With this change,
all PCI buses will have more resource pointers, but most PCI
buses will only use the first 3 or 4, the remaining being NULL.
The PCI core already deals with this correctly.

Signed-off-by: Rajesh Shah <rajesh.shah@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/pci.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 66798b46f308..a46cabfd08c8 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -586,7 +586,7 @@ struct pci_dev {
 #define PCI_NUM_RESOURCES 11
 
 #ifndef PCI_BUS_NUM_RESOURCES
-#define PCI_BUS_NUM_RESOURCES 4
+#define PCI_BUS_NUM_RESOURCES 8
 #endif
   
 #define PCI_REGION_FLAG_MASK 0x0fU	/* These bits of resource flags tell us the PCI region flags */
-- 
cgit v1.2.3-59-g8ed1b


From 75865858971add95809c5c9cd35dc4cfba08e33b Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@suse.de>
Date: Thu, 30 Jun 2005 02:18:12 -0700
Subject: [PATCH] PCI: clean up dynamic pci id logic

The dynamic pci id logic has been bothering me for a while, and now that
I started to look into how to move some of this to the driver core, I
thought it was time to clean it all up.

It ends up making the code smaller, and easier to follow, and fixes a
few bugs at the same time (dynamic ids were not being matched
everywhere, and so could be missed on some call paths for new devices,
the semaphore did not need to be grabbed when adding a new id and calling the
driver core, etc.)

I also renamed the function pci_match_device() to pci_match_id() as
that's what it really does.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/i386/kernel/cpu/cpufreq/gx-suspmod.c |   2 +-
 drivers/char/hw_random.c                  |   2 +-
 drivers/char/watchdog/i8xx_tco.c          |   2 +-
 drivers/ide/setup-pci.c                   |   2 +-
 drivers/parport/parport_pc.c              |   2 +-
 drivers/pci/pci-driver.c                  | 196 +++++++++++-------------------
 include/linux/pci-dynids.h                |  18 ---
 include/linux/pci.h                       |   3 +-
 sound/pci/bt87x.c                         |   2 +-
 9 files changed, 79 insertions(+), 150 deletions(-)
 delete mode 100644 include/linux/pci-dynids.h

(limited to 'include/linux')

diff --git a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c
index 1a49adb1f4a6..e86ea486c311 100644
--- a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c
@@ -190,7 +190,7 @@ static __init struct pci_dev *gx_detect_chipset(void)
 
 	/* detect which companion chip is used */
 	while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) {
-		if ((pci_match_device (gx_chipset_tbl, gx_pci)) != NULL) {
+		if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) {
 			return gx_pci;
 		}
 	}
diff --git a/drivers/char/hw_random.c b/drivers/char/hw_random.c
index 7e6ac14c2450..3480535a09c5 100644
--- a/drivers/char/hw_random.c
+++ b/drivers/char/hw_random.c
@@ -579,7 +579,7 @@ static int __init rng_init (void)
 
 	/* Probe for Intel, AMD RNGs */
 	for_each_pci_dev(pdev) {
-		ent = pci_match_device (rng_pci_tbl, pdev);
+		ent = pci_match_id(rng_pci_tbl, pdev);
 		if (ent) {
 			rng_ops = &rng_vendor_ops[ent->driver_data];
 			goto match;
diff --git a/drivers/char/watchdog/i8xx_tco.c b/drivers/char/watchdog/i8xx_tco.c
index b14d642439ed..5d07ee59679d 100644
--- a/drivers/char/watchdog/i8xx_tco.c
+++ b/drivers/char/watchdog/i8xx_tco.c
@@ -401,7 +401,7 @@ static unsigned char __init i8xx_tco_getdevice (void)
 	 */
 
 	while ((dev = pci_find_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
-		if (pci_match_device(i8xx_tco_pci_tbl, dev)) {
+		if (pci_match_id(i8xx_tco_pci_tbl, dev)) {
 			i8xx_tco_pci = dev;
 			break;
 		}
diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c
index e501675ad72e..77da827b2898 100644
--- a/drivers/ide/setup-pci.c
+++ b/drivers/ide/setup-pci.c
@@ -847,7 +847,7 @@ static int __init ide_scan_pcidev(struct pci_dev *dev)
 		d = list_entry(l, struct pci_driver, node);
 		if(d->id_table)
 		{
-			const struct pci_device_id *id = pci_match_device(d->id_table, dev);
+			const struct pci_device_id *id = pci_match_id(d->id_table, dev);
 			if(id != NULL)
 			{
 				if(d->probe(dev, id) >= 0)
diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index 80edfa3abd29..4598c6a9212d 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -3008,7 +3008,7 @@ static int __init parport_pc_init_superio (int autoirq, int autodma)
 	int ret = 0;
 
 	while ((pdev = pci_find_device(PCI_ANY_ID, PCI_ANY_ID, pdev)) != NULL) {
-		id = pci_match_device (parport_pc_pci_tbl, pdev);
+		id = pci_match_id(parport_pc_pci_tbl, pdev);
 		if (id == NULL || id->driver_data >= last_sio)
 			continue;
 
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index e65bf2b395aa..aac6de9568e5 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -7,7 +7,6 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/device.h>
-#include <linux/pci-dynids.h>
 #include "pci.h"
 
 /*
@@ -19,35 +18,11 @@
  */
 
 #ifdef CONFIG_HOTPLUG
-/**
- * pci_device_probe_dynamic()
- *
- * Walk the dynamic ID list looking for a match.
- * returns 0 and sets pci_dev->driver when drv claims pci_dev, else error.
- */
-static int
-pci_device_probe_dynamic(struct pci_driver *drv, struct pci_dev *pci_dev)
-{
-	int error = -ENODEV;
-	struct list_head *pos;
-	struct dynid *dynid;
 
-	spin_lock(&drv->dynids.lock);
-	list_for_each(pos, &drv->dynids.list) {
-		dynid = list_entry(pos, struct dynid, node);
-		if (pci_match_one_device(&dynid->id, pci_dev)) {
-			spin_unlock(&drv->dynids.lock);
-			error = drv->probe(pci_dev, &dynid->id);
-			if (error >= 0) {
-				pci_dev->driver = drv;
-				return 0;
-			}
-			return error;
-		}
-	}
-	spin_unlock(&drv->dynids.lock);
-	return error;
-}
+struct pci_dynid {
+	struct list_head node;
+	struct pci_device_id id;
+};
 
 /**
  * store_new_id
@@ -58,8 +33,7 @@ pci_device_probe_dynamic(struct pci_driver *drv, struct pci_dev *pci_dev)
 static inline ssize_t
 store_new_id(struct device_driver *driver, const char *buf, size_t count)
 {
-	struct dynid *dynid;
-	struct bus_type * bus;
+	struct pci_dynid *dynid;
 	struct pci_driver *pdrv = to_pci_driver(driver);
 	__u32 vendor=PCI_ANY_ID, device=PCI_ANY_ID, subvendor=PCI_ANY_ID,
 		subdevice=PCI_ANY_ID, class=0, class_mask=0;
@@ -91,37 +65,22 @@ store_new_id(struct device_driver *driver, const char *buf, size_t count)
 	list_add_tail(&pdrv->dynids.list, &dynid->node);
 	spin_unlock(&pdrv->dynids.lock);
 
-	bus = get_bus(pdrv->driver.bus);
-	if (bus) {
-		if (get_driver(&pdrv->driver)) {
-			down_write(&bus->subsys.rwsem);
-			driver_attach(&pdrv->driver);
-			up_write(&bus->subsys.rwsem);
-			put_driver(&pdrv->driver);
-		}
-		put_bus(bus);
+	if (get_driver(&pdrv->driver)) {
+		driver_attach(&pdrv->driver);
+		put_driver(&pdrv->driver);
 	}
 
 	return count;
 }
-
 static DRIVER_ATTR(new_id, S_IWUSR, NULL, store_new_id);
-static inline void
-pci_init_dynids(struct pci_dynids *dynids)
-{
-	spin_lock_init(&dynids->lock);
-	INIT_LIST_HEAD(&dynids->list);
-}
 
 static void
 pci_free_dynids(struct pci_driver *drv)
 {
-	struct list_head *pos, *n;
-	struct dynid *dynid;
+	struct pci_dynid *dynid, *n;
 
 	spin_lock(&drv->dynids.lock);
-	list_for_each_safe(pos, n, &drv->dynids.list) {
-		dynid = list_entry(pos, struct dynid, node);
+	list_for_each_entry_safe(dynid, n, &drv->dynids.list, node) {
 		list_del(&dynid->node);
 		kfree(dynid);
 	}
@@ -138,83 +97,70 @@ pci_create_newid_file(struct pci_driver *drv)
 	return error;
 }
 
-static int
-pci_bus_match_dynids(const struct pci_dev *pci_dev, struct pci_driver *pci_drv)
-{
-	struct list_head *pos;
-	struct dynid *dynid;
-
-	spin_lock(&pci_drv->dynids.lock);
-	list_for_each(pos, &pci_drv->dynids.list) {
-		dynid = list_entry(pos, struct dynid, node);
-		if (pci_match_one_device(&dynid->id, pci_dev)) {
-			spin_unlock(&pci_drv->dynids.lock);
-			return 1;
-		}
-	}
-	spin_unlock(&pci_drv->dynids.lock);
-	return 0;
-}
-
 #else /* !CONFIG_HOTPLUG */
-static inline int pci_device_probe_dynamic(struct pci_driver *drv, struct pci_dev *pci_dev)
-{
-	return -ENODEV;
-}
-static inline void pci_init_dynids(struct pci_dynids *dynids) {}
 static inline void pci_free_dynids(struct pci_driver *drv) {}
 static inline int pci_create_newid_file(struct pci_driver *drv)
 {
 	return 0;
 }
-static inline int pci_bus_match_dynids(const struct pci_dev *pci_dev, struct pci_driver *pci_drv)
-{
-	return 0;
-}
 #endif
 
 /**
- * pci_match_device - Tell if a PCI device structure has a matching
- *                    PCI device id structure
+ * pci_match_id - See if a pci device matches a given pci_id table
  * @ids: array of PCI device id structures to search in
- * @dev: the PCI device structure to match against
- * 
+ * @dev: the PCI device structure to match against.
+ *
  * Used by a driver to check whether a PCI device present in the
- * system is in its list of supported devices.Returns the matching
+ * system is in its list of supported devices.  Returns the matching
  * pci_device_id structure or %NULL if there is no match.
+ *
+ * Deprecated, don't use this as it will not catch any dynamic ids
+ * that a driver might want to check for.
  */
-const struct pci_device_id *
-pci_match_device(const struct pci_device_id *ids, const struct pci_dev *dev)
+const struct pci_device_id *pci_match_id(const struct pci_device_id *ids,
+					 struct pci_dev *dev)
 {
-	while (ids->vendor || ids->subvendor || ids->class_mask) {
-		if (pci_match_one_device(ids, dev))
-			return ids;
-		ids++;
+	if (ids) {
+		while (ids->vendor || ids->subvendor || ids->class_mask) {
+			if (pci_match_one_device(ids, dev))
+				return ids;
+			ids++;
+		}
 	}
 	return NULL;
 }
 
 /**
- * pci_device_probe_static()
- * 
- * returns 0 and sets pci_dev->driver when drv claims pci_dev, else error.
+ * pci_match_device - Tell if a PCI device structure has a matching
+ *                    PCI device id structure
+ * @ids: array of PCI device id structures to search in
+ * @dev: the PCI device structure to match against
+ * @drv: the PCI driver to match against
+ *
+ * Used by a driver to check whether a PCI device present in the
+ * system is in its list of supported devices.  Returns the matching
+ * pci_device_id structure or %NULL if there is no match.
  */
-static int
-pci_device_probe_static(struct pci_driver *drv, struct pci_dev *pci_dev)
-{		   
-	int error = -ENODEV;
+const struct pci_device_id *pci_match_device(struct pci_driver *drv,
+					     struct pci_dev *dev)
+{
 	const struct pci_device_id *id;
+	struct pci_dynid *dynid;
 
-	if (!drv->id_table)
-		return error;
-	id = pci_match_device(drv->id_table, pci_dev);
+	id = pci_match_id(drv->id_table, dev);
 	if (id)
-		error = drv->probe(pci_dev, id);
-	if (error >= 0) {
-		pci_dev->driver = drv;
-		error = 0;
+		return id;
+
+	/* static ids didn't match, lets look at the dynamic ones */
+	spin_lock(&drv->dynids.lock);
+	list_for_each_entry(dynid, &drv->dynids.list, node) {
+		if (pci_match_one_device(&dynid->id, dev)) {
+			spin_unlock(&drv->dynids.lock);
+			return &dynid->id;
+		}
 	}
-	return error;
+	spin_unlock(&drv->dynids.lock);
+	return NULL;
 }
 
 /**
@@ -225,13 +171,20 @@ pci_device_probe_static(struct pci_driver *drv, struct pci_dev *pci_dev)
  */
 static int
 __pci_device_probe(struct pci_driver *drv, struct pci_dev *pci_dev)
-{		   
+{
+	const struct pci_device_id *id;
 	int error = 0;
 
 	if (!pci_dev->driver && drv->probe) {
-		error = pci_device_probe_static(drv, pci_dev);
-		if (error == -ENODEV)
-			error = pci_device_probe_dynamic(drv, pci_dev);
+		error = -ENODEV;
+
+		id = pci_match_device(drv, pci_dev);
+		if (id)
+			error = drv->probe(pci_dev, id);
+		if (error >= 0) {
+			pci_dev->driver = drv;
+			error = 0;
+		}
 	}
 	return error;
 }
@@ -371,12 +324,6 @@ static struct kobj_type pci_driver_kobj_type = {
 	.sysfs_ops = &pci_driver_sysfs_ops,
 };
 
-static int
-pci_populate_driver_dir(struct pci_driver *drv)
-{
-	return pci_create_newid_file(drv);
-}
-
 /**
  * pci_register_driver - register a new pci driver
  * @drv: the driver structure to register
@@ -401,13 +348,15 @@ int pci_register_driver(struct pci_driver *drv)
 		drv->driver.shutdown = pci_device_shutdown;
 	drv->driver.owner = drv->owner;
 	drv->driver.kobj.ktype = &pci_driver_kobj_type;
-	pci_init_dynids(&drv->dynids);
+
+	spin_lock_init(&drv->dynids.lock);
+	INIT_LIST_HEAD(&drv->dynids.list);
 
 	/* register with core */
 	error = driver_register(&drv->driver);
 
 	if (!error)
-		pci_populate_driver_dir(drv);
+		error = pci_create_newid_file(drv);
 
 	return error;
 }
@@ -463,21 +412,17 @@ pci_dev_driver(const struct pci_dev *dev)
  * system is in its list of supported devices.Returns the matching
  * pci_device_id structure or %NULL if there is no match.
  */
-static int pci_bus_match(struct device * dev, struct device_driver * drv) 
+static int pci_bus_match(struct device *dev, struct device_driver *drv)
 {
-	const struct pci_dev * pci_dev = to_pci_dev(dev);
-	struct pci_driver * pci_drv = to_pci_driver(drv);
-	const struct pci_device_id * ids = pci_drv->id_table;
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+	struct pci_driver *pci_drv = to_pci_driver(drv);
 	const struct pci_device_id *found_id;
 
-	if (!ids)
-		return 0;
-
-	found_id = pci_match_device(ids, pci_dev);
+	found_id = pci_match_device(pci_drv, pci_dev);
 	if (found_id)
 		return 1;
 
-	return pci_bus_match_dynids(pci_dev, pci_drv);
+	return 0;
 }
 
 /**
@@ -536,6 +481,7 @@ static int __init pci_driver_init(void)
 
 postcore_initcall(pci_driver_init);
 
+EXPORT_SYMBOL(pci_match_id);
 EXPORT_SYMBOL(pci_match_device);
 EXPORT_SYMBOL(pci_register_driver);
 EXPORT_SYMBOL(pci_unregister_driver);
diff --git a/include/linux/pci-dynids.h b/include/linux/pci-dynids.h
deleted file mode 100644
index 183b6b0de81c..000000000000
--- a/include/linux/pci-dynids.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- *	PCI defines and function prototypes
- *	Copyright 2003 Dell Inc.
- *        by Matt Domsch <Matt_Domsch@dell.com>
- */
-
-#ifndef LINUX_PCI_DYNIDS_H
-#define LINUX_PCI_DYNIDS_H
-
-#include <linux/list.h>
-#include <linux/mod_devicetable.h>
-
-struct dynid {
-	struct list_head        node;
-	struct pci_device_id    id;
-};
-
-#endif
diff --git a/include/linux/pci.h b/include/linux/pci.h
index a46cabfd08c8..7ac14961ba22 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -860,7 +860,8 @@ int pci_register_driver(struct pci_driver *);
 void pci_unregister_driver(struct pci_driver *);
 void pci_remove_behind_bridge(struct pci_dev *);
 struct pci_driver *pci_dev_driver(const struct pci_dev *);
-const struct pci_device_id *pci_match_device(const struct pci_device_id *ids, const struct pci_dev *dev);
+const struct pci_device_id *pci_match_device(struct pci_driver *drv, struct pci_dev *dev);
+const struct pci_device_id *pci_match_id(const struct pci_device_id *ids, struct pci_dev *dev);
 int pci_scan_bridge(struct pci_bus *bus, struct pci_dev * dev, int max, int pass);
 
 /* kmem_cache style wrapper around pci_alloc_consistent() */
diff --git a/sound/pci/bt87x.c b/sound/pci/bt87x.c
index defdc5a459f0..909fef8903cb 100644
--- a/sound/pci/bt87x.c
+++ b/sound/pci/bt87x.c
@@ -804,7 +804,7 @@ static int __devinit snd_bt87x_detect_card(struct pci_dev *pci)
 	int i;
 	const struct pci_device_id *supported;
 
-	supported = pci_match_device(snd_bt87x_ids, pci);
+	supported = pci_match_device(driver, pci);
 	if (supported)
 		return supported->driver_data;
 
-- 
cgit v1.2.3-59-g8ed1b


From 21e2c01dc3e38d466eda5871645878d2c3a33261 Mon Sep 17 00:00:00 2001
From: Rob Punkunus <rpunkunus@nvidia.com>
Date: Sun, 3 Jul 2005 17:37:18 +0200
Subject: [PATCH] amd74xx: support MCP55 device IDs

From: Rob Punkunus <rpunkunus@nvidia.com>

Rob Punkunus recently submitted a patch to enable support for MCP51/MCP55 in
the amd74xx driver. This patch was whitespace-corrupted and didn't apply to
2.6.12 since MCP51 support was merged in the 2.6.12-rc series.

Gentoo would like to support this hardware for our upcoming release media, so
I fixed the patch, and here it is :)

Signed-off-by: Daniel Drake <dsd@gentoo.org>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@elka.pw.edu.pl>
---
 drivers/ide/pci/amd74xx.c | 3 +++
 include/linux/pci_ids.h   | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/ide/pci/amd74xx.c b/drivers/ide/pci/amd74xx.c
index 65eab9b63a79..844a6c9fb949 100644
--- a/drivers/ide/pci/amd74xx.c
+++ b/drivers/ide/pci/amd74xx.c
@@ -73,6 +73,7 @@ static struct amd_ide_chip {
 	{ PCI_DEVICE_ID_NVIDIA_NFORCE_CK804_IDE,	0x50, AMD_UDMA_133 },
 	{ PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_IDE,	0x50, AMD_UDMA_133 },
 	{ PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE,	0x50, AMD_UDMA_133 },
+	{ PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE,	0x50, AMD_UDMA_133 },
 	{ 0 }
 };
 
@@ -489,6 +490,7 @@ static ide_pci_device_t amd74xx_chipsets[] __devinitdata = {
 	/* 13 */ DECLARE_NV_DEV("NFORCE-CK804"),
 	/* 14 */ DECLARE_NV_DEV("NFORCE-MCP04"),
 	/* 15 */ DECLARE_NV_DEV("NFORCE-MCP51"),
+	/* 16 */ DECLARE_NV_DEV("NFORCE-MCP55"),
 };
 
 static int __devinit amd74xx_probe(struct pci_dev *dev, const struct pci_device_id *id)
@@ -524,6 +526,7 @@ static struct pci_device_id amd74xx_pci_tbl[] = {
 	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_CK804_IDE,	PCI_ANY_ID, PCI_ANY_ID, 0, 0, 13 },
 	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP04_IDE,	PCI_ANY_ID, PCI_ANY_ID, 0, 0, 14 },
 	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE,	PCI_ANY_ID, PCI_ANY_ID, 0, 0, 15 },
+	{ PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE,	PCI_ANY_ID, PCI_ANY_ID, 0, 0, 16 },
 	{ 0, },
 };
 MODULE_DEVICE_TABLE(pci, amd74xx_pci_tbl);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index c3ee1ae4545a..27348c22dacb 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1238,6 +1238,7 @@
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_IDE	0x0265
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_SATA	0x0266
 #define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP51_SATA2	0x0267
+#define PCI_DEVICE_ID_NVIDIA_NFORCE_MCP55_IDE	0x036E
 #define PCI_DEVICE_ID_NVIDIA_NVENET_12		0x0268
 #define PCI_DEVICE_ID_NVIDIA_NVENET_13		0x0269
 #define PCI_DEVICE_ID_NVIDIA_MCP51_AUDIO	0x026B
-- 
cgit v1.2.3-59-g8ed1b


From e7270dec080002d8aa18256c756af6c32331ef48 Mon Sep 17 00:00:00 2001
From: Raphael Assenat <raph@raphnet.net>
Date: Mon, 4 Jul 2005 13:23:45 -0700
Subject: [SPARC64/COMPAT]: Add some compat ioctl for ppdev

The following patch adds some ioctls to include/linux/compat_ioctl.h
to allow using ppdev from the 32 bit user space on sparc64.

This patch also adds the PPDEV option in the sparc64 menu, near Parallel
printer support in the 'General machine setup' submenu.

All those ioctls seem to be compatible, since (correct me if I'm wrong)
they don't use the 'long' type. See include/linux/ppdev.h.

The application I used to test the new ioctls only used the following:
PPEXCL
PPCLAIM
PPNEGOT
PPGETMODES
PPRCONTROL
PPWCONTROL
PPDATADIR
PPWDATA
PPRDATA

But I believe that the other ioctls will work fine.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc64/Kconfig         | 18 ++++++++++++++++++
 include/linux/compat_ioctl.h | 19 ++++++++++++++++++-
 2 files changed, 36 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig
index e2b050eb3b96..d78bc13ebbb9 100644
--- a/arch/sparc64/Kconfig
+++ b/arch/sparc64/Kconfig
@@ -444,6 +444,24 @@ config PRINTER
 	  If you have more than 8 printers, you need to increase the LP_NO
 	  macro in lp.c and the PARPORT_MAX macro in parport.h.
 
+config PPDEV
+	tristate "Support for user-space parallel port device drivers"
+	depends on PARPORT
+	---help---
+	  Saying Y to this adds support for /dev/parport device nodes.  This
+	  is needed for programs that want portable access to the parallel
+	  port, for instance deviceid (which displays Plug-and-Play device
+	  IDs).
+
+	  This is the parallel port equivalent of SCSI generic support (sg).
+	  It is safe to say N to this -- it is not needed for normal printing
+	  or parallel port CD-ROM/disk support.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called ppdev.
+
+	  If unsure, say N.
+
 config ENVCTRL
 	tristate "SUNW, envctrl support"
 	depends on PCI
diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h
index 70a4ebb5d964..ecb0d39c0798 100644
--- a/include/linux/compat_ioctl.h
+++ b/include/linux/compat_ioctl.h
@@ -346,10 +346,27 @@ COMPATIBLE_IOCTL(PPPOEIOCDFWD)
 /* LP */
 COMPATIBLE_IOCTL(LPGETSTATUS)
 /* ppdev */
+COMPATIBLE_IOCTL(PPSETMODE)
+COMPATIBLE_IOCTL(PPRSTATUS)
+COMPATIBLE_IOCTL(PPRCONTROL)
+COMPATIBLE_IOCTL(PPWCONTROL)
+COMPATIBLE_IOCTL(PPFCONTROL)
+COMPATIBLE_IOCTL(PPRDATA)
+COMPATIBLE_IOCTL(PPWDATA)
 COMPATIBLE_IOCTL(PPCLAIM)
 COMPATIBLE_IOCTL(PPRELEASE)
-COMPATIBLE_IOCTL(PPEXCL)
 COMPATIBLE_IOCTL(PPYIELD)
+COMPATIBLE_IOCTL(PPEXCL)
+COMPATIBLE_IOCTL(PPDATADIR)
+COMPATIBLE_IOCTL(PPNEGOT)
+COMPATIBLE_IOCTL(PPWCTLONIRQ)
+COMPATIBLE_IOCTL(PPCLRIRQ)
+COMPATIBLE_IOCTL(PPSETPHASE)
+COMPATIBLE_IOCTL(PPGETMODES)
+COMPATIBLE_IOCTL(PPGETMODE)
+COMPATIBLE_IOCTL(PPGETPHASE)
+COMPATIBLE_IOCTL(PPGETFLAGS)
+COMPATIBLE_IOCTL(PPSETFLAGS)
 /* CDROM stuff */
 COMPATIBLE_IOCTL(CDROMPAUSE)
 COMPATIBLE_IOCTL(CDROMRESUME)
-- 
cgit v1.2.3-59-g8ed1b


From 55820ee2f8c767a2833b21bd365e5753f50bd8ce Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Tue, 5 Jul 2005 14:08:10 -0700
Subject: [NET]: Fix signedness issues in net/core/filter.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is the code to load packet data into a register:

                        k = fentry->k;
                        if (k < 0) {
...
                        } else {
                                u32 _tmp, *p;
                                p = skb_header_pointer(skb, k, 4, &_tmp);
                                if (p != NULL) {
                                        A = ntohl(*p);
                                        continue;
                                }
                        }

skb_header_pointer checks if the requested data is within the
linear area:

        int hlen = skb_headlen(skb);

        if (offset + len <= hlen)
                return skb->data + offset;

When offset is within [INT_MAX-len+1..INT_MAX] the addition will
result in a negative number which is <= hlen.

I couldn't trigger a crash on my AMD64 with 2GB of memory, but a
coworker tried on his x86 machine and it crashed immediately.

This patch fixes the check in skb_header_pointer to handle large
positive offsets similar to skb_copy_bits. Invalid data can still
be accessed using negative offsets (also similar to skb_copy_bits),
anyone using negative offsets needs to verify them himself.

Thanks to Thomas Vögtle <thomas.voegtle@coreworks.de> for verifying the
problem by crashing his machine and providing me with an Oops.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 416a2e4024b2..fbcb18651970 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1211,7 +1211,7 @@ static inline void *skb_header_pointer(const struct sk_buff *skb, int offset,
 {
 	int hlen = skb_headlen(skb);
 
-	if (offset + len <= hlen)
+	if (hlen - offset >= len)
 		return skb->data + offset;
 
 	if (skb_copy_bits(skb, offset, buffer, len) < 0)
-- 
cgit v1.2.3-59-g8ed1b


From e176fe8954a5239c24afe79b1001ba3c29511963 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 5 Jul 2005 14:12:44 -0700
Subject: [NET]: Remove unused security member in sk_buff

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h               | 4 +---
 include/linux/tc_ematch/tc_em_meta.h | 2 +-
 net/core/skbuff.c                    | 2 --
 net/ipv4/ip_output.c                 | 1 -
 net/ipv6/ip6_output.c                | 1 -
 net/sched/em_meta.c                  | 6 ------
 6 files changed, 2 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index fbcb18651970..1e6290f4f81e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -183,7 +183,6 @@ struct skb_shared_info {
  *	@priority: Packet queueing priority
  *	@users: User count - see {datagram,tcp}.c
  *	@protocol: Packet protocol from driver
- *	@security: Security level of packet
  *	@truesize: Buffer size 
  *	@head: Head of buffer
  *	@data: Data head pointer
@@ -255,8 +254,7 @@ struct sk_buff {
 				pkt_type,
 				ip_summed;
 	__u32			priority;
-	unsigned short		protocol,
-				security;
+	unsigned short		protocol;
 
 	void			(*destructor)(struct sk_buff *skb);
 #ifdef CONFIG_NETFILTER
diff --git a/include/linux/tc_ematch/tc_em_meta.h b/include/linux/tc_ematch/tc_em_meta.h
index a6b2cc530af5..bcb762d93123 100644
--- a/include/linux/tc_ematch/tc_em_meta.h
+++ b/include/linux/tc_ematch/tc_em_meta.h
@@ -45,7 +45,7 @@ enum
 	TCF_META_ID_REALDEV,
 	TCF_META_ID_PRIORITY,
 	TCF_META_ID_PROTOCOL,
-	TCF_META_ID_SECURITY,
+	TCF_META_ID_SECURITY, /* obsolete */
 	TCF_META_ID_PKTTYPE,
 	TCF_META_ID_PKTLEN,
 	TCF_META_ID_DATALEN,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index bb73b2190ec7..733deee24b9f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -357,7 +357,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
 	C(ip_summed);
 	C(priority);
 	C(protocol);
-	C(security);
 	n->destructor = NULL;
 #ifdef CONFIG_NETFILTER
 	C(nfmark);
@@ -422,7 +421,6 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->pkt_type	= old->pkt_type;
 	new->stamp	= old->stamp;
 	new->destructor = NULL;
-	new->security	= old->security;
 #ifdef CONFIG_NETFILTER
 	new->nfmark	= old->nfmark;
 	new->nfcache	= old->nfcache;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6ce5c3292f9f..1bfa49eda96f 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -389,7 +389,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->pkt_type = from->pkt_type;
 	to->priority = from->priority;
 	to->protocol = from->protocol;
-	to->security = from->security;
 	dst_release(to->dst);
 	to->dst = dst_clone(from->dst);
 	to->dev = from->dev;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 06e7cdaeedc5..1f2c2f9e353f 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -465,7 +465,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->pkt_type = from->pkt_type;
 	to->priority = from->priority;
 	to->protocol = from->protocol;
-	to->security = from->security;
 	dst_release(to->dst);
 	to->dst = dst_clone(from->dst);
 	to->dev = from->dev;
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 48bb23c2a35a..53d98f8d3d80 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -205,11 +205,6 @@ META_COLLECTOR(int_protocol)
 	dst->value = skb->protocol;
 }
 
-META_COLLECTOR(int_security)
-{
-	dst->value = skb->security;
-}
-
 META_COLLECTOR(int_pkttype)
 {
 	dst->value = skb->pkt_type;
@@ -524,7 +519,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
 		[META_ID(REALDEV)]		= META_FUNC(int_realdev),
 		[META_ID(PRIORITY)]		= META_FUNC(int_priority),
 		[META_ID(PROTOCOL)]		= META_FUNC(int_protocol),
-		[META_ID(SECURITY)]		= META_FUNC(int_security),
 		[META_ID(PKTTYPE)]		= META_FUNC(int_pkttype),
 		[META_ID(PKTLEN)]		= META_FUNC(int_pktlen),
 		[META_ID(DATALEN)]		= META_FUNC(int_datalen),
-- 
cgit v1.2.3-59-g8ed1b


From 1cbb3380ef683f742876f48e3739b3df4ea9e168 Mon Sep 17 00:00:00 2001
From: Thomas Graf <tgraf@suug.ch>
Date: Tue, 5 Jul 2005 14:13:41 -0700
Subject: [NET]: Reduce size of sk_buff by 4 bytes

Reduce local_df to a 1-bit field and ip_summed to a 2-bit
field, thus saving 13 bits. Move the bit fields, packet type,
and protocol into the spare area between the priority
and the destructor. Saves 4 bytes on both 32-bit and
64-bit architectures.

Signed-off-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 1e6290f4f81e..14b950413495 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -248,17 +248,18 @@ struct sk_buff {
 				data_len,
 				mac_len,
 				csum;
-	unsigned char		local_df,
-				cloned:1,
-				nohdr:1,
-				pkt_type,
-				ip_summed;
 	__u32			priority;
-	unsigned short		protocol;
+	__u8			local_df:1,
+				cloned:1,
+				ip_summed:2,
+				nohdr:1;
+				/* 3 bits spare */
+	__u8			pkt_type;
+	__u16			protocol;
 
 	void			(*destructor)(struct sk_buff *skb);
 #ifdef CONFIG_NETFILTER
-        unsigned long		nfmark;
+	unsigned long		nfmark;
 	__u32			nfcache;
 	__u32			nfctinfo;
 	struct nf_conntrack	*nfct;
-- 
cgit v1.2.3-59-g8ed1b


From bc971dee6ece1fd0d431948924becd9c50e7b778 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 5 Jul 2005 15:03:46 -0700
Subject: [SHAPER]: Switch to spinlocks.

Dave, you were right and the sleeping locks in shaper were
broken. Markus Kanet noticed this and also tested the patch below that
switches locking to spinlocks.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/shaper.c      | 42 ++++++++++++++++--------------------------
 include/linux/if_shaper.h |  2 +-
 2 files changed, 17 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/shaper.c b/drivers/net/shaper.c
index 20edeb345792..3ad0b6751f6f 100644
--- a/drivers/net/shaper.c
+++ b/drivers/net/shaper.c
@@ -135,10 +135,8 @@ static int shaper_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct shaper *shaper = dev->priv;
  	struct sk_buff *ptr;
-   
-	if (down_trylock(&shaper->sem))
-		return -1;
-
+  
+	spin_lock(&shaper->lock);
  	ptr=shaper->sendq.prev;
  	
  	/*
@@ -232,7 +230,7 @@ static int shaper_start_xmit(struct sk_buff *skb, struct net_device *dev)
                 shaper->stats.collisions++;
  	}
 	shaper_kick(shaper);
-	up(&shaper->sem);
+	spin_unlock(&shaper->lock);
  	return 0;
 }
 
@@ -271,11 +269,9 @@ static void shaper_timer(unsigned long data)
 {
 	struct shaper *shaper = (struct shaper *)data;
 
-	if (!down_trylock(&shaper->sem)) {
-		shaper_kick(shaper);
-		up(&shaper->sem);
-	} else
-		mod_timer(&shaper->timer, jiffies);
+	spin_lock(&shaper->lock);
+	shaper_kick(shaper);
+	spin_unlock(&shaper->lock);
 }
 
 /*
@@ -331,21 +327,6 @@ static void shaper_kick(struct shaper *shaper)
 }
 
 
-/*
- *	Flush the shaper queues on a closedown
- */
- 
-static void shaper_flush(struct shaper *shaper)
-{
-	struct sk_buff *skb;
-
-	down(&shaper->sem);
-	while((skb=skb_dequeue(&shaper->sendq))!=NULL)
-		dev_kfree_skb(skb);
-	shaper_kick(shaper);
-	up(&shaper->sem);
-}
-
 /*
  *	Bring the interface up. We just disallow this until a 
  *	bind.
@@ -375,7 +356,15 @@ static int shaper_open(struct net_device *dev)
 static int shaper_close(struct net_device *dev)
 {
 	struct shaper *shaper=dev->priv;
-	shaper_flush(shaper);
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&shaper->sendq)) != NULL)
+		dev_kfree_skb(skb);
+
+	spin_lock_bh(&shaper->lock);
+	shaper_kick(shaper);
+	spin_unlock_bh(&shaper->lock);
+
 	del_timer_sync(&shaper->timer);
 	return 0;
 }
@@ -576,6 +565,7 @@ static void shaper_init_priv(struct net_device *dev)
 	init_timer(&sh->timer);
 	sh->timer.function=shaper_timer;
 	sh->timer.data=(unsigned long)sh;
+	spin_lock_init(&sh->lock);
 }
 
 /*
diff --git a/include/linux/if_shaper.h b/include/linux/if_shaper.h
index 004e6f09a6e2..68c896a36a34 100644
--- a/include/linux/if_shaper.h
+++ b/include/linux/if_shaper.h
@@ -23,7 +23,7 @@ struct shaper
 	__u32 shapeclock;
 	unsigned long recovery;	/* Time we can next clock a packet out on
 				   an empty queue */
-	struct semaphore sem;
+	spinlock_t lock;
         struct net_device_stats stats;
 	struct net_device *dev;
 	int  (*hard_start_xmit) (struct sk_buff *skb,
-- 
cgit v1.2.3-59-g8ed1b


From c1b4a7e69576d65efc31a8cea0714173c2841244 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 5 Jul 2005 15:24:38 -0700
Subject: [TCP]: Move to new TSO segmenting scheme.

Make TSO segment transmit size decisions at send time not earlier.

The basic scheme is that we try to build as large a TSO frame as
possible when pulling in the user data, but the size of the TSO frame
output to the card is determined at transmit time.

This is guided by tp->xmit_size_goal.  It is always set to a multiple
of MSS and tells sendmsg/sendpage how large an SKB to try and build.

Later, tcp_write_xmit() and tcp_push_one() chop up the packet if
necessary and conditions warrant.  These routines can also decide to
"defer" in order to wait for more ACKs to arrive and thus allow larger
TSO frames to be emitted.

A general observation is that TSO elongates the pipe, thus requiring a
larger congestion window and larger buffering especially at the sender
side.  Therefore, it is important that applications 1) get a large
enough socket send buffer (this is accomplished by our dynamic send
buffer expansion code) 2) do large enough writes.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h   |   2 +-
 include/net/tcp.h     |   4 +-
 net/ipv4/tcp.c        |  26 ++-
 net/ipv4/tcp_input.c  |  10 +-
 net/ipv4/tcp_ipv4.c   |   2 +-
 net/ipv4/tcp_output.c | 578 +++++++++++++++++++++++++++++++-------------------
 net/ipv6/tcp_ipv6.c   |   2 +-
 7 files changed, 384 insertions(+), 240 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index dfd93d03f5d2..e4fd82e42104 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -286,7 +286,7 @@ struct tcp_sock {
 	__u32	max_window;	/* Maximal window ever seen from peer	*/
 	__u32	pmtu_cookie;	/* Last pmtu seen by socket		*/
 	__u32	mss_cache;	/* Cached effective mss, not including SACKS */
-	__u16	mss_cache_std;	/* Like mss_cache, but without TSO */
+	__u16	xmit_size_goal;	/* Goal for segmenting output packets	*/
 	__u16	ext_header_len;	/* Network protocol overhead (IP/IPv6 options) */
 	__u8	ca_state;	/* State of fast-retransmit machine 	*/
 	__u8	retransmits;	/* Number of unrecovered RTO timeouts.	*/
diff --git a/include/net/tcp.h b/include/net/tcp.h
index b19238027da8..a166918ca56d 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -862,7 +862,7 @@ extern int  tcp_write_wakeup(struct sock *);
 extern void tcp_send_fin(struct sock *sk);
 extern void tcp_send_active_reset(struct sock *sk, int priority);
 extern int  tcp_send_synack(struct sock *);
-extern void tcp_push_one(struct sock *, unsigned mss_now);
+extern void tcp_push_one(struct sock *, unsigned int mss_now);
 extern void tcp_send_ack(struct sock *sk);
 extern void tcp_send_delayed_ack(struct sock *sk);
 
@@ -968,7 +968,7 @@ static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long
 static inline void tcp_initialize_rcv_mss(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	unsigned int hint = min(tp->advmss, tp->mss_cache_std);
+	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
 
 	hint = min(hint, tp->rcv_wnd/2);
 	hint = min(hint, TCP_MIN_RCVMSS);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2ba73bf3a8f9..29894c749163 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 			 size_t psize, int flags)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int mss_now;
+	int mss_now, size_goal;
 	int err;
 	ssize_t copied;
 	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
@@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+	size_goal = tp->xmit_size_goal;
 	copied = 0;
 
 	err = -EPIPE;
@@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
 		int offset = poffset % PAGE_SIZE;
 		int size = min_t(size_t, psize, PAGE_SIZE - offset);
 
-		if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
+		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
 new_segment:
 			if (!sk_stream_memory_free(sk))
 				goto wait_for_sndbuf;
@@ -652,7 +653,7 @@ new_segment:
 				goto wait_for_memory;
 
 			skb_entail(sk, tp, skb);
-			copy = mss_now;
+			copy = size_goal;
 		}
 
 		if (copy > size)
@@ -693,7 +694,7 @@ new_segment:
 		if (!(psize -= copy))
 			goto out;
 
-		if (skb->len != mss_now || (flags & MSG_OOB))
+		if (skb->len < mss_now || (flags & MSG_OOB))
 			continue;
 
 		if (forced_push(tp)) {
@@ -713,6 +714,7 @@ wait_for_memory:
 			goto do_error;
 
 		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+		size_goal = tp->xmit_size_goal;
 	}
 
 out:
@@ -754,7 +756,7 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 
 static inline int select_size(struct sock *sk, struct tcp_sock *tp)
 {
-	int tmp = tp->mss_cache_std;
+	int tmp = tp->mss_cache;
 
 	if (sk->sk_route_caps & NETIF_F_SG) {
 		if (sk->sk_route_caps & NETIF_F_TSO)
@@ -778,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
 	int iovlen, flags;
-	int mss_now;
+	int mss_now, size_goal;
 	int err, copied;
 	long timeo;
 
@@ -797,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
 	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+	size_goal = tp->xmit_size_goal;
 
 	/* Ok commence sending. */
 	iovlen = msg->msg_iovlen;
@@ -819,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			skb = sk->sk_write_queue.prev;
 
 			if (!sk->sk_send_head ||
-			    (copy = mss_now - skb->len) <= 0) {
+			    (copy = size_goal - skb->len) <= 0) {
 
 new_segment:
 				/* Allocate new segment. If the interface is SG,
@@ -842,7 +845,7 @@ new_segment:
 					skb->ip_summed = CHECKSUM_HW;
 
 				skb_entail(sk, tp, skb);
-				copy = mss_now;
+				copy = size_goal;
 			}
 
 			/* Try to append data to the end of skb. */
@@ -937,7 +940,7 @@ new_segment:
 			if ((seglen -= copy) == 0 && iovlen == 0)
 				goto out;
 
-			if (skb->len != mss_now || (flags & MSG_OOB))
+			if (skb->len < mss_now || (flags & MSG_OOB))
 				continue;
 
 			if (forced_push(tp)) {
@@ -957,6 +960,7 @@ wait_for_memory:
 				goto do_error;
 
 			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+			size_goal = tp->xmit_size_goal;
 		}
 	}
 
@@ -2128,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 
 	info->tcpi_rto = jiffies_to_usecs(tp->rto);
 	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
-	info->tcpi_snd_mss = tp->mss_cache_std;
+	info->tcpi_snd_mss = tp->mss_cache;
 	info->tcpi_rcv_mss = tp->ack.rcv_mss;
 
 	info->tcpi_unacked = tp->packets_out;
@@ -2178,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 
 	switch (optname) {
 	case TCP_MAXSEG:
-		val = tp->mss_cache_std;
+		val = tp->mss_cache;
 		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
 			val = tp->rx_opt.user_mss;
 		break;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2ef2f355b8b8..8de2f1071c2b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -740,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
 	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
 
 	if (!cwnd) {
-		if (tp->mss_cache_std > 1460)
+		if (tp->mss_cache > 1460)
 			cwnd = 2;
 		else
-			cwnd = (tp->mss_cache_std > 1095) ? 3 : 4;
+			cwnd = (tp->mss_cache > 1095) ? 3 : 4;
 	}
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
@@ -914,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 	if (sk->sk_route_caps & NETIF_F_TSO) {
 		sk->sk_route_caps &= ~NETIF_F_TSO;
 		sock_set_flag(sk, SOCK_NO_LARGESEND);
-		tp->mss_cache = tp->mss_cache_std;
+		tp->mss_cache = tp->mss_cache;
 	}
 
 	if (!tp->sacked_out)
@@ -1077,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 			    (IsFack(tp) ||
 			     !before(lost_retrans,
 				     TCP_SKB_CB(skb)->ack_seq + tp->reordering *
-				     tp->mss_cache_std))) {
+				     tp->mss_cache))) {
 				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 				tp->retrans_out -= tcp_skb_pcount(skb);
 
@@ -3334,7 +3334,7 @@ static void tcp_new_space(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (tcp_should_expand_sndbuf(sk, tp)) {
- 		int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
+ 		int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
 			MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
 		    demanded = max_t(unsigned int, tp->snd_cwnd,
 						   tp->reordering + 1);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index ebf112347a97..62f62bb05c2a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2045,7 +2045,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	 */
 	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
 	tp->snd_cwnd_clamp = ~0;
-	tp->mss_cache_std = tp->mss_cache = 536;
+	tp->mss_cache = 536;
 
 	tp->reordering = sysctl_tcp_reordering;
 	tp->ca_ops = &tcp_init_congestion_ops;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 0a4cd24b6578..fd3ce38184ae 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -49,7 +49,7 @@ int sysctl_tcp_retrans_collapse = 1;
  * will allow a single TSO frame to consume.  Building TSO frames
  * which are too large can cause TCP streams to be bursty.
  */
-int sysctl_tcp_tso_win_divisor = 8;
+int sysctl_tcp_tso_win_divisor = 3;
 
 static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
 				    struct sk_buff *skb)
@@ -403,21 +403,11 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 		sk->sk_send_head = skb;
 }
 
-static inline void tcp_tso_set_push(struct sk_buff *skb)
-{
-	/* Force push to be on for any TSO frames to workaround
-	 * problems with busted implementations like Mac OS-X that
-	 * hold off socket receive wakeups until push is seen.
-	 */
-	if (tcp_skb_pcount(skb) > 1)
-		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
-}
-
 static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (skb->len <= tp->mss_cache_std ||
+	if (skb->len <= tp->mss_cache ||
 	    !(sk->sk_route_caps & NETIF_F_TSO)) {
 		/* Avoid the costly divide in the normal
 		 * non-TSO case.
@@ -427,164 +417,10 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb)
 	} else {
 		unsigned int factor;
 
-		factor = skb->len + (tp->mss_cache_std - 1);
-		factor /= tp->mss_cache_std;
+		factor = skb->len + (tp->mss_cache - 1);
+		factor /= tp->mss_cache;
 		skb_shinfo(skb)->tso_segs = factor;
-		skb_shinfo(skb)->tso_size = tp->mss_cache_std;
-	}
-}
-
-/* Does SKB fit into the send window? */
-static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
-{
-	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
-
-	return !after(end_seq, tp->snd_una + tp->snd_wnd);
-}
-
-/* Can at least one segment of SKB be sent right now, according to the
- * congestion window rules?  If so, return how many segments are allowed.
- */
-static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
-{
-	u32 in_flight, cwnd;
-
-	/* Don't be strict about the congestion window for the final FIN.  */
-	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
-		return 1;
-
-	in_flight = tcp_packets_in_flight(tp);
-	cwnd = tp->snd_cwnd;
-	if (in_flight < cwnd)
-		return (cwnd - in_flight);
-
-	return 0;
-}
-
-static inline int tcp_minshall_check(const struct tcp_sock *tp)
-{
-	return after(tp->snd_sml,tp->snd_una) &&
-		!after(tp->snd_sml, tp->snd_nxt);
-}
-
-/* Return 0, if packet can be sent now without violation Nagle's rules:
- * 1. It is full sized.
- * 2. Or it contains FIN. (already checked by caller)
- * 3. Or TCP_NODELAY was set.
- * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
- *    With Minshall's modification: all sent small packets are ACKed.
- */
-
-static inline int tcp_nagle_check(const struct tcp_sock *tp,
-				  const struct sk_buff *skb, 
-				  unsigned mss_now, int nonagle)
-{
-	return (skb->len < mss_now &&
-		((nonagle&TCP_NAGLE_CORK) ||
-		 (!nonagle &&
-		  tp->packets_out &&
-		  tcp_minshall_check(tp))));
-}
-
-/* Return non-zero if the Nagle test allows this packet to be
- * sent now.
- */
-static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
-				 unsigned int cur_mss, int nonagle)
-{
-	/* Nagle rule does not apply to frames, which sit in the middle of the
-	 * write_queue (they have no chances to get new data).
-	 *
-	 * This is implemented in the callers, where they modify the 'nonagle'
-	 * argument based upon the location of SKB in the send queue.
-	 */
-	if (nonagle & TCP_NAGLE_PUSH)
-		return 1;
-
-	/* Don't use the nagle rule for urgent data (or for the final FIN).  */
-	if (tp->urg_mode ||
-	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
-		return 1;
-
-	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
-		return 1;
-
-	return 0;
-}
-
-/* This must be invoked the first time we consider transmitting
- * SKB onto the wire.
- */
-static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
-{
-	int tso_segs = tcp_skb_pcount(skb);
-
-	if (!tso_segs) {
-		tcp_set_skb_tso_segs(sk, skb);
-		tso_segs = tcp_skb_pcount(skb);
-	}
-	return tso_segs;
-}
-
-/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
- * should be put on the wire right now.  If so, it returns the number of
- * packets allowed by the congestion window.
- */
-static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
-				 unsigned int cur_mss, int nonagle)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	unsigned int cwnd_quota;
-
-	tcp_init_tso_segs(sk, skb);
-
-	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
-		return 0;
-
-	cwnd_quota = tcp_cwnd_test(tp, skb);
-	if (cwnd_quota &&
-	    !tcp_snd_wnd_test(tp, skb, cur_mss))
-		cwnd_quota = 0;
-
-	return cwnd_quota;
-}
-
-static inline int tcp_skb_is_last(const struct sock *sk, 
-				  const struct sk_buff *skb)
-{
-	return skb->next == (struct sk_buff *)&sk->sk_write_queue;
-}
-
-int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
-{
-	struct sk_buff *skb = sk->sk_send_head;
-
-	return (skb &&
-		tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
-			     (tcp_skb_is_last(sk, skb) ?
-			      TCP_NAGLE_PUSH :
-			      tp->nonagle)));
-}
-
-
-/* Send _single_ skb sitting at the send head. This function requires
- * true push pending frames to setup probe timer etc.
- */
-void tcp_push_one(struct sock *sk, unsigned cur_mss)
-{
-	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb = sk->sk_send_head;
-
-	if (tcp_snd_test(sk, skb, cur_mss, TCP_NAGLE_PUSH)) {
-		/* Send it out now. */
-		TCP_SKB_CB(skb)->when = tcp_time_stamp;
-		tcp_tso_set_push(skb);
-		if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
-			sk->sk_send_head = NULL;
-			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
-			tcp_packets_out_inc(sk, tp, skb);
-			return;
-		}
+		skb_shinfo(skb)->tso_size = tp->mss_cache;
 	}
 }
 
@@ -791,7 +627,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 
 	/* And store cached results */
 	tp->pmtu_cookie = pmtu;
-	tp->mss_cache = tp->mss_cache_std = mss_now;
+	tp->mss_cache = mss_now;
 
 	return mss_now;
 }
@@ -803,56 +639,47 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
  * cannot be large. However, taking into account rare use of URG, this
  * is not a big flaw.
  */
-
-unsigned int tcp_current_mss(struct sock *sk, int large)
+unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct dst_entry *dst = __sk_dst_get(sk);
-	unsigned int do_large, mss_now;
+	u32 mss_now;
+	u16 xmit_size_goal;
+	int doing_tso = 0;
+
+	mss_now = tp->mss_cache;
+
+	if (large_allowed &&
+	    (sk->sk_route_caps & NETIF_F_TSO) &&
+	    !tp->urg_mode)
+		doing_tso = 1;
 
-	mss_now = tp->mss_cache_std;
 	if (dst) {
 		u32 mtu = dst_mtu(dst);
 		if (mtu != tp->pmtu_cookie)
 			mss_now = tcp_sync_mss(sk, mtu);
 	}
 
-	do_large = (large &&
-		    (sk->sk_route_caps & NETIF_F_TSO) &&
-		    !tp->urg_mode);
+	if (tp->rx_opt.eff_sacks)
+		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
+			    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
 
-	if (do_large) {
-		unsigned int large_mss, factor, limit;
+	xmit_size_goal = mss_now;
 
-		large_mss = 65535 - tp->af_specific->net_header_len -
+	if (doing_tso) {
+		xmit_size_goal = 65535 -
+			tp->af_specific->net_header_len -
 			tp->ext_header_len - tp->tcp_header_len;
 
-		if (tp->max_window && large_mss > (tp->max_window>>1))
-			large_mss = max((tp->max_window>>1),
-					68U - tp->tcp_header_len);
-
-		factor = large_mss / mss_now;
+		if (tp->max_window &&
+		    (xmit_size_goal > (tp->max_window >> 1)))
+			xmit_size_goal = max((tp->max_window >> 1),
+					     68U - tp->tcp_header_len);
 
-		/* Always keep large mss multiple of real mss, but
-		 * do not exceed 1/tso_win_divisor of the congestion window
-		 * so we can keep the ACK clock ticking and minimize
-		 * bursting.
-		 */
-		limit = tp->snd_cwnd;
-		if (sysctl_tcp_tso_win_divisor)
-			limit /= sysctl_tcp_tso_win_divisor;
-		limit = max(1U, limit);
-		if (factor > limit)
-			factor = limit;
-
-		tp->mss_cache = mss_now * factor;
-
-		mss_now = tp->mss_cache;
+		xmit_size_goal -= (xmit_size_goal % mss_now);
 	}
+	tp->xmit_size_goal = xmit_size_goal;
 
-	if (tp->rx_opt.eff_sacks)
-		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
-			    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
 	return mss_now;
 }
 
@@ -876,6 +703,251 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
 	}
 }
 
+static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd)
+{
+	u32 window, cwnd_len;
+
+	window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq);
+	cwnd_len = mss_now * cwnd;
+	return min(window, cwnd_len);
+}
+
+/* Can at least one segment of SKB be sent right now, according to the
+ * congestion window rules?  If so, return how many segments are allowed.
+ */
+static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	u32 in_flight, cwnd;
+
+	/* Don't be strict about the congestion window for the final FIN.  */
+	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+		return 1;
+
+	in_flight = tcp_packets_in_flight(tp);
+	cwnd = tp->snd_cwnd;
+	if (in_flight < cwnd)
+		return (cwnd - in_flight);
+
+	return 0;
+}
+
+/* This must be invoked the first time we consider transmitting
+ * SKB onto the wire.
+ */
+static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb)
+{
+	int tso_segs = tcp_skb_pcount(skb);
+
+	if (!tso_segs) {
+		tcp_set_skb_tso_segs(sk, skb);
+		tso_segs = tcp_skb_pcount(skb);
+	}
+	return tso_segs;
+}
+
+static inline int tcp_minshall_check(const struct tcp_sock *tp)
+{
+	return after(tp->snd_sml,tp->snd_una) &&
+		!after(tp->snd_sml, tp->snd_nxt);
+}
+
+/* Return 0 if the packet can be sent now without violating Nagle's rules:
+ * 1. It is full sized.
+ * 2. Or it contains FIN. (already checked by caller)
+ * 3. Or TCP_NODELAY was set.
+ * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
+ *    With Minshall's modification: all sent small packets are ACKed.
+ */
+
+static inline int tcp_nagle_check(const struct tcp_sock *tp,
+				  const struct sk_buff *skb, 
+				  unsigned mss_now, int nonagle)
+{
+	return (skb->len < mss_now &&
+		((nonagle&TCP_NAGLE_CORK) ||
+		 (!nonagle &&
+		  tp->packets_out &&
+		  tcp_minshall_check(tp))));
+}
+
+/* Return non-zero if the Nagle test allows this packet to be
+ * sent now.
+ */
+static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
+				 unsigned int cur_mss, int nonagle)
+{
+	/* Nagle rule does not apply to frames, which sit in the middle of the
+	 * write_queue (they have no chances to get new data).
+	 *
+	 * This is implemented in the callers, where they modify the 'nonagle'
+	 * argument based upon the location of SKB in the send queue.
+	 */
+	if (nonagle & TCP_NAGLE_PUSH)
+		return 1;
+
+	/* Don't use the nagle rule for urgent data (or for the final FIN).  */
+	if (tp->urg_mode ||
+	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
+		return 1;
+
+	if (!tcp_nagle_check(tp, skb, cur_mss, nonagle))
+		return 1;
+
+	return 0;
+}
+
+/* Does at least the first segment of SKB fit into the send window? */
+static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss)
+{
+	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+	if (skb->len > cur_mss)
+		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
+
+	return !after(end_seq, tp->snd_una + tp->snd_wnd);
+}
+
+/* This checks if the data bearing packet SKB (usually sk->sk_send_head)
+ * should be put on the wire right now.  If so, it returns the number of
+ * packets allowed by the congestion window.
+ */
+static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
+				 unsigned int cur_mss, int nonagle)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int cwnd_quota;
+
+	tcp_init_tso_segs(sk, skb);
+
+	if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
+		return 0;
+
+	cwnd_quota = tcp_cwnd_test(tp, skb);
+	if (cwnd_quota &&
+	    !tcp_snd_wnd_test(tp, skb, cur_mss))
+		cwnd_quota = 0;
+
+	return cwnd_quota;
+}
+
+static inline int tcp_skb_is_last(const struct sock *sk, 
+				  const struct sk_buff *skb)
+{
+	return skb->next == (struct sk_buff *)&sk->sk_write_queue;
+}
+
+int tcp_may_send_now(struct sock *sk, struct tcp_sock *tp)
+{
+	struct sk_buff *skb = sk->sk_send_head;
+
+	return (skb &&
+		tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
+			     (tcp_skb_is_last(sk, skb) ?
+			      TCP_NAGLE_PUSH :
+			      tp->nonagle)));
+}
+
+/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
+ * which is put after SKB on the list.  It is very much like
+ * tcp_fragment() except that it may make several kinds of assumptions
+ * in order to speed up the splitting operation.  In particular, we
+ * know that all the data is in scatter-gather pages, and that the
+ * packet has never been sent out before (and thus is not cloned).
+ */
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len)
+{
+	struct sk_buff *buff;
+	int nlen = skb->len - len;
+	u16 flags;
+
+	/* All of a TSO frame must be composed of paged data.  */
+	BUG_ON(skb->len != skb->data_len);
+
+	buff = sk_stream_alloc_pskb(sk, 0, 0, GFP_ATOMIC);
+	if (unlikely(buff == NULL))
+		return -ENOMEM;
+
+	buff->truesize = nlen;
+	skb->truesize -= nlen;
+
+	/* Correct the sequence numbers. */
+	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+
+	/* PSH and FIN should only be set in the second packet. */
+	flags = TCP_SKB_CB(skb)->flags;
+	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+	TCP_SKB_CB(buff)->flags = flags;
+
+	/* This packet was never sent out yet, so no SACK bits. */
+	TCP_SKB_CB(buff)->sacked = 0;
+
+	buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
+	skb_split(skb, buff, len);
+
+	/* Fix up tso_factor for both original and new SKB.  */
+	tcp_set_skb_tso_segs(sk, skb);
+	tcp_set_skb_tso_segs(sk, buff);
+
+	/* Link BUFF into the send queue. */
+	skb_header_release(buff);
+	__skb_append(skb, buff);
+
+	return 0;
+}
+
+/* Try to defer sending, if possible, in order to minimize the amount
+ * of TSO splitting we do.  View it as a kind of TSO Nagle test.
+ *
+ * This algorithm is from John Heffner.
+ */
+static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
+{
+	u32 send_win, cong_win, limit, in_flight;
+
+	if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+		return 0;
+
+	in_flight = tcp_packets_in_flight(tp);
+
+	BUG_ON(tcp_skb_pcount(skb) <= 1 ||
+	       (tp->snd_cwnd <= in_flight));
+
+	send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq;
+
+	/* From in_flight test above, we know that cwnd > in_flight.  */
+	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
+
+	limit = min(send_win, cong_win);
+
+	/* If sk_send_head can be sent fully now, just do it.  */
+	if (skb->len <= limit)
+		return 0;
+
+	if (sysctl_tcp_tso_win_divisor) {
+		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
+
+		/* If at least some fraction of a window is available,
+		 * just use it.
+		 */
+		chunk /= sysctl_tcp_tso_win_divisor;
+		if (limit >= chunk)
+			return 0;
+	} else {
+		/* Different approach, try not to defer past a single
+		 * ACK.  Receiver should ACK every other full sized
+		 * frame, so if we have space for more than 3 frames
+		 * then send now.
+		 */
+		if (limit > tcp_max_burst(tp) * tp->mss_cache)
+			return 0;
+	}
+
+	/* Ok, it looks like it is advisable to defer.  */
+	return 1;
+}
+
 /* This routine writes packets to the network.  It advances the
  * send_head.  This happens as incoming acks open up the remote
  * window for us.
@@ -887,8 +959,8 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-	unsigned int tso_segs, cwnd_quota;
-	int sent_pkts;
+	unsigned int tso_segs, sent_pkts;
+	int cwnd_quota;
 
 	/* If we are closed, the bytes will have to remain here.
 	 * In time closedown will finish, we empty the write queue and all
@@ -903,24 +975,44 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 
 	tso_segs = tcp_init_tso_segs(sk, skb);
 	cwnd_quota = tcp_cwnd_test(tp, skb);
+	if (unlikely(!cwnd_quota))
+		goto out;
+
 	sent_pkts = 0;
+	while (likely(tcp_snd_wnd_test(tp, skb, mss_now))) {
+		BUG_ON(!tso_segs);
 
-	while (cwnd_quota >= tso_segs) {
-		if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
-					     (tcp_skb_is_last(sk, skb) ?
-					      nonagle : TCP_NAGLE_PUSH))))
-			break;
+		if (tso_segs == 1) {
+			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
+						     (tcp_skb_is_last(sk, skb) ?
+						      nonagle : TCP_NAGLE_PUSH))))
+				break;
+		} else {
+			if (tcp_tso_should_defer(sk, tp, skb))
+				break;
+		}
 
-		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))
-			break;
+		if (tso_segs > 1) {
+			u32 limit = tcp_window_allows(tp, skb,
+						      mss_now, cwnd_quota);
+
+			if (skb->len < limit) {
+				unsigned int trim = skb->len % mss_now;
 
-		if (unlikely(skb->len > mss_now)) {
+				if (trim)
+					limit = skb->len - trim;
+			}
+			if (skb->len > limit) {
+				if (tso_fragment(sk, skb, limit))
+					break;
+			}
+		} else if (unlikely(skb->len > mss_now)) {
 			if (unlikely(tcp_fragment(sk, skb,  mss_now)))
 				break;
 		}
 
 		TCP_SKB_CB(skb)->when = tcp_time_stamp;
-		tcp_tso_set_push(skb);
+
 		if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
 			break;
 
@@ -936,6 +1028,11 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 		 * the packet above, tso_segs will no longer be valid.
 		 */
 		cwnd_quota -= tcp_skb_pcount(skb);
+
+		BUG_ON(cwnd_quota < 0);
+		if (!cwnd_quota)
+			break;
+
 		skb = sk->sk_send_head;
 		if (!skb)
 			break;
@@ -946,7 +1043,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 		tcp_cwnd_validate(sk, tp);
 		return 0;
 	}
-
+out:
 	return !tp->packets_out && sk->sk_send_head;
 }
 
@@ -965,6 +1062,53 @@ void __tcp_push_pending_frames(struct sock *sk, struct tcp_sock *tp,
 	}
 }
 
+/* Send _single_ skb sitting at the send head. This function requires
+ * true push pending frames to setup probe timer etc.
+ */
+void tcp_push_one(struct sock *sk, unsigned int mss_now)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb = sk->sk_send_head;
+	unsigned int tso_segs, cwnd_quota;
+
+	BUG_ON(!skb || skb->len < mss_now);
+
+	tso_segs = tcp_init_tso_segs(sk, skb);
+	cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);
+
+	if (likely(cwnd_quota)) {
+		BUG_ON(!tso_segs);
+
+		if (tso_segs > 1) {
+			u32 limit = tcp_window_allows(tp, skb,
+						      mss_now, cwnd_quota);
+
+			if (skb->len < limit) {
+				unsigned int trim = skb->len % mss_now;
+
+				if (trim)
+					limit = skb->len - trim;
+			}
+			if (skb->len > limit) {
+				if (unlikely(tso_fragment(sk, skb, limit)))
+					return;
+			}
+		} else if (unlikely(skb->len > mss_now)) {
+			if (unlikely(tcp_fragment(sk, skb, mss_now)))
+				return;
+		}
+
+		/* Send it out now. */
+		TCP_SKB_CB(skb)->when = tcp_time_stamp;
+
+		if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
+			update_send_head(sk, tp, skb);
+			tcp_cwnd_validate(sk, tp);
+			return;
+		}
+	}
+}
+
 /* This function returns the amount that we can raise the
  * usable window based on the following constraints
  *  
@@ -1222,7 +1366,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 		if (sk->sk_route_caps & NETIF_F_TSO) {
 			sk->sk_route_caps &= ~NETIF_F_TSO;
 			sock_set_flag(sk, SOCK_NO_LARGESEND);
-			tp->mss_cache = tp->mss_cache_std;
 		}
 
 		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
@@ -1284,7 +1427,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	 * is still in somebody's hands, else make a clone.
 	 */
 	TCP_SKB_CB(skb)->when = tcp_time_stamp;
-	tcp_tso_set_push(skb);
 
 	err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
 				    pskb_copy(skb, GFP_ATOMIC):
@@ -1853,14 +1995,12 @@ int tcp_write_wakeup(struct sock *sk)
 				if (sk->sk_route_caps & NETIF_F_TSO) {
 					sock_set_flag(sk, SOCK_NO_LARGESEND);
 					sk->sk_route_caps &= ~NETIF_F_TSO;
-					tp->mss_cache = tp->mss_cache_std;
 				}
 			} else if (!tcp_skb_pcount(skb))
 				tcp_set_skb_tso_segs(sk, skb);
 
 			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 			TCP_SKB_CB(skb)->when = tcp_time_stamp;
-			tcp_tso_set_push(skb);
 			err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
 			if (!err) {
 				update_send_head(sk, tp, skb);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 9dac7fdf4726..f6e288dc116e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2018,7 +2018,7 @@ static int tcp_v6_init_sock(struct sock *sk)
 	 */
 	tp->snd_ssthresh = 0x7fffffff;
 	tp->snd_cwnd_clamp = ~0;
-	tp->mss_cache_std = tp->mss_cache = 536;
+	tp->mss_cache = 536;
 
 	tp->reordering = sysctl_tcp_reordering;
 
-- 
cgit v1.2.3-59-g8ed1b


From 6772926bef3c9f0ec761b39e5702535471fff70b Mon Sep 17 00:00:00 2001
From: Rusty Lynch <rusty.lynch@intel.com>
Date: Tue, 5 Jul 2005 18:54:50 -0700
Subject: [PATCH] kprobes: fix namespace problem and sparc64 build

This patch renames arch_init, a kprobes function for performing any
architecture-specific initialization, to arch_init_kprobes in order to
clean up the namespace.

Also, this patch adds arch_init_kprobes to sparc64 to fix the sparc64 kprobes
build from the last return probe patch.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/kprobes.c    | 2 +-
 arch/ia64/kernel/kprobes.c    | 2 +-
 arch/ppc64/kernel/kprobes.c   | 2 +-
 arch/sparc64/kernel/kprobes.c | 5 +++++
 arch/x86_64/kernel/kprobes.c  | 2 +-
 include/linux/kprobes.h       | 2 +-
 kernel/kprobes.c              | 2 +-
 7 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index fc8b17521761..a6d8c45961d3 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -537,7 +537,7 @@ static struct kprobe trampoline_p = {
 	.pre_handler = trampoline_probe_handler
 };
 
-int __init arch_init(void)
+int __init arch_init_kprobes(void)
 {
 	return register_kprobe(&trampoline_p);
 }
diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
index 3aa3167edbec..884f5cd27d8a 100644
--- a/arch/ia64/kernel/kprobes.c
+++ b/arch/ia64/kernel/kprobes.c
@@ -713,7 +713,7 @@ static struct kprobe trampoline_p = {
 	.pre_handler = trampoline_probe_handler
 };
 
-int __init arch_init(void)
+int __init arch_init_kprobes(void)
 {
 	trampoline_p.addr =
 		(kprobe_opcode_t *)((struct fnptr *)kretprobe_trampoline)->ip;
diff --git a/arch/ppc64/kernel/kprobes.c b/arch/ppc64/kernel/kprobes.c
index 1d2ff6d6b0b3..a3d519518fb8 100644
--- a/arch/ppc64/kernel/kprobes.c
+++ b/arch/ppc64/kernel/kprobes.c
@@ -444,7 +444,7 @@ static struct kprobe trampoline_p = {
 	.pre_handler = trampoline_probe_handler
 };
 
-int __init arch_init(void)
+int __init arch_init_kprobes(void)
 {
 	return register_kprobe(&trampoline_p);
 }
diff --git a/arch/sparc64/kernel/kprobes.c b/arch/sparc64/kernel/kprobes.c
index bdac631cf011..bbf11f85dab1 100644
--- a/arch/sparc64/kernel/kprobes.c
+++ b/arch/sparc64/kernel/kprobes.c
@@ -433,3 +433,8 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 	return 0;
 }
 
+/* architecture specific initialization */
+int arch_init_kprobes(void)
+{
+	return 0;
+}
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index acd2a778ebe6..5c6dc7051482 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -682,7 +682,7 @@ static struct kprobe trampoline_p = {
 	.pre_handler = trampoline_probe_handler
 };
 
-int __init arch_init(void)
+int __init arch_init_kprobes(void)
 {
 	return register_kprobe(&trampoline_p);
 }
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index b7a194c4362a..e050fc2d4c26 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -155,7 +155,7 @@ extern void arch_copy_kprobe(struct kprobe *p);
 extern void arch_arm_kprobe(struct kprobe *p);
 extern void arch_disarm_kprobe(struct kprobe *p);
 extern void arch_remove_kprobe(struct kprobe *p);
-extern int arch_init(void);
+extern int arch_init_kprobes(void);
 extern void show_registers(struct pt_regs *regs);
 extern kprobe_opcode_t *get_insn_slot(void);
 extern void free_insn_slot(kprobe_opcode_t *slot);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 90c0e82b650c..b0237122b24e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -574,7 +574,7 @@ static int __init init_kprobes(void)
 		INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
 	}
 
-	err = arch_init();
+	err = arch_init_kprobes();
 	if (!err)
 		err = register_die_notifier(&kprobe_exceptions_nb);
 
-- 
cgit v1.2.3-59-g8ed1b


From 5e6557722e69840506eb8bc5a1edcdb4e447a917 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Wed, 6 Jul 2005 15:44:41 -0400
Subject: [PATCH] openfirmware: generate device table for userspace

This converts the usage of struct of_match to struct of_device_id,
similar to pci_device_id.  This allows a device table to be generated,
which can be parsed by depmod(8) to generate a map file for module
loading.

In order for hotplug to work with macio devices, patches to
module-init-tools and hotplug must be applied.  Those patches are
available at:

 ftp://ftp.suse.com/pub/people/jeffm/linux/macio-hotplug/

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc/syslib/of_device.c          | 15 ++++++++-------
 arch/ppc64/kernel/of_device.c        | 15 ++++++++-------
 drivers/i2c/busses/i2c-keywest.c     |  7 +++----
 drivers/ide/ppc/pmac.c               | 12 ++----------
 drivers/macintosh/macio_asic.c       |  4 ++--
 drivers/macintosh/mediabay.c         |  7 ++-----
 drivers/macintosh/therm_pm72.c       |  9 ++++-----
 drivers/macintosh/therm_windtunnel.c |  6 +++---
 drivers/net/bmac.c                   |  7 ++-----
 drivers/net/mace.c                   |  6 ++----
 drivers/net/wireless/airport.c       |  8 ++++----
 drivers/scsi/mac53c94.c              |  7 +++----
 drivers/scsi/mesh.c                  |  8 +++-----
 drivers/serial/pmac_zilog.c          |  9 +++------
 drivers/video/platinumfb.c           |  6 ++----
 include/asm-ppc/macio.h              |  5 +++--
 include/asm-ppc/of_device.h          | 20 ++++----------------
 include/linux/mod_devicetable.h      | 11 +++++++++++
 scripts/mod/file2alias.c             | 22 ++++++++++++++++++++++
 19 files changed, 91 insertions(+), 93 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ppc/syslib/of_device.c b/arch/ppc/syslib/of_device.c
index 49c0e34e2d6b..1eb4f726ca9f 100644
--- a/arch/ppc/syslib/of_device.c
+++ b/arch/ppc/syslib/of_device.c
@@ -3,6 +3,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/mod_devicetable.h>
 #include <asm/errno.h>
 #include <asm/of_device.h>
 
@@ -15,20 +16,20 @@
  * Used by a driver to check whether an of_device present in the
  * system is in its list of supported devices.
  */
-const struct of_match * of_match_device(const struct of_match *matches,
+const struct of_device_id * of_match_device(const struct of_device_id *matches,
 					const struct of_device *dev)
 {
 	if (!dev->node)
 		return NULL;
-	while (matches->name || matches->type || matches->compatible) {
+	while (matches->name[0] || matches->type[0] || matches->compatible[0]) {
 		int match = 1;
-		if (matches->name && matches->name != OF_ANY_MATCH)
+		if (matches->name[0])
 			match &= dev->node->name
 				&& !strcmp(matches->name, dev->node->name);
-		if (matches->type && matches->type != OF_ANY_MATCH)
+		if (matches->type[0])
 			match &= dev->node->type
 				&& !strcmp(matches->type, dev->node->type);
-		if (matches->compatible && matches->compatible != OF_ANY_MATCH)
+		if (matches->compatible[0])
 			match &= device_is_compatible(dev->node,
 				matches->compatible);
 		if (match)
@@ -42,7 +43,7 @@ static int of_platform_bus_match(struct device *dev, struct device_driver *drv)
 {
 	struct of_device * of_dev = to_of_device(dev);
 	struct of_platform_driver * of_drv = to_of_platform_driver(drv);
-	const struct of_match * matches = of_drv->match_table;
+	const struct of_device_id * matches = of_drv->match_table;
 
 	if (!matches)
 		return 0;
@@ -75,7 +76,7 @@ static int of_device_probe(struct device *dev)
 	int error = -ENODEV;
 	struct of_platform_driver *drv;
 	struct of_device *of_dev;
-	const struct of_match *match;
+	const struct of_device_id *match;
 
 	drv = to_of_platform_driver(dev->driver);
 	of_dev = to_of_device(dev);
diff --git a/arch/ppc64/kernel/of_device.c b/arch/ppc64/kernel/of_device.c
index 66bd5ab7c25a..b80e81984ba8 100644
--- a/arch/ppc64/kernel/of_device.c
+++ b/arch/ppc64/kernel/of_device.c
@@ -3,6 +3,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/mod_devicetable.h>
 #include <asm/errno.h>
 #include <asm/of_device.h>
 
@@ -15,20 +16,20 @@
  * Used by a driver to check whether an of_device present in the
  * system is in its list of supported devices.
  */
-const struct of_match * of_match_device(const struct of_match *matches,
+const struct of_device_id *of_match_device(const struct of_device_id *matches,
 					const struct of_device *dev)
 {
 	if (!dev->node)
 		return NULL;
-	while (matches->name || matches->type || matches->compatible) {
+	while (matches->name[0] || matches->type[0] || matches->compatible[0]) {
 		int match = 1;
-		if (matches->name && matches->name != OF_ANY_MATCH)
+		if (matches->name[0])
 			match &= dev->node->name
 				&& !strcmp(matches->name, dev->node->name);
-		if (matches->type && matches->type != OF_ANY_MATCH)
+		if (matches->type[0])
 			match &= dev->node->type
 				&& !strcmp(matches->type, dev->node->type);
-		if (matches->compatible && matches->compatible != OF_ANY_MATCH)
+		if (matches->compatible[0])
 			match &= device_is_compatible(dev->node,
 				matches->compatible);
 		if (match)
@@ -42,7 +43,7 @@ static int of_platform_bus_match(struct device *dev, struct device_driver *drv)
 {
 	struct of_device * of_dev = to_of_device(dev);
 	struct of_platform_driver * of_drv = to_of_platform_driver(drv);
-	const struct of_match * matches = of_drv->match_table;
+	const struct of_device_id * matches = of_drv->match_table;
 
 	if (!matches)
 		return 0;
@@ -75,7 +76,7 @@ static int of_device_probe(struct device *dev)
 	int error = -ENODEV;
 	struct of_platform_driver *drv;
 	struct of_device *of_dev;
-	const struct of_match *match;
+	const struct of_device_id *match;
 
 	drv = to_of_platform_driver(dev->driver);
 	of_dev = to_of_device(dev);
diff --git a/drivers/i2c/busses/i2c-keywest.c b/drivers/i2c/busses/i2c-keywest.c
index 363e545fc01f..94ae808314f7 100644
--- a/drivers/i2c/busses/i2c-keywest.c
+++ b/drivers/i2c/busses/i2c-keywest.c
@@ -698,7 +698,7 @@ dispose_iface(struct device *dev)
 }
 
 static int
-create_iface_macio(struct macio_dev* dev, const struct of_match *match)
+create_iface_macio(struct macio_dev* dev, const struct of_device_id *match)
 {
 	return create_iface(dev->ofdev.node, &dev->ofdev.dev);
 }
@@ -710,7 +710,7 @@ dispose_iface_macio(struct macio_dev* dev)
 }
 
 static int
-create_iface_of_platform(struct of_device* dev, const struct of_match *match)
+create_iface_of_platform(struct of_device* dev, const struct of_device_id *match)
 {
 	return create_iface(dev->node, &dev->dev);
 }
@@ -721,10 +721,9 @@ dispose_iface_of_platform(struct of_device* dev)
 	return dispose_iface(&dev->dev);
 }
 
-static struct of_match i2c_keywest_match[] = 
+static struct of_device_id i2c_keywest_match[] = 
 {
 	{
-	.name 		= OF_ANY_MATCH,
 	.type		= "i2c",
 	.compatible	= "keywest"
 	},
diff --git a/drivers/ide/ppc/pmac.c b/drivers/ide/ppc/pmac.c
index 818380b5fd27..be0fcc8f4b15 100644
--- a/drivers/ide/ppc/pmac.c
+++ b/drivers/ide/ppc/pmac.c
@@ -1419,7 +1419,7 @@ pmac_ide_setup_device(pmac_ide_hwif_t *pmif, ide_hwif_t *hwif)
  * Attach to a macio probed interface
  */
 static int __devinit
-pmac_ide_macio_attach(struct macio_dev *mdev, const struct of_match *match)
+pmac_ide_macio_attach(struct macio_dev *mdev, const struct of_device_id *match)
 {
 	void __iomem *base;
 	unsigned long regbase;
@@ -1637,27 +1637,19 @@ pmac_ide_pci_resume(struct pci_dev *pdev)
 	return rc;
 }
 
-static struct of_match pmac_ide_macio_match[] = 
+static struct of_device_id pmac_ide_macio_match[] = 
 {
 	{
 	.name 		= "IDE",
-	.type		= OF_ANY_MATCH,
-	.compatible	= OF_ANY_MATCH
 	},
 	{
 	.name 		= "ATA",
-	.type		= OF_ANY_MATCH,
-	.compatible	= OF_ANY_MATCH
 	},
 	{
-	.name 		= OF_ANY_MATCH,
 	.type		= "ide",
-	.compatible	= OF_ANY_MATCH
 	},
 	{
-	.name 		= OF_ANY_MATCH,
 	.type		= "ata",
-	.compatible	= OF_ANY_MATCH
 	},
 	{},
 };
diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c
index d0bda7e3e6aa..37b18ee08a2d 100644
--- a/drivers/macintosh/macio_asic.c
+++ b/drivers/macintosh/macio_asic.c
@@ -33,7 +33,7 @@ static int macio_bus_match(struct device *dev, struct device_driver *drv)
 {
 	struct macio_dev * macio_dev = to_macio_device(dev);
 	struct macio_driver * macio_drv = to_macio_driver(drv);
-	const struct of_match * matches = macio_drv->match_table;
+	const struct of_device_id * matches = macio_drv->match_table;
 
 	if (!matches) 
 		return 0;
@@ -66,7 +66,7 @@ static int macio_device_probe(struct device *dev)
 	int error = -ENODEV;
 	struct macio_driver *drv;
 	struct macio_dev *macio_dev;
-	const struct of_match *match;
+	const struct of_device_id *match;
 
 	drv = to_macio_driver(dev->driver);
 	macio_dev = to_macio_device(dev);
diff --git a/drivers/macintosh/mediabay.c b/drivers/macintosh/mediabay.c
index 4be709e13eec..7c16c25fc5d4 100644
--- a/drivers/macintosh/mediabay.c
+++ b/drivers/macintosh/mediabay.c
@@ -642,7 +642,7 @@ static int __pmac media_bay_task(void *x)
 	}
 }
 
-static int __devinit media_bay_attach(struct macio_dev *mdev, const struct of_match *match)
+static int __devinit media_bay_attach(struct macio_dev *mdev, const struct of_device_id *match)
 {
 	struct media_bay_info* bay;
 	u32 __iomem *regbase;
@@ -797,23 +797,20 @@ static struct mb_ops keylargo_mb_ops __pmacdata = {
  * Therefore we do it all by polling the media bay once each tick.
  */
 
-static struct of_match media_bay_match[] =
+static struct of_device_id media_bay_match[] =
 {
 	{
 	.name		= "media-bay",
-	.type		= OF_ANY_MATCH,
 	.compatible	= "keylargo-media-bay",
 	.data		= &keylargo_mb_ops,
 	},
 	{
 	.name		= "media-bay",
-	.type		= OF_ANY_MATCH,
 	.compatible	= "heathrow-media-bay",
 	.data		= &heathrow_mb_ops,
 	},
 	{
 	.name		= "media-bay",
-	.type		= OF_ANY_MATCH,
 	.compatible	= "ohare-media-bay",
 	.data		= &ohare_mb_ops,
 	},
diff --git a/drivers/macintosh/therm_pm72.c b/drivers/macintosh/therm_pm72.c
index feb4e2413858..703e31973314 100644
--- a/drivers/macintosh/therm_pm72.c
+++ b/drivers/macintosh/therm_pm72.c
@@ -120,6 +120,7 @@
 #include <asm/system.h>
 #include <asm/sections.h>
 #include <asm/of_device.h>
+#include <asm/macio.h>
 
 #include "therm_pm72.h"
 
@@ -1986,7 +1987,7 @@ static void fcu_lookup_fans(struct device_node *fcu_node)
 	}
 }
 
-static int fcu_of_probe(struct of_device* dev, const struct of_match *match)
+static int fcu_of_probe(struct of_device* dev, const struct of_device_id *match)
 {
 	int rc;
 
@@ -2009,12 +2010,10 @@ static int fcu_of_remove(struct of_device* dev)
 	return 0;
 }
 
-static struct of_match fcu_of_match[] = 
+static struct of_device_id fcu_match[] = 
 {
 	{
-	.name 		= OF_ANY_MATCH,
 	.type		= "fcu",
-	.compatible	= OF_ANY_MATCH
 	},
 	{},
 };
@@ -2022,7 +2021,7 @@ static struct of_match fcu_of_match[] =
 static struct of_platform_driver fcu_of_platform_driver = 
 {
 	.name 		= "temperature",
-	.match_table	= fcu_of_match,
+	.match_table	= fcu_match,
 	.probe		= fcu_of_probe,
 	.remove		= fcu_of_remove
 };
diff --git a/drivers/macintosh/therm_windtunnel.c b/drivers/macintosh/therm_windtunnel.c
index 61400f04015e..cbb72eb0426d 100644
--- a/drivers/macintosh/therm_windtunnel.c
+++ b/drivers/macintosh/therm_windtunnel.c
@@ -43,6 +43,7 @@
 #include <asm/system.h>
 #include <asm/sections.h>
 #include <asm/of_device.h>
+#include <asm/macio.h>
 
 #define LOG_TEMP		0			/* continously log temperature */
 
@@ -450,7 +451,7 @@ do_probe( struct i2c_adapter *adapter, int addr, int kind )
 /************************************************************************/
 
 static int
-therm_of_probe( struct of_device *dev, const struct of_match *match )
+therm_of_probe( struct of_device *dev, const struct of_device_id *match )
 {
 	return i2c_add_driver( &g4fan_driver );
 }
@@ -461,9 +462,8 @@ therm_of_remove( struct of_device *dev )
 	return i2c_del_driver( &g4fan_driver );
 }
 
-static struct of_match therm_of_match[] = {{
+static struct of_device_id therm_of_match[] = {{
 	.name		= "fan",
-	.type		= OF_ANY_MATCH,
 	.compatible	= "adm1030"
     }, {}
 };
diff --git a/drivers/net/bmac.c b/drivers/net/bmac.c
index 00e5257b176f..8dc657fc8afb 100644
--- a/drivers/net/bmac.c
+++ b/drivers/net/bmac.c
@@ -1261,7 +1261,7 @@ static void bmac_reset_and_enable(struct net_device *dev)
 	spin_unlock_irqrestore(&bp->lock, flags);
 }
 
-static int __devinit bmac_probe(struct macio_dev *mdev, const struct of_match *match)
+static int __devinit bmac_probe(struct macio_dev *mdev, const struct of_device_id *match)
 {
 	int j, rev, ret;
 	struct bmac_data *bp;
@@ -1645,16 +1645,13 @@ static int __devexit bmac_remove(struct macio_dev *mdev)
 	return 0;
 }
 
-static struct of_match bmac_match[] = 
+static struct of_device_id bmac_match[] = 
 {
 	{
 	.name 		= "bmac",
-	.type		= OF_ANY_MATCH,
-	.compatible	= OF_ANY_MATCH,
 	.data		= (void *)0,
 	},
 	{
-	.name 		= OF_ANY_MATCH,
 	.type		= "network",
 	.compatible	= "bmac+",
 	.data		= (void *)1,
diff --git a/drivers/net/mace.c b/drivers/net/mace.c
index 6ed2d7dbd44c..81d0a26e4f41 100644
--- a/drivers/net/mace.c
+++ b/drivers/net/mace.c
@@ -109,7 +109,7 @@ bitrev(int b)
 }
 
 
-static int __devinit mace_probe(struct macio_dev *mdev, const struct of_match *match)
+static int __devinit mace_probe(struct macio_dev *mdev, const struct of_device_id *match)
 {
 	struct device_node *mace = macio_get_of_node(mdev);
 	struct net_device *dev;
@@ -1009,12 +1009,10 @@ static irqreturn_t mace_rxdma_intr(int irq, void *dev_id, struct pt_regs *regs)
     return IRQ_HANDLED;
 }
 
-static struct of_match mace_match[] = 
+static struct of_device_id mace_match[] = 
 {
 	{
 	.name 		= "mace",
-	.type		= OF_ANY_MATCH,
-	.compatible	= OF_ANY_MATCH
 	},
 	{},
 };
diff --git a/drivers/net/wireless/airport.c b/drivers/net/wireless/airport.c
index b4f4bd7956a2..9d496703c465 100644
--- a/drivers/net/wireless/airport.c
+++ b/drivers/net/wireless/airport.c
@@ -184,7 +184,7 @@ static int airport_hard_reset(struct orinoco_private *priv)
 }
 
 static int
-airport_attach(struct macio_dev *mdev, const struct of_match *match)
+airport_attach(struct macio_dev *mdev, const struct of_device_id *match)
 {
 	struct orinoco_private *priv;
 	struct net_device *dev;
@@ -266,16 +266,16 @@ MODULE_AUTHOR("Benjamin Herrenschmidt <benh@kernel.crashing.org>");
 MODULE_DESCRIPTION("Driver for the Apple Airport wireless card.");
 MODULE_LICENSE("Dual MPL/GPL");
 
-static struct of_match airport_match[] = 
+static struct of_device_id airport_match[] = 
 {
 	{
 	.name 		= "radio",
-	.type		= OF_ANY_MATCH,
-	.compatible	= OF_ANY_MATCH
 	},
 	{},
 };
 
+MODULE_DEVICE_TABLE (of, airport_match);
+
 static struct macio_driver airport_driver = 
 {
 	.name 		= DRIVER_NAME,
diff --git a/drivers/scsi/mac53c94.c b/drivers/scsi/mac53c94.c
index edd47d1f0b17..932dcf0366eb 100644
--- a/drivers/scsi/mac53c94.c
+++ b/drivers/scsi/mac53c94.c
@@ -424,7 +424,7 @@ static struct scsi_host_template mac53c94_template = {
 	.use_clustering	= DISABLE_CLUSTERING,
 };
 
-static int mac53c94_probe(struct macio_dev *mdev, const struct of_match *match)
+static int mac53c94_probe(struct macio_dev *mdev, const struct of_device_id *match)
 {
 	struct device_node *node = macio_get_of_node(mdev);
 	struct pci_dev *pdev = macio_get_pci_dev(mdev);
@@ -544,15 +544,14 @@ static int mac53c94_remove(struct macio_dev *mdev)
 }
 
 
-static struct of_match mac53c94_match[] = 
+static struct of_device_id mac53c94_match[] = 
 {
 	{
 	.name 		= "53c94",
-	.type		= OF_ANY_MATCH,
-	.compatible	= OF_ANY_MATCH
 	},
 	{},
 };
+MODULE_DEVICE_TABLE (of, mac53c94_match);
 
 static struct macio_driver mac53c94_driver = 
 {
diff --git a/drivers/scsi/mesh.c b/drivers/scsi/mesh.c
index b05737ae5eff..ff1933298da6 100644
--- a/drivers/scsi/mesh.c
+++ b/drivers/scsi/mesh.c
@@ -1847,7 +1847,7 @@ static struct scsi_host_template mesh_template = {
 	.use_clustering			= DISABLE_CLUSTERING,
 };
 
-static int mesh_probe(struct macio_dev *mdev, const struct of_match *match)
+static int mesh_probe(struct macio_dev *mdev, const struct of_device_id *match)
 {
 	struct device_node *mesh = macio_get_of_node(mdev);
 	struct pci_dev* pdev = macio_get_pci_dev(mdev);
@@ -2012,20 +2012,18 @@ static int mesh_remove(struct macio_dev *mdev)
 }
 
 
-static struct of_match mesh_match[] = 
+static struct of_device_id mesh_match[] = 
 {
 	{
 	.name 		= "mesh",
-	.type		= OF_ANY_MATCH,
-	.compatible	= OF_ANY_MATCH
 	},
 	{
-	.name 		= OF_ANY_MATCH,
 	.type		= "scsi",
 	.compatible	= "chrp,mesh0"
 	},
 	{},
 };
+MODULE_DEVICE_TABLE (of, mesh_match);
 
 static struct macio_driver mesh_driver = 
 {
diff --git a/drivers/serial/pmac_zilog.c b/drivers/serial/pmac_zilog.c
index 1c9f71617123..7db2f37532cf 100644
--- a/drivers/serial/pmac_zilog.c
+++ b/drivers/serial/pmac_zilog.c
@@ -1545,7 +1545,7 @@ static void pmz_dispose_port(struct uart_pmac_port *uap)
 /*
  * Called upon match with an escc node in the devive-tree.
  */
-static int pmz_attach(struct macio_dev *mdev, const struct of_match *match)
+static int pmz_attach(struct macio_dev *mdev, const struct of_device_id *match)
 {
 	int i;
 	
@@ -1850,20 +1850,17 @@ err_out:
 	return rc;
 }
 
-static struct of_match pmz_match[] = 
+static struct of_device_id pmz_match[] = 
 {
 	{
 	.name 		= "ch-a",
-	.type		= OF_ANY_MATCH,
-	.compatible	= OF_ANY_MATCH
 	},
 	{
 	.name 		= "ch-b",
-	.type		= OF_ANY_MATCH,
-	.compatible	= OF_ANY_MATCH
 	},
 	{},
 };
+MODULE_DEVICE_TABLE (of, pmz_match);
 
 static struct macio_driver pmz_driver = 
 {
diff --git a/drivers/video/platinumfb.c b/drivers/video/platinumfb.c
index 3dd1de1539d2..b00887e9851c 100644
--- a/drivers/video/platinumfb.c
+++ b/drivers/video/platinumfb.c
@@ -523,7 +523,7 @@ int __init platinumfb_setup(char *options)
 #define invalidate_cache(addr)
 #endif
 
-static int __devinit platinumfb_probe(struct of_device* odev, const struct of_match *match)
+static int __devinit platinumfb_probe(struct of_device* odev, const struct of_device_id *match)
 {
 	struct device_node	*dp = odev->node;
 	struct fb_info		*info;
@@ -647,12 +647,10 @@ static int __devexit platinumfb_remove(struct of_device* odev)
 	return 0;
 }
 
-static struct of_match platinumfb_match[] = 
+static struct of_device_id platinumfb_match[] = 
 {
 	{
 	.name 		= "platinum",
-	.type		= OF_ANY_MATCH,
-	.compatible	= OF_ANY_MATCH,
 	},
 	{},
 };
diff --git a/include/asm-ppc/macio.h b/include/asm-ppc/macio.h
index 2cafc9978607..a481b772d154 100644
--- a/include/asm-ppc/macio.h
+++ b/include/asm-ppc/macio.h
@@ -1,6 +1,7 @@
 #ifndef __MACIO_ASIC_H__
 #define __MACIO_ASIC_H__
 
+#include <linux/mod_devicetable.h>
 #include <asm/of_device.h>
 
 extern struct bus_type macio_bus_type;
@@ -120,10 +121,10 @@ static inline struct pci_dev *macio_get_pci_dev(struct macio_dev *mdev)
 struct macio_driver
 {
 	char			*name;
-	struct of_match		*match_table;
+	struct of_device_id	*match_table;
 	struct module		*owner;
 
-	int	(*probe)(struct macio_dev* dev, const struct of_match *match);
+	int	(*probe)(struct macio_dev* dev, const struct of_device_id *match);
 	int	(*remove)(struct macio_dev* dev);
 
 	int	(*suspend)(struct macio_dev* dev, pm_message_t state);
diff --git a/include/asm-ppc/of_device.h b/include/asm-ppc/of_device.h
index 7229735a7c18..4b264cfd3998 100644
--- a/include/asm-ppc/of_device.h
+++ b/include/asm-ppc/of_device.h
@@ -24,20 +24,8 @@ struct of_device
 };
 #define	to_of_device(d) container_of(d, struct of_device, dev)
 
-/*
- * Struct used for matching a device
- */
-struct of_match
-{
-	char	*name;
-	char	*type;
-	char	*compatible;
-	void	*data;
-};
-#define OF_ANY_MATCH		((char *)-1L)
-
-extern const struct of_match *of_match_device(
-	const struct of_match *matches, const struct of_device *dev);
+extern const struct of_device_id *of_match_device(
+	const struct of_device_id *matches, const struct of_device *dev);
 
 extern struct of_device *of_dev_get(struct of_device *dev);
 extern void of_dev_put(struct of_device *dev);
@@ -49,10 +37,10 @@ extern void of_dev_put(struct of_device *dev);
 struct of_platform_driver
 {
 	char			*name;
-	struct of_match		*match_table;
+	struct of_device_id	*match_table;
 	struct module		*owner;
 
-	int	(*probe)(struct of_device* dev, const struct of_match *match);
+	int	(*probe)(struct of_device* dev, const struct of_device_id *match);
 	int	(*remove)(struct of_device* dev);
 
 	int	(*suspend)(struct of_device* dev, pm_message_t state);
diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h
index 9b6d05172ed4..dce53ac1625d 100644
--- a/include/linux/mod_devicetable.h
+++ b/include/linux/mod_devicetable.h
@@ -174,6 +174,17 @@ struct serio_device_id {
 	__u8 proto;
 };
 
+/*
+ * Struct used for matching a device
+ */
+struct of_device_id
+{
+	char	name[32];
+	char	type[32];
+	char	compatible[128];
+	void	*data;
+};
+
 
 /* PCMCIA */
 
diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c
index 908bff6d1eef..5180405c1a84 100644
--- a/scripts/mod/file2alias.c
+++ b/scripts/mod/file2alias.c
@@ -25,6 +25,8 @@ typedef Elf64_Addr	kernel_ulong_t;
 #include <stdint.h>
 #endif
 
+#include <ctype.h>
+
 typedef uint32_t	__u32;
 typedef uint16_t	__u16;
 typedef unsigned char	__u8;
@@ -323,6 +325,22 @@ static int do_pcmcia_entry(const char *filename,
 
 
+static int do_of_entry (const char *filename, struct of_device_id *of, char *alias)
+{
+    char *tmp;
+    sprintf (alias, "of:N%sT%sC%s",
+                    of->name[0] ? of->name : "*",
+                    of->type[0] ? of->type : "*",
+                    of->compatible[0] ? of->compatible : "*");
+
+    /* Replace all whitespace with underscores */
+    for (tmp = alias; tmp && *tmp; tmp++)
+        if (isspace (*tmp))
+            *tmp = '_';
+
+    return 1;
+}
+
 /* Ignore any prefix, eg. v850 prepends _ */
 static inline int sym_is(const char *symbol, const char *name)
 {
@@ -401,6 +419,10 @@ void handle_moddevtable(struct module *mod, struct elf_info *info,
 	else if (sym_is(symname, "__mod_pcmcia_device_table"))
 		do_table(symval, sym->st_size, sizeof(struct pcmcia_device_id),
 			 do_pcmcia_entry, mod);
+        else if (sym_is(symname, "__mod_of_device_table"))
+		do_table(symval, sym->st_size, sizeof(struct of_device_id),
+			 do_of_entry, mod);
+
 }
 
 /* Now add out buffered information to the generated C source */
-- 
cgit v1.2.3-59-g8ed1b


From 40725181b74be6b0e3bdc8c05bd1e0b9873ec5cc Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 6 Jul 2005 13:51:52 -0700
Subject: [CRYPTO] Add support for low-level multi-block operations

This patch adds hooks for cipher algorithms to implement multi-block
ECB/CBC operations directly.  This is expected to provide significant
performance boosts to the VIA Padlock.

It could also be used for improving software implementations such as
AES where operating on multiple blocks at a time may enable certain
optimisations.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 crypto/cipher.c        | 38 ++++++++++++++++++--------------------
 crypto/internal.h      |  5 -----
 include/linux/crypto.h | 28 +++++++++++++++++++++++++++-
 3 files changed, 45 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/cipher.c b/crypto/cipher.c
index c4243345b154..54c4a560070d 100644
--- a/crypto/cipher.c
+++ b/crypto/cipher.c
@@ -23,14 +23,6 @@
 #include "internal.h"
 #include "scatterwalk.h"
 
-struct cipher_desc {
-	struct crypto_tfm *tfm;
-	void (*crfn)(void *ctx, u8 *dst, const u8 *src);
-	unsigned int (*prfn)(const struct cipher_desc *desc, u8 *dst,
-			     const u8 *src, unsigned int nbytes);
-	void *info;
-};
-
 static inline void xor_64(u8 *a, const u8 *b)
 {
 	((u32 *)a)[0] ^= ((u32 *)b)[0];
@@ -224,10 +216,11 @@ static int ecb_encrypt(struct crypto_tfm *tfm,
                        struct scatterlist *src, unsigned int nbytes)
 {
 	struct cipher_desc desc;
+	struct cipher_alg *cipher = &tfm->__crt_alg->cra_cipher;
 
 	desc.tfm = tfm;
-	desc.crfn = tfm->__crt_alg->cra_cipher.cia_encrypt;
-	desc.prfn = ecb_process;
+	desc.crfn = cipher->cia_encrypt;
+	desc.prfn = cipher->cia_encrypt_ecb ?: ecb_process;
 
 	return crypt(&desc, dst, src, nbytes);
 }
@@ -238,10 +231,11 @@ static int ecb_decrypt(struct crypto_tfm *tfm,
 		       unsigned int nbytes)
 {
 	struct cipher_desc desc;
+	struct cipher_alg *cipher = &tfm->__crt_alg->cra_cipher;
 
 	desc.tfm = tfm;
-	desc.crfn = tfm->__crt_alg->cra_cipher.cia_decrypt;
-	desc.prfn = ecb_process;
+	desc.crfn = cipher->cia_decrypt;
+	desc.prfn = cipher->cia_decrypt_ecb ?: ecb_process;
 
 	return crypt(&desc, dst, src, nbytes);
 }
@@ -252,10 +246,11 @@ static int cbc_encrypt(struct crypto_tfm *tfm,
 		       unsigned int nbytes)
 {
 	struct cipher_desc desc;
+	struct cipher_alg *cipher = &tfm->__crt_alg->cra_cipher;
 
 	desc.tfm = tfm;
-	desc.crfn = tfm->__crt_alg->cra_cipher.cia_encrypt;
-	desc.prfn = cbc_process_encrypt;
+	desc.crfn = cipher->cia_encrypt;
+	desc.prfn = cipher->cia_encrypt_cbc ?: cbc_process_encrypt;
 	desc.info = tfm->crt_cipher.cit_iv;
 
 	return crypt(&desc, dst, src, nbytes);
@@ -267,10 +262,11 @@ static int cbc_encrypt_iv(struct crypto_tfm *tfm,
                           unsigned int nbytes, u8 *iv)
 {
 	struct cipher_desc desc;
+	struct cipher_alg *cipher = &tfm->__crt_alg->cra_cipher;
 
 	desc.tfm = tfm;
-	desc.crfn = tfm->__crt_alg->cra_cipher.cia_encrypt;
-	desc.prfn = cbc_process_encrypt;
+	desc.crfn = cipher->cia_encrypt;
+	desc.prfn = cipher->cia_encrypt_cbc ?: cbc_process_encrypt;
 	desc.info = iv;
 
 	return crypt(&desc, dst, src, nbytes);
@@ -282,10 +278,11 @@ static int cbc_decrypt(struct crypto_tfm *tfm,
 		       unsigned int nbytes)
 {
 	struct cipher_desc desc;
+	struct cipher_alg *cipher = &tfm->__crt_alg->cra_cipher;
 
 	desc.tfm = tfm;
-	desc.crfn = tfm->__crt_alg->cra_cipher.cia_decrypt;
-	desc.prfn = cbc_process_decrypt;
+	desc.crfn = cipher->cia_decrypt;
+	desc.prfn = cipher->cia_decrypt_cbc ?: cbc_process_decrypt;
 	desc.info = tfm->crt_cipher.cit_iv;
 
 	return crypt(&desc, dst, src, nbytes);
@@ -297,10 +294,11 @@ static int cbc_decrypt_iv(struct crypto_tfm *tfm,
                           unsigned int nbytes, u8 *iv)
 {
 	struct cipher_desc desc;
+	struct cipher_alg *cipher = &tfm->__crt_alg->cra_cipher;
 
 	desc.tfm = tfm;
-	desc.crfn = tfm->__crt_alg->cra_cipher.cia_decrypt;
-	desc.prfn = cbc_process_decrypt;
+	desc.crfn = cipher->cia_decrypt;
+	desc.prfn = cipher->cia_decrypt_cbc ?: cbc_process_decrypt;
 	desc.info = iv;
 
 	return crypt(&desc, dst, src, nbytes);
diff --git a/crypto/internal.h b/crypto/internal.h
index 964b9a60ca24..5ed383f7dce6 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -42,11 +42,6 @@ static inline void crypto_yield(struct crypto_tfm *tfm)
 		cond_resched();
 }
 
-static inline void *crypto_tfm_ctx(struct crypto_tfm *tfm)
-{
-	return (void *)&tfm[1];
-}
-
 struct crypto_alg *crypto_alg_lookup(const char *name);
 
 /* A far more intelligent version of this is planned.  For now, just
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 387da6a3e58c..26ce01c25745 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -61,6 +61,15 @@
 #define CRYPTO_DIR_DECRYPT		0
 
 struct scatterlist;
+struct crypto_tfm;
+
+struct cipher_desc {
+	struct crypto_tfm *tfm;
+	void (*crfn)(void *ctx, u8 *dst, const u8 *src);
+	unsigned int (*prfn)(const struct cipher_desc *desc, u8 *dst,
+			     const u8 *src, unsigned int nbytes);
+	void *info;
+};
 
 /*
  * Algorithms: modular crypto algorithm implementations, managed
@@ -73,6 +82,19 @@ struct cipher_alg {
 	                  unsigned int keylen, u32 *flags);
 	void (*cia_encrypt)(void *ctx, u8 *dst, const u8 *src);
 	void (*cia_decrypt)(void *ctx, u8 *dst, const u8 *src);
+
+	unsigned int (*cia_encrypt_ecb)(const struct cipher_desc *desc,
+					u8 *dst, const u8 *src,
+					unsigned int nbytes);
+	unsigned int (*cia_decrypt_ecb)(const struct cipher_desc *desc,
+					u8 *dst, const u8 *src,
+					unsigned int nbytes);
+	unsigned int (*cia_encrypt_cbc)(const struct cipher_desc *desc,
+					u8 *dst, const u8 *src,
+					unsigned int nbytes);
+	unsigned int (*cia_decrypt_cbc)(const struct cipher_desc *desc,
+					u8 *dst, const u8 *src,
+					unsigned int nbytes);
 };
 
 struct digest_alg {
@@ -136,7 +158,6 @@ static inline int crypto_alg_available(const char *name, u32 flags)
  * and core processing logic.  Managed via crypto_alloc_tfm() and
  * crypto_free_tfm(), as well as the various helpers below.
  */
-struct crypto_tfm;
 
 struct cipher_tfm {
 	void *cit_iv;
@@ -266,6 +287,11 @@ static inline unsigned int crypto_tfm_alg_digestsize(struct crypto_tfm *tfm)
 	return tfm->__crt_alg->cra_digest.dia_digestsize;
 }
 
+static inline void *crypto_tfm_ctx(struct crypto_tfm *tfm)
+{
+	return (void *)&tfm[1];
+}
+
 /*
  * API wrappers.
  */
-- 
cgit v1.2.3-59-g8ed1b


From 95477377995aefa2ec1654a9a3777bd57ea99146 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 6 Jul 2005 13:52:09 -0700
Subject: [CRYPTO] Add alignmask for low-level cipher implementations

The VIA Padlock device requires the input and output buffers to
be aligned on 16-byte boundaries.  This patch adds the alignmask
attribute for low-level cipher implementations to indicate their
alignment requirements.

The mid-level crypt() function will copy the input/output buffers
if they are not aligned correctly before they are passed to the
low-level implementation.

Strictly speaking, some of the software implementations require
the buffers to be aligned on 4-byte boundaries as they do 32-bit
loads.  However, it is not clear whether it is better to copy
the buffers or pay the penalty for unaligned loads/stores.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 crypto/api.c           |  6 ++++++
 crypto/cipher.c        | 43 ++++++++++++++++++++++++++++++++++++-------
 crypto/scatterwalk.h   |  6 ++++++
 include/linux/crypto.h |  1 +
 4 files changed, 49 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/api.c b/crypto/api.c
index 394169a8577d..f55856b21992 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -168,6 +168,12 @@ int crypto_register_alg(struct crypto_alg *alg)
 {
 	int ret = 0;
 	struct crypto_alg *q;
+
+	if (alg->cra_alignmask & (alg->cra_alignmask + 1))
+		return -EINVAL;
+
+	if (alg->cra_alignmask > PAGE_SIZE)
+		return -EINVAL;
 	
 	down_write(&crypto_alg_sem);
 	
diff --git a/crypto/cipher.c b/crypto/cipher.c
index 54c4a560070d..85eb12f8e564 100644
--- a/crypto/cipher.c
+++ b/crypto/cipher.c
@@ -41,8 +41,10 @@ static unsigned int crypt_slow(const struct cipher_desc *desc,
 			       struct scatter_walk *in,
 			       struct scatter_walk *out, unsigned int bsize)
 {
-	u8 src[bsize];
-	u8 dst[bsize];
+	unsigned int alignmask = desc->tfm->__crt_alg->cra_alignmask;
+	u8 buffer[bsize * 2 + alignmask];
+	u8 *src = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
+	u8 *dst = src + bsize;
 	unsigned int n;
 
 	n = scatterwalk_copychunks(src, in, bsize, 0);
@@ -59,15 +61,24 @@ static unsigned int crypt_slow(const struct cipher_desc *desc,
 static inline unsigned int crypt_fast(const struct cipher_desc *desc,
 				      struct scatter_walk *in,
 				      struct scatter_walk *out,
-				      unsigned int nbytes)
+				      unsigned int nbytes, u8 *tmp)
 {
 	u8 *src, *dst;
 
 	src = in->data;
 	dst = scatterwalk_samebuf(in, out) ? src : out->data;
 
+	if (tmp) {
+		memcpy(tmp, in->data, nbytes);
+		src = tmp;
+		dst = tmp;
+	}
+
 	nbytes = desc->prfn(desc, dst, src, nbytes);
 
+	if (tmp)
+		memcpy(out->data, tmp, nbytes);
+
 	scatterwalk_advance(in, nbytes);
 	scatterwalk_advance(out, nbytes);
 
@@ -87,6 +98,8 @@ static int crypt(const struct cipher_desc *desc,
 	struct scatter_walk walk_in, walk_out;
 	struct crypto_tfm *tfm = desc->tfm;
 	const unsigned int bsize = crypto_tfm_alg_blocksize(tfm);
+	unsigned int alignmask = tfm->__crt_alg->cra_alignmask;
+	unsigned long buffer = 0;
 
 	if (!nbytes)
 		return 0;
@@ -100,16 +113,27 @@ static int crypt(const struct cipher_desc *desc,
 	scatterwalk_start(&walk_out, dst);
 
 	for(;;) {
-		unsigned int n;
+		unsigned int n = nbytes;
+		u8 *tmp = NULL;
+
+		if (!scatterwalk_aligned(&walk_in, alignmask) ||
+		    !scatterwalk_aligned(&walk_out, alignmask)) {
+			if (!buffer) {
+				buffer = __get_free_page(GFP_ATOMIC);
+				if (!buffer)
+					n = 0;
+			}
+			tmp = (u8 *)buffer;
+		}
 
 		scatterwalk_map(&walk_in, 0);
 		scatterwalk_map(&walk_out, 1);
 
-		n = scatterwalk_clamp(&walk_in, nbytes);
+		n = scatterwalk_clamp(&walk_in, n);
 		n = scatterwalk_clamp(&walk_out, n);
 
 		if (likely(n >= bsize))
-			n = crypt_fast(desc, &walk_in, &walk_out, n);
+			n = crypt_fast(desc, &walk_in, &walk_out, n, tmp);
 		else
 			n = crypt_slow(desc, &walk_in, &walk_out, bsize);
 
@@ -119,10 +143,15 @@ static int crypt(const struct cipher_desc *desc,
 		scatterwalk_done(&walk_out, 1, nbytes);
 
 		if (!nbytes)
-			return 0;
+			break;
 
 		crypto_yield(tfm);
 	}
+
+	if (buffer)
+		free_page(buffer);
+
+	return 0;
 }
 
 static unsigned int cbc_process_encrypt(const struct cipher_desc *desc,
diff --git a/crypto/scatterwalk.h b/crypto/scatterwalk.h
index 5495bb970816..e79925c474a3 100644
--- a/crypto/scatterwalk.h
+++ b/crypto/scatterwalk.h
@@ -55,6 +55,12 @@ static inline void scatterwalk_advance(struct scatter_walk *walk,
 	walk->len_this_segment -= nbytes;
 }
 
+static inline unsigned int scatterwalk_aligned(struct scatter_walk *walk,
+					       unsigned int alignmask)
+{
+	return !(walk->offset & alignmask);
+}
+
 void scatterwalk_start(struct scatter_walk *walk, struct scatterlist *sg);
 int scatterwalk_copychunks(void *buf, struct scatter_walk *walk, size_t nbytes, int out);
 void scatterwalk_map(struct scatter_walk *walk, int out);
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 26ce01c25745..ac9d49beecd3 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -124,6 +124,7 @@ struct crypto_alg {
 	u32 cra_flags;
 	unsigned int cra_blocksize;
 	unsigned int cra_ctxsize;
+	unsigned int cra_alignmask;
 	const char cra_name[CRYPTO_MAX_ALG_NAME];
 
 	union {
-- 
cgit v1.2.3-59-g8ed1b


From fbdae9f3e7fb57c07cb0d973f113eb25da2e8ff2 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 6 Jul 2005 13:53:29 -0700
Subject: [CRYPTO] Ensure cit_iv is aligned correctly

This patch ensures that cit_iv is aligned according to cra_alignmask
by allocating it as part of the tfm structure.  As a side effect the
crypto layer will also guarantee that the tfm ctx area has enough space
to be aligned by cra_alignmask.  This allows us to remove the extra
space reservation from the Padlock driver.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 crypto/api.c                 | 32 +++++++++++++++++++++++++++++---
 crypto/cipher.c              | 15 +++++++++------
 crypto/internal.h            | 28 ++++++++++++++++++++++++++++
 drivers/crypto/padlock-aes.c |  3 +--
 include/linux/crypto.h       |  5 +++++
 5 files changed, 72 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/crypto/api.c b/crypto/api.c
index 0b583d24f7fa..2d8d828c0ca2 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -125,20 +125,46 @@ static void crypto_exit_ops(struct crypto_tfm *tfm)
 	}
 }
 
+static unsigned int crypto_ctxsize(struct crypto_alg *alg, int flags)
+{
+	unsigned int len;
+
+	switch (alg->cra_flags & CRYPTO_ALG_TYPE_MASK) {
+	default:
+		BUG();
+
+	case CRYPTO_ALG_TYPE_CIPHER:
+		len = crypto_cipher_ctxsize(alg, flags);
+		break;
+		
+	case CRYPTO_ALG_TYPE_DIGEST:
+		len = crypto_digest_ctxsize(alg, flags);
+		break;
+		
+	case CRYPTO_ALG_TYPE_COMPRESS:
+		len = crypto_compress_ctxsize(alg, flags);
+		break;
+	}
+
+	return len + alg->cra_alignmask;
+}
+
 struct crypto_tfm *crypto_alloc_tfm(const char *name, u32 flags)
 {
 	struct crypto_tfm *tfm = NULL;
 	struct crypto_alg *alg;
+	unsigned int tfm_size;
 
 	alg = crypto_alg_mod_lookup(name);
 	if (alg == NULL)
 		goto out;
-	
-	tfm = kmalloc(sizeof(*tfm) + alg->cra_ctxsize, GFP_KERNEL);
+
+	tfm_size = sizeof(*tfm) + crypto_ctxsize(alg, flags);
+	tfm = kmalloc(tfm_size, GFP_KERNEL);
 	if (tfm == NULL)
 		goto out_put;
 
-	memset(tfm, 0, sizeof(*tfm) + alg->cra_ctxsize);
+	memset(tfm, 0, tfm_size);
 	
 	tfm->__crt_alg = alg;
 	
diff --git a/crypto/cipher.c b/crypto/cipher.c
index 85eb12f8e564..d3295ce14a57 100644
--- a/crypto/cipher.c
+++ b/crypto/cipher.c
@@ -41,7 +41,7 @@ static unsigned int crypt_slow(const struct cipher_desc *desc,
 			       struct scatter_walk *in,
 			       struct scatter_walk *out, unsigned int bsize)
 {
-	unsigned int alignmask = desc->tfm->__crt_alg->cra_alignmask;
+	unsigned int alignmask = crypto_tfm_alg_alignmask(desc->tfm);
 	u8 buffer[bsize * 2 + alignmask];
 	u8 *src = (u8 *)ALIGN((unsigned long)buffer, alignmask + 1);
 	u8 *dst = src + bsize;
@@ -98,7 +98,7 @@ static int crypt(const struct cipher_desc *desc,
 	struct scatter_walk walk_in, walk_out;
 	struct crypto_tfm *tfm = desc->tfm;
 	const unsigned int bsize = crypto_tfm_alg_blocksize(tfm);
-	unsigned int alignmask = tfm->__crt_alg->cra_alignmask;
+	unsigned int alignmask = crypto_tfm_alg_alignmask(tfm);
 	unsigned long buffer = 0;
 
 	if (!nbytes)
@@ -399,6 +399,8 @@ int crypto_init_cipher_ops(struct crypto_tfm *tfm)
 	}
 	
 	if (ops->cit_mode == CRYPTO_TFM_MODE_CBC) {
+		unsigned int align;
+		unsigned long addr;
 	    	
 	    	switch (crypto_tfm_alg_blocksize(tfm)) {
 	    	case 8:
@@ -418,9 +420,11 @@ int crypto_init_cipher_ops(struct crypto_tfm *tfm)
 	    	}
 	    	
 		ops->cit_ivsize = crypto_tfm_alg_blocksize(tfm);
-	    	ops->cit_iv = kmalloc(ops->cit_ivsize, GFP_KERNEL);
-		if (ops->cit_iv == NULL)
-			ret = -ENOMEM;
+		align = crypto_tfm_alg_alignmask(tfm) + 1;
+		addr = (unsigned long)crypto_tfm_ctx(tfm);
+		addr = ALIGN(addr, align);
+		addr += ALIGN(tfm->__crt_alg->cra_ctxsize, align);
+		ops->cit_iv = (void *)addr;
 	}
 
 out:	
@@ -429,5 +433,4 @@ out:
 
 void crypto_exit_cipher_ops(struct crypto_tfm *tfm)
 {
-	kfree(tfm->crt_cipher.cit_iv);
 }
diff --git a/crypto/internal.h b/crypto/internal.h
index 83b1b6d6d92b..68612874b5fd 100644
--- a/crypto/internal.h
+++ b/crypto/internal.h
@@ -16,6 +16,7 @@
 #include <linux/highmem.h>
 #include <linux/interrupt.h>
 #include <linux/init.h>
+#include <linux/kernel.h>
 #include <asm/kmap_types.h>
 
 extern enum km_type crypto_km_types[];
@@ -61,6 +62,33 @@ static inline void crypto_init_proc(void)
 { }
 #endif
 
+static inline unsigned int crypto_digest_ctxsize(struct crypto_alg *alg,
+						 int flags)
+{
+	return alg->cra_ctxsize;
+}
+
+static inline unsigned int crypto_cipher_ctxsize(struct crypto_alg *alg,
+						 int flags)
+{
+	unsigned int len = alg->cra_ctxsize;
+	
+	switch (flags & CRYPTO_TFM_MODE_MASK) {
+	case CRYPTO_TFM_MODE_CBC:
+		len = ALIGN(len, alg->cra_alignmask + 1);
+		len += alg->cra_blocksize;
+		break;
+	}
+
+	return len;
+}
+
+static inline unsigned int crypto_compress_ctxsize(struct crypto_alg *alg,
+						   int flags)
+{
+	return alg->cra_ctxsize;
+}
+
 int crypto_init_digest_flags(struct crypto_tfm *tfm, u32 flags);
 int crypto_init_cipher_flags(struct crypto_tfm *tfm, u32 flags);
 int crypto_init_compress_flags(struct crypto_tfm *tfm, u32 flags);
diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c
index d2745ff4699c..c5b58fae95f2 100644
--- a/drivers/crypto/padlock-aes.c
+++ b/drivers/crypto/padlock-aes.c
@@ -465,8 +465,7 @@ static struct crypto_alg aes_alg = {
 	.cra_name		=	"aes",
 	.cra_flags		=	CRYPTO_ALG_TYPE_CIPHER,
 	.cra_blocksize		=	AES_BLOCK_SIZE,
-	.cra_ctxsize		=	sizeof(struct aes_ctx) +
-					PADLOCK_ALIGNMENT,
+	.cra_ctxsize		=	sizeof(struct aes_ctx),
 	.cra_alignmask		=	PADLOCK_ALIGNMENT - 1,
 	.cra_module		=	THIS_MODULE,
 	.cra_list		=	LIST_HEAD_INIT(aes_alg.cra_list),
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index ac9d49beecd3..5e2bcc636a02 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -288,6 +288,11 @@ static inline unsigned int crypto_tfm_alg_digestsize(struct crypto_tfm *tfm)
 	return tfm->__crt_alg->cra_digest.dia_digestsize;
 }
 
+static inline unsigned int crypto_tfm_alg_alignmask(struct crypto_tfm *tfm)
+{
+	return tfm->__crt_alg->cra_alignmask;
+}
+
 static inline void *crypto_tfm_ctx(struct crypto_tfm *tfm)
 {
 	return (void *)&tfm[1];
-- 
cgit v1.2.3-59-g8ed1b


From 97f927a4d7dbccde0a854a62c3ea54d90bae8679 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@tglx.tec.linutronix.de>
Date: Thu, 7 Jul 2005 16:50:16 +0200
Subject: [MTD] XIP cleanup

Move the architecture-dependent code into include/asm/mtd-xip.h

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 drivers/mtd/chips/cfi_cmdset_0001.c   |  2 +-
 drivers/mtd/chips/cfi_cmdset_0002.c   |  2 +-
 include/asm-arm/arch-pxa/mtd-xip.h    | 37 +++++++++++++++++++++++++++++++++++
 include/asm-arm/arch-sa1100/mtd-xip.h | 26 ++++++++++++++++++++++++
 include/asm-arm/mtd-xip.h             | 26 ++++++++++++++++++++++++
 include/linux/mtd/xip.h               | 31 ++++++++++++-----------------
 6 files changed, 104 insertions(+), 20 deletions(-)
 create mode 100644 include/asm-arm/arch-pxa/mtd-xip.h
 create mode 100644 include/asm-arm/arch-sa1100/mtd-xip.h
 create mode 100644 include/asm-arm/mtd-xip.h

(limited to 'include/linux')

diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c
index 8b1304531d8f..0cfcd88468e0 100644
--- a/drivers/mtd/chips/cfi_cmdset_0001.c
+++ b/drivers/mtd/chips/cfi_cmdset_0001.c
@@ -845,7 +845,7 @@ static void __xipram xip_enable(struct map_info *map, struct flchip *chip,
 		chip->state = FL_READY;
 	}
 	(void) map_read(map, adr);
-	asm volatile (".rep 8; nop; .endr"); /* fill instruction prefetch */
+	xip_iprefetch();
 	local_irq_enable();
 }
 
diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c
index c76c30de48fb..8505f118f2db 100644
--- a/drivers/mtd/chips/cfi_cmdset_0002.c
+++ b/drivers/mtd/chips/cfi_cmdset_0002.c
@@ -600,7 +600,7 @@ static void __xipram xip_enable(struct map_info *map, struct flchip *chip,
 		chip->state = FL_READY;
 	}
 	(void) map_read(map, adr);
-	asm volatile (".rep 8; nop; .endr"); /* fill instruction prefetch */
+	xip_iprefetch();
 	local_irq_enable();
 }
 
diff --git a/include/asm-arm/arch-pxa/mtd-xip.h b/include/asm-arm/arch-pxa/mtd-xip.h
new file mode 100644
index 000000000000..8704dbceb432
--- /dev/null
+++ b/include/asm-arm/arch-pxa/mtd-xip.h
@@ -0,0 +1,37 @@
+/*
+ * MTD primitives for XIP support. Architecture specific functions
+ *
+ * Do not include this file directly. It's included from linux/mtd/xip.h
+ * 
+ * Author:	Nicolas Pitre
+ * Created:	Nov 2, 2004
+ * Copyright:	(C) 2004 MontaVista Software, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * $Id: xip.h,v 1.2 2004/12/01 15:49:10 nico Exp $
+ */
+
+#ifndef __ARCH_PXA_MTD_XIP_H__
+#define __ARCH_PXA_MTD_XIP_H__
+
+#include <asm/arch/pxa-regs.h>
+
+#define xip_irqpending()	(ICIP & ICMR)
+
+/* we sample OSCR and convert desired delta to usec (1/4 ~= 1000000/3686400) */
+#define xip_currtime()		(OSCR)
+#define xip_elapsed_since(x)	(signed)((OSCR - (x)) / 4)
+
+/*
+ * xip_cpu_idle() is used when waiting for a delay equal to or larger than
+ * the system timer tick period.  This should put the CPU into idle mode
+ * to save power and to be woken up only when some interrupts are pending.
+ * As above, this should not rely upon standard kernel code.
+ */
+
+#define xip_cpu_idle()  asm volatile ("mcr p14, 0, %0, c7, c0, 0" :: "r" (1))
+
+#endif /* __ARCH_PXA_MTD_XIP_H__ */
diff --git a/include/asm-arm/arch-sa1100/mtd-xip.h b/include/asm-arm/arch-sa1100/mtd-xip.h
new file mode 100644
index 000000000000..80cfdac2b944
--- /dev/null
+++ b/include/asm-arm/arch-sa1100/mtd-xip.h
@@ -0,0 +1,26 @@
+/*
+ * MTD primitives for XIP support. Architecture specific functions
+ *
+ * Do not include this file directly. It's included from linux/mtd/xip.h
+ * 
+ * Author:	Nicolas Pitre
+ * Created:	Nov 2, 2004
+ * Copyright:	(C) 2004 MontaVista Software, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * $Id: xip.h,v 1.2 2004/12/01 15:49:10 nico Exp $
+ */
+
+#ifndef __ARCH_SA1100_MTD_XIP_H__
+#define __ARCH_SA1100_MTD_XIP_H__
+
+#define xip_irqpending()	(ICIP & ICMR)
+
+/* we sample OSCR and convert desired delta to usec (1/4 ~= 1000000/3686400) */
+#define xip_currtime()		(OSCR)
+#define xip_elapsed_since(x)	(signed)((OSCR - (x)) / 4)
+
+#endif /* __ARCH_SA1100_MTD_XIP_H__ */
diff --git a/include/asm-arm/mtd-xip.h b/include/asm-arm/mtd-xip.h
new file mode 100644
index 000000000000..9eb127cc7db2
--- /dev/null
+++ b/include/asm-arm/mtd-xip.h
@@ -0,0 +1,26 @@
+/*
+ * MTD primitives for XIP support. Architecture specific functions
+ *
+ * Do not include this file directly. It's included from linux/mtd/xip.h
+ * 
+ * Author:	Nicolas Pitre
+ * Created:	Nov 2, 2004
+ * Copyright:	(C) 2004 MontaVista Software, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * $Id: xip.h,v 1.2 2004/12/01 15:49:10 nico Exp $
+ */
+
+#ifndef __ARM_MTD_XIP_H__
+#define __ARM_MTD_XIP_H__
+
+#include <asm/hardware.h>
+#include <asm/arch/mtd-xip.h>
+
+/* fill instruction prefetch */
+#define xip_iprefetch() 	do { asm volatile (".rep 8; nop; .endr"); } while (0)
+
+#endif /* __ARM_MTD_XIP_H__ */
diff --git a/include/linux/mtd/xip.h b/include/linux/mtd/xip.h
index fc071125cbcc..7b7deef6b180 100644
--- a/include/linux/mtd/xip.h
+++ b/include/linux/mtd/xip.h
@@ -58,22 +58,16 @@
  * 		returned value is <= the real elapsed time.
  * 	note 2: this should be able to cope with a few seconds without
  * 		overflowing.
+ *
+ * xip_iprefetch()
+ *  
+ *      Macro to fill instruction prefetch
+ *	e.g. a series of nops:  asm volatile (".rep 8; nop; .endr"); 
  */
 
-#if defined(CONFIG_ARCH_SA1100) || defined(CONFIG_ARCH_PXA)
-
-#include <asm/hardware.h>
-#ifdef CONFIG_ARCH_PXA
-#include <asm/arch/pxa-regs.h>
-#endif
-
-#define xip_irqpending()	(ICIP & ICMR)
-
-/* we sample OSCR and convert desired delta to usec (1/4 ~= 1000000/3686400) */
-#define xip_currtime()		(OSCR)
-#define xip_elapsed_since(x)	(signed)((OSCR - (x)) / 4)
+#include <asm/mtd-xip.h>
 
-#else
+#ifndef xip_irqpending
 
 #warning "missing IRQ and timer primitives for XIP MTD support"
 #warning "some of the XIP MTD support code will be disabled"
@@ -85,16 +79,17 @@
 
 #endif
 
+#ifndef xip_iprefetch
+#define xip_iprefetch()		do { } while (0)
+#endif
+
 /*
  * xip_cpu_idle() is used when waiting for a delay equal or larger than
  * the system timer tick period.  This should put the CPU into idle mode
  * to save power and to be woken up only when some interrupts are pending.
- * As above, this should not rely upon standard kernel code.
+ * This should not rely upon standard kernel code.
  */
-
-#if defined(CONFIG_CPU_XSCALE)
-#define xip_cpu_idle()  asm volatile ("mcr p14, 0, %0, c7, c0, 0" :: "r" (1))
-#else
+#ifndef xip_cpu_idle
 #define xip_cpu_idle()  do { } while (0)
 #endif
 
-- 
cgit v1.2.3-59-g8ed1b


From cb2c0233755429037462e16ea0d5497a0092738c Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mark.fasheh@oracle.com>
Date: Thu, 7 Jul 2005 17:56:03 -0700
Subject: [PATCH] export generic_drop_inode() to modules

OCFS2 wants to mark an inode which has been orphaned by another node so
that during final iput it takes the correct path through the VFS and can
pass through the OCFS2 delete_inode callback.  Since i_nlink can get out of
date with other nodes, the best way I see to accomplish this is by clearing
i_nlink on those inodes at drop_inode time.  Other than this small amount
of work, nothing different needs to happen, so I think it would be cleanest
to be able to just call generic_drop_inode at the end of the OCFS2
drop_inode callback.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inode.c         | 4 +++-
 include/linux/fs.h | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 1f9a3a2b89bc..6d695037a0a3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1052,7 +1052,7 @@ static void generic_forget_inode(struct inode *inode)
  * inode when the usage count drops to zero, and
  * i_nlink is zero.
  */
-static void generic_drop_inode(struct inode *inode)
+void generic_drop_inode(struct inode *inode)
 {
 	if (!inode->i_nlink)
 		generic_delete_inode(inode);
@@ -1060,6 +1060,8 @@ static void generic_drop_inode(struct inode *inode)
 		generic_forget_inode(inode);
 }
 
+EXPORT_SYMBOL_GPL(generic_drop_inode);
+
 /*
  * Called when we're dropping the last reference
  * to an inode. 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 047bde30836a..302ec20838ca 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1435,6 +1435,7 @@ extern struct inode * igrab(struct inode *);
 extern ino_t iunique(struct super_block *, ino_t);
 extern int inode_needs_sync(struct inode *inode);
 extern void generic_delete_inode(struct inode *inode);
+extern void generic_drop_inode(struct inode *inode);
 
 extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
 		int (*test)(struct inode *, void *), void *data);
-- 
cgit v1.2.3-59-g8ed1b


From 79b9ce311e192e9a31fd9f3cf1ee4a4edf9e2650 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <marcelo.tosatti@cyclades.com>
Date: Thu, 7 Jul 2005 17:56:04 -0700
Subject: [PATCH] print order information when OOM killing

Dump the current allocation order when OOM killing.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/sysrq.c | 2 +-
 include/linux/swap.h | 2 +-
 mm/oom_kill.c        | 4 ++--
 mm/page_alloc.c      | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index af79805b5576..12d563c648f7 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -228,7 +228,7 @@ static struct sysrq_key_op sysrq_term_op = {
 
 static void moom_callback(void *ignored)
 {
-	out_of_memory(GFP_KERNEL);
+	out_of_memory(GFP_KERNEL, 0);
 }
 
 static DECLARE_WORK(moom_work, moom_callback, NULL);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 2343f999e6e1..c75954f2d868 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -148,7 +148,7 @@ struct swap_list_t {
 #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
 
 /* linux/mm/oom_kill.c */
-extern void out_of_memory(unsigned int __nocast gfp_mask);
+extern void out_of_memory(unsigned int __nocast gfp_mask, int order);
 
 /* linux/mm/memory.c */
 extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 59666d905f19..e20d559edbaf 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -253,12 +253,12 @@ static struct mm_struct *oom_kill_process(struct task_struct *p)
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-void out_of_memory(unsigned int __nocast gfp_mask)
+void out_of_memory(unsigned int __nocast gfp_mask, int order)
 {
 	struct mm_struct *mm = NULL;
 	task_t * p;
 
-	printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
+	printk("oom-killer: gfp_mask=0x%x, order=%d\n", gfp_mask, order);
 	/* print memory stats */
 	show_mem();
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3c9f7f881125..7fbd3ea8765c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -936,7 +936,7 @@ rebalance:
 				goto got_pg;
 		}
 
-		out_of_memory(gfp_mask);
+		out_of_memory(gfp_mask, order);
 		goto restart;
 	}
 
-- 
cgit v1.2.3-59-g8ed1b


From cf36680887d6d942d2119c1ff1dfb2428b0f21f4 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 7 Jul 2005 17:56:13 -0700
Subject: [PATCH] move ioprio syscalls into syscalls.h

- Make ioprio syscalls return long, like set/getpriority syscalls.
- Move function prototypes into syscalls.h so we can pick them up in the
  32/64bit compat code.

Signed-off-by: Anton Blanchard <anton@samba.org>
Acked-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ioprio.c              | 4 ++--
 include/linux/ioprio.h   | 3 ---
 include/linux/syscalls.h | 3 +++
 3 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ioprio.c b/fs/ioprio.c
index 663e420636d6..97e1f088ba00 100644
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -43,7 +43,7 @@ static int set_task_ioprio(struct task_struct *task, int ioprio)
 	return 0;
 }
 
-asmlinkage int sys_ioprio_set(int which, int who, int ioprio)
+asmlinkage long sys_ioprio_set(int which, int who, int ioprio)
 {
 	int class = IOPRIO_PRIO_CLASS(ioprio);
 	int data = IOPRIO_PRIO_DATA(ioprio);
@@ -115,7 +115,7 @@ asmlinkage int sys_ioprio_set(int which, int who, int ioprio)
 	return ret;
 }
 
-asmlinkage int sys_ioprio_get(int which, int who)
+asmlinkage long sys_ioprio_get(int which, int who)
 {
 	struct task_struct *g, *p;
 	struct user_struct *user;
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index 8a453a0b5e4b..88d5961f7a3f 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -34,9 +34,6 @@ enum {
  */
 #define IOPRIO_BE_NR	(8)
 
-asmlinkage int sys_ioprio_set(int, int, int);
-asmlinkage int sys_ioprio_get(int, int);
-
 enum {
 	IOPRIO_WHO_PROCESS = 1,
 	IOPRIO_WHO_PGRP,
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 52830b6d94e5..425f58c8ea4a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -506,4 +506,7 @@ asmlinkage long sys_request_key(const char __user *_type,
 asmlinkage long sys_keyctl(int cmd, unsigned long arg2, unsigned long arg3,
 			   unsigned long arg4, unsigned long arg5);
 
+asmlinkage long sys_ioprio_set(int which, int who, int ioprio);
+asmlinkage long sys_ioprio_get(int which, int who);
+
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From e00d9967e3addea86dded46deefc5daec5d52e5a Mon Sep 17 00:00:00 2001
From: Bernard Blackham <bernard@blackham.com.au>
Date: Thu, 7 Jul 2005 17:56:42 -0700
Subject: [PATCH] pm: fix u32 vs. pm_message_t confusion in cpufreq

Fix u32 vs pm_message_t confusion in cpufreq.

Signed-off-by: Bernard Blackham <bernard@blackham.com.au>
Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: Dave Jones <davej@codemonkey.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc/platforms/pmac_cpufreq.c | 2 +-
 drivers/cpufreq/cpufreq.c         | 4 ++--
 include/linux/cpufreq.h           | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ppc/platforms/pmac_cpufreq.c b/arch/ppc/platforms/pmac_cpufreq.c
index 5fdd4f607a40..c0605244edda 100644
--- a/arch/ppc/platforms/pmac_cpufreq.c
+++ b/arch/ppc/platforms/pmac_cpufreq.c
@@ -452,7 +452,7 @@ static u32 __pmac read_gpio(struct device_node *np)
 	return offset;
 }
 
-static int __pmac pmac_cpufreq_suspend(struct cpufreq_policy *policy, u32 state)
+static int __pmac pmac_cpufreq_suspend(struct cpufreq_policy *policy, pm_message_t pmsg)
 {
 	/* Ok, this could be made a bit smarter, but let's be robust for now. We
 	 * always force a speed change to high speed before sleep, to make sure
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index bf62dfe4976a..7a7859dd0d98 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -869,7 +869,7 @@ EXPORT_SYMBOL(cpufreq_get);
  *	cpufreq_suspend - let the low level driver prepare for suspend
  */
 
-static int cpufreq_suspend(struct sys_device * sysdev, u32 state)
+static int cpufreq_suspend(struct sys_device * sysdev, pm_message_t pmsg)
 {
 	int cpu = sysdev->id;
 	unsigned int ret = 0;
@@ -897,7 +897,7 @@ static int cpufreq_suspend(struct sys_device * sysdev, u32 state)
 	}
 
 	if (cpufreq_driver->suspend) {
-		ret = cpufreq_driver->suspend(cpu_policy, state);
+		ret = cpufreq_driver->suspend(cpu_policy, pmsg);
 		if (ret) {
 			printk(KERN_ERR "cpufreq: suspend failed in ->suspend "
 					"step on CPU %u\n", cpu_policy->cpu);
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 927daa86c9b3..ff7f80f48df1 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -201,7 +201,7 @@ struct cpufreq_driver {
 
 	/* optional */
 	int	(*exit)		(struct cpufreq_policy *policy);
-	int	(*suspend)	(struct cpufreq_policy *policy, u32 state);
+	int	(*suspend)	(struct cpufreq_policy *policy, pm_message_t pmsg);
 	int	(*resume)	(struct cpufreq_policy *policy);
 	struct freq_attr	**attr;
 };
-- 
cgit v1.2.3-59-g8ed1b


From a39722034ae37f80a1803bf781fe3fe1b03e20bc Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Thu, 7 Jul 2005 17:56:56 -0700
Subject: [PATCH] page_uptodate locking scalability

Use a bit spin lock in the first buffer of the page to synchronise asynch
IO buffer completions, instead of the global page_uptodate_lock, which is
showing some scalability problems.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c                 | 25 +++++++++++++++++--------
 include/linux/buffer_head.h |  3 +++
 2 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 561e63a14966..6a25d7df89b1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -513,8 +513,8 @@ static void free_more_memory(void)
  */
 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 {
-	static DEFINE_SPINLOCK(page_uptodate_lock);
 	unsigned long flags;
+	struct buffer_head *first;
 	struct buffer_head *tmp;
 	struct page *page;
 	int page_uptodate = 1;
@@ -536,7 +536,9 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 	 * two buffer heads end IO at almost the same time and both
 	 * decide that the page is now completely done.
 	 */
-	spin_lock_irqsave(&page_uptodate_lock, flags);
+	first = page_buffers(page);
+	local_irq_save(flags);
+	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
 	clear_buffer_async_read(bh);
 	unlock_buffer(bh);
 	tmp = bh;
@@ -549,7 +551,8 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 		}
 		tmp = tmp->b_this_page;
 	} while (tmp != bh);
-	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
+	local_irq_restore(flags);
 
 	/*
 	 * If none of the buffers had errors and they are all
@@ -561,7 +564,8 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
 	return;
 
 still_busy:
-	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
+	local_irq_restore(flags);
 	return;
 }
 
@@ -572,8 +576,8 @@ still_busy:
 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 {
 	char b[BDEVNAME_SIZE];
-	static DEFINE_SPINLOCK(page_uptodate_lock);
 	unsigned long flags;
+	struct buffer_head *first;
 	struct buffer_head *tmp;
 	struct page *page;
 
@@ -594,7 +598,10 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 		SetPageError(page);
 	}
 
-	spin_lock_irqsave(&page_uptodate_lock, flags);
+	first = page_buffers(page);
+	local_irq_save(flags);
+	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+
 	clear_buffer_async_write(bh);
 	unlock_buffer(bh);
 	tmp = bh->b_this_page;
@@ -605,12 +612,14 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate)
 		}
 		tmp = tmp->b_this_page;
 	}
-	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
+	local_irq_restore(flags);
 	end_page_writeback(page);
 	return;
 
 still_busy:
-	spin_unlock_irqrestore(&page_uptodate_lock, flags);
+	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
+	local_irq_restore(flags);
 	return;
 }
 
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 802c91e9b3da..90828493791f 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -19,6 +19,9 @@ enum bh_state_bits {
 	BH_Dirty,	/* Is dirty */
 	BH_Lock,	/* Is locked */
 	BH_Req,		/* Has been submitted for I/O */
+	BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise
+			  * IO completion of other buffers in the page
+			  */
 
 	BH_Mapped,	/* Has a disk mapping */
 	BH_New,		/* Disk mapping was newly created by get_block */
-- 
cgit v1.2.3-59-g8ed1b


From 0db925af1db5f3dfe1691c35b39496e2baaff9c9 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 7 Jul 2005 17:56:58 -0700
Subject: [PATCH] propagate __nocast annotations

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/gfp.h    |  4 ++--
 include/linux/slab.h   |  4 ++--
 include/linux/string.h |  2 +-
 mm/mempool.c           |  2 +-
 mm/slab.c              | 12 +++++++-----
 5 files changed, 13 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 8d6bf608b199..7c7400137e97 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -12,8 +12,8 @@ struct vm_area_struct;
  * GFP bitmasks..
  */
 /* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */
-#define __GFP_DMA	0x01
-#define __GFP_HIGHMEM	0x02
+#define __GFP_DMA	0x01u
+#define __GFP_HIGHMEM	0x02u
 
 /*
  * Action modifiers - doesn't change the zoning
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 76cf7e60216c..4c8e552471b0 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -65,7 +65,7 @@ extern void *kmem_cache_alloc(kmem_cache_t *, unsigned int __nocast);
 extern void kmem_cache_free(kmem_cache_t *, void *);
 extern unsigned int kmem_cache_size(kmem_cache_t *);
 extern const char *kmem_cache_name(kmem_cache_t *);
-extern kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags);
+extern kmem_cache_t *kmem_find_general_cachep(size_t size, unsigned int __nocast gfpflags);
 
 /* Size description struct for general caches. */
 struct cache_sizes {
@@ -105,7 +105,7 @@ extern unsigned int ksize(const void *);
 
 #ifdef CONFIG_NUMA
 extern void *kmem_cache_alloc_node(kmem_cache_t *, int flags, int node);
-extern void *kmalloc_node(size_t size, int flags, int node);
+extern void *kmalloc_node(size_t size, unsigned int __nocast flags, int node);
 #else
 static inline void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int node)
 {
diff --git a/include/linux/string.h b/include/linux/string.h
index 93994c613095..dab2652acbd8 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -88,7 +88,7 @@ extern int memcmp(const void *,const void *,__kernel_size_t);
 extern void * memchr(const void *,int,__kernel_size_t);
 #endif
 
-extern char *kstrdup(const char *s, int gfp);
+extern char *kstrdup(const char *s, unsigned int __nocast gfp);
 
 #ifdef __cplusplus
 }
diff --git a/mm/mempool.c b/mm/mempool.c
index 9a72f7d918fa..65f2957b8d51 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -205,7 +205,7 @@ void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask)
 	void *element;
 	unsigned long flags;
 	wait_queue_t wait;
-	int gfp_temp;
+	unsigned int gfp_temp;
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 
diff --git a/mm/slab.c b/mm/slab.c
index e57abd45eede..c9e706db4634 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -584,7 +584,8 @@ static inline struct array_cache *ac_data(kmem_cache_t *cachep)
 	return cachep->array[smp_processor_id()];
 }
 
-static inline kmem_cache_t *__find_general_cachep(size_t size, int gfpflags)
+static inline kmem_cache_t *__find_general_cachep(size_t size,
+						unsigned int __nocast gfpflags)
 {
 	struct cache_sizes *csizep = malloc_sizes;
 
@@ -608,7 +609,8 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, int gfpflags)
 	return csizep->cs_cachep;
 }
 
-kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags)
+kmem_cache_t *kmem_find_general_cachep(size_t size,
+		unsigned int __nocast gfpflags)
 {
 	return __find_general_cachep(size, gfpflags);
 }
@@ -2100,7 +2102,7 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags)
 #if DEBUG
 static void *
 cache_alloc_debugcheck_after(kmem_cache_t *cachep,
-			unsigned long flags, void *objp, void *caller)
+			unsigned int __nocast flags, void *objp, void *caller)
 {
 	if (!objp)	
 		return objp;
@@ -2442,7 +2444,7 @@ got_slabp:
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node);
 
-void *kmalloc_node(size_t size, int flags, int node)
+void *kmalloc_node(size_t size, unsigned int __nocast flags, int node)
 {
 	kmem_cache_t *cachep;
 
@@ -3094,7 +3096,7 @@ unsigned int ksize(const void *objp)
  * @s: the string to duplicate
  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
  */
-char *kstrdup(const char *s, int gfp)
+char *kstrdup(const char *s, unsigned int __nocast gfp)
 {
 	size_t len;
 	char *buf;
-- 
cgit v1.2.3-59-g8ed1b


From 6c036527a630720063b67d9a65455e8caca2c8fa Mon Sep 17 00:00:00 2001
From: Christoph Lameter <christoph@lameter.com>
Date: Thu, 7 Jul 2005 17:56:59 -0700
Subject: [PATCH] mostly_read data section

Add a new section called ".data.read_mostly" for data items that are read
frequently and rarely written to like cpumaps etc.

If these maps are placed in the .data section then these frequently read
items may end up in cachelines with data that is frequently updated.  In
that case all processors in an SMP system must needlessly reload, again and
again, the cachelines containing those frequently used variables.

The ability to share these cachelines will allow each cpu in an SMP system
to keep local copies of those shared cachelines thereby optimizing
performance.

Signed-off-by: Alok N Kataria <alokk@calsoftinc.com>
Signed-off-by: Shobhit Dayal <shobhit@calsoftinc.com>
Signed-off-by: Christoph Lameter <christoph@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpu/intel.c         |  2 +-
 arch/i386/kernel/smpboot.c           | 18 +++++++++---------
 arch/i386/kernel/time.c              |  2 +-
 arch/i386/kernel/timers/timer_hpet.c |  4 ++--
 arch/i386/kernel/vmlinux.lds.S       |  3 +++
 arch/x86_64/kernel/vmlinux.lds.S     |  4 ++++
 drivers/char/random.c                |  2 +-
 fs/bio.c                             |  2 +-
 include/linux/cache.h                |  6 ++++++
 kernel/profile.c                     |  4 ++--
 lib/radix-tree.c                     |  2 +-
 11 files changed, 31 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index 96a75d045835..a2c33c1a46c5 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -25,7 +25,7 @@ extern int trap_init_f00f_bug(void);
 /*
  * Alignment at which movsl is preferred for bulk memory copies.
  */
-struct movsl_mask movsl_mask;
+struct movsl_mask movsl_mask __read_mostly;
 #endif
 
 void __devinit early_intel_workaround(struct cpuinfo_x86 *c)
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index d66bf489a2e9..8ac8e9fd5614 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -68,21 +68,21 @@ EXPORT_SYMBOL(smp_num_siblings);
 #endif
 
 /* Package ID of each logical CPU */
-int phys_proc_id[NR_CPUS] = {[0 ... NR_CPUS-1] = BAD_APICID};
+int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
 EXPORT_SYMBOL(phys_proc_id);
 
 /* Core ID of each logical CPU */
-int cpu_core_id[NR_CPUS] = {[0 ... NR_CPUS-1] = BAD_APICID};
+int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
 EXPORT_SYMBOL(cpu_core_id);
 
-cpumask_t cpu_sibling_map[NR_CPUS];
+cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(cpu_sibling_map);
 
-cpumask_t cpu_core_map[NR_CPUS];
+cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(cpu_core_map);
 
 /* bitmap of online cpus */
-cpumask_t cpu_online_map;
+cpumask_t cpu_online_map __read_mostly;
 EXPORT_SYMBOL(cpu_online_map);
 
 cpumask_t cpu_callin_map;
@@ -100,7 +100,7 @@ static int __devinitdata tsc_sync_disabled;
 struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
 EXPORT_SYMBOL(cpu_data);
 
-u8 x86_cpu_to_apicid[NR_CPUS] =
+u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly =
 			{ [0 ... NR_CPUS-1] = 0xff };
 EXPORT_SYMBOL(x86_cpu_to_apicid);
 
@@ -550,10 +550,10 @@ extern struct {
 #ifdef CONFIG_NUMA
 
 /* which logical CPUs are on which nodes */
-cpumask_t node_2_cpu_mask[MAX_NUMNODES] =
+cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly =
 				{ [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
 /* which node each logical CPU is on */
-int cpu_2_node[NR_CPUS] = { [0 ... NR_CPUS-1] = 0 };
+int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
 EXPORT_SYMBOL(cpu_2_node);
 
 /* set up a mapping between cpu and node. */
@@ -581,7 +581,7 @@ static inline void unmap_cpu_to_node(int cpu)
 
 #endif /* CONFIG_NUMA */
 
-u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 
 static void map_cpu_to_logical_apicid(void)
 {
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 2854c357377f..0ee9dee8af06 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -91,7 +91,7 @@ EXPORT_SYMBOL(rtc_lock);
 DEFINE_SPINLOCK(i8253_lock);
 EXPORT_SYMBOL(i8253_lock);
 
-struct timer_opts *cur_timer = &timer_none;
+struct timer_opts *cur_timer __read_mostly = &timer_none;
 
 /*
  * This is a special lock that is owned by the CPU and holds the index
diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c
index d766e0963ac1..ef8dac5dd33b 100644
--- a/arch/i386/kernel/timers/timer_hpet.c
+++ b/arch/i386/kernel/timers/timer_hpet.c
@@ -18,7 +18,7 @@
 #include "mach_timer.h"
 #include <asm/hpet.h>
 
-static unsigned long hpet_usec_quotient;	/* convert hpet clks to usec */
+static unsigned long __read_mostly hpet_usec_quotient;	/* convert hpet clks to usec */
 static unsigned long tsc_hpet_quotient;		/* convert tsc to hpet clks */
 static unsigned long hpet_last; 	/* hpet counter value at last tick*/
 static unsigned long last_tsc_low;	/* lsb 32 bits of Time Stamp Counter */
@@ -180,7 +180,7 @@ static int __init init_hpet(char* override)
 /************************************************************/
 
 /* tsc timer_opts struct */
-static struct timer_opts timer_hpet = {
+static struct timer_opts timer_hpet __read_mostly = {
 	.name = 		"hpet",
 	.mark_offset =		mark_offset_hpet,
 	.get_offset =		get_offset_hpet,
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 7e01a528a83a..761972f8cb6c 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -57,6 +57,9 @@ SECTIONS
 	*(.data.cacheline_aligned)
   }
 
+  /* rarely changed data like cpu maps */
+  . = ALIGN(32);
+  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) }
   _edata = .;			/* End of data section */
 
   . = ALIGN(THREAD_SIZE);	/* init_task */
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 73389f51c4e5..61c12758ca70 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -56,6 +56,10 @@ SECTIONS
   .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
 	*(.data.cacheline_aligned)
   }
+  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+  	*(.data.read_mostly)
+  }
 
 #define VSYSCALL_ADDR (-10*1024*1024)
 #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095))
diff --git a/drivers/char/random.c b/drivers/char/random.c
index 460b5d475edd..6b11d6b2129f 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -271,7 +271,7 @@ static int random_write_wakeup_thresh = 128;
  * samples to avoid wasting CPU time and reduce lock contention.
  */
 
-static int trickle_thresh = INPUT_POOL_WORDS * 28;
+static int trickle_thresh __read_mostly = INPUT_POOL_WORDS * 28;
 
 static DEFINE_PER_CPU(int, trickle_count) = 0;
 
diff --git a/fs/bio.c b/fs/bio.c
index 3a1472acc361..ca8f7a850fe3 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -52,7 +52,7 @@ struct biovec_slab {
  */
 
 #define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
-static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] = {
+static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
 	BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
 };
 #undef BV
diff --git a/include/linux/cache.h b/include/linux/cache.h
index 4d767b93738a..2b66a36d85f0 100644
--- a/include/linux/cache.h
+++ b/include/linux/cache.h
@@ -13,6 +13,12 @@
 #define SMP_CACHE_BYTES L1_CACHE_BYTES
 #endif
 
+#ifdef CONFIG_X86
+#define __read_mostly __attribute__((__section__(".data.read_mostly")))
+#else
+#define __read_mostly
+#endif
+
 #ifndef ____cacheline_aligned
 #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES)))
 #endif
diff --git a/kernel/profile.c b/kernel/profile.c
index ad8cbb75ffa2..f89248e6d704 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -35,11 +35,11 @@ struct profile_hit {
 #define NR_PROFILE_GRP		(NR_PROFILE_HIT/PROFILE_GRPSZ)
 
 /* Oprofile timer tick hook */
-int (*timer_hook)(struct pt_regs *);
+int (*timer_hook)(struct pt_regs *) __read_mostly;
 
 static atomic_t *prof_buffer;
 static unsigned long prof_len, prof_shift;
-static int prof_on;
+static int prof_on __read_mostly;
 static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 04d664377f2c..10bed1c8c3c3 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -58,7 +58,7 @@ struct radix_tree_path {
 #define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
 #define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2)
 
-static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH];
+static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH] __read_mostly;
 
 /*
  * Radix tree node cache.
-- 
cgit v1.2.3-59-g8ed1b


From 1ce88cf466f7b6078b14d67d186a3d7c19dd5609 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Thu, 7 Jul 2005 17:57:24 -0700
Subject: [PATCH] namespace.c: fix race in mark_mounts_for_expiry()

This patch fixes a race found by Ram in mark_mounts_for_expiry() in
fs/namespace.c.

The bug can only be triggered with simultaneous exiting of a process having
a private namespace, and expiry of a mount from within that namespace.
It's practically impossible to trigger, and I haven't even tried.  But
still, a bug is a bug.

The race happens when put_namespace() is called by another task, while
mark_mounts_for_expiry() is between atomic_read() and get_namespace().  In
that case get_namespace() will be called on an already dead namespace with
unforeseeable results.

The solution was suggested by Al Viro, in his own words:

      Instead of screwing with atomic_read() in there, why don't we
      simply do the following:
      	a) atomic_dec_and_lock() in put_namespace()
      	b) __put_namespace() called without dropping lock
      	c) the first thing done by __put_namespace would be
      struct vfsmount *root = namespace->root;
      namespace->root = NULL;
      spin_unlock(...);
      ....
      umount_tree(root);
      ...
      	d) check in mark_... would be simply namespace && namespace->root.

      And we are all set; no screwing around with atomic_read(), no magic
      at all.  Dying namespace gets NULL ->root.
      All changes of ->root happen under spinlock.
      If under a spinlock we see non-NULL ->mnt_namespace, it won't be
      freed until we drop the lock (we will set ->mnt_namespace to NULL
      under that lock before we get to freeing namespace).
      If under a spinlock we see non-NULL ->mnt_namespace and
      ->mnt_namespace->root, we can grab a reference to namespace and be
      sure that it won't go away.

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Acked-by: Al Viro <viro@parcelfarce.linux.theplanet.co.uk>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c            | 7 +++++--
 include/linux/namespace.h | 3 ++-
 2 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index a0d0ef1f1a48..9d17541ebafa 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -869,7 +869,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 		/* don't do anything if the namespace is dead - all the
 		 * vfsmounts from it are going away anyway */
 		namespace = mnt->mnt_namespace;
-		if (!namespace || atomic_read(&namespace->count) <= 0)
+		if (!namespace || !namespace->root)
 			continue;
 		get_namespace(namespace);
 
@@ -1450,9 +1450,12 @@ void __init mnt_init(unsigned long mempages)
 
 void __put_namespace(struct namespace *namespace)
 {
+	struct vfsmount *root = namespace->root;
+	namespace->root = NULL;
+	spin_unlock(&vfsmount_lock);
 	down_write(&namespace->sem);
 	spin_lock(&vfsmount_lock);
-	umount_tree(namespace->root);
+	umount_tree(root);
 	spin_unlock(&vfsmount_lock);
 	up_write(&namespace->sem);
 	kfree(namespace);
diff --git a/include/linux/namespace.h b/include/linux/namespace.h
index 697991b69f9b..0e5a86f13b2f 100644
--- a/include/linux/namespace.h
+++ b/include/linux/namespace.h
@@ -17,7 +17,8 @@ extern void __put_namespace(struct namespace *namespace);
 
 static inline void put_namespace(struct namespace *namespace)
 {
-	if (atomic_dec_and_test(&namespace->count))
+	if (atomic_dec_and_lock(&namespace->count, &vfsmount_lock))
+		/* releases vfsmount_lock */
 		__put_namespace(namespace);
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 55e700b924f9e0ba24e3a071d1097d050b05abe6 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Thu, 7 Jul 2005 17:57:30 -0700
Subject: [PATCH] namespace: rename mnt_fslink to mnt_expire

This patch renames vfsmount->mnt_fslink to something a little more
descriptive: vfsmount->mnt_expire.

Signed-off-by: Mike Waychison <michael.waychison@sun.com>
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c        | 24 ++++++++++++------------
 include/linux/mount.h |  2 +-
 2 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namespace.c b/fs/namespace.c
index b168dc37eaab..587eb0d707ee 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -61,7 +61,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_child);
 		INIT_LIST_HEAD(&mnt->mnt_mounts);
 		INIT_LIST_HEAD(&mnt->mnt_list);
-		INIT_LIST_HEAD(&mnt->mnt_fslink);
+		INIT_LIST_HEAD(&mnt->mnt_expire);
 		if (name) {
 			int size = strlen(name)+1;
 			char *newname = kmalloc(size, GFP_KERNEL);
@@ -165,8 +165,8 @@ clone_mnt(struct vfsmount *old, struct dentry *root)
 		/* stick the duplicate mount on the same expiry list
 		 * as the original if that was on one */
 		spin_lock(&vfsmount_lock);
-		if (!list_empty(&old->mnt_fslink))
-			list_add(&mnt->mnt_fslink, &old->mnt_fslink);
+		if (!list_empty(&old->mnt_expire))
+			list_add(&mnt->mnt_expire, &old->mnt_expire);
 		spin_unlock(&vfsmount_lock);
 	}
 	return mnt;
@@ -351,7 +351,7 @@ static void umount_tree(struct vfsmount *mnt)
 	while (!list_empty(&kill)) {
 		mnt = list_entry(kill.next, struct vfsmount, mnt_list);
 		list_del_init(&mnt->mnt_list);
-		list_del_init(&mnt->mnt_fslink);
+		list_del_init(&mnt->mnt_expire);
 		if (mnt->mnt_parent == mnt) {
 			spin_unlock(&vfsmount_lock);
 		} else {
@@ -645,7 +645,7 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
 	if (mnt) {
 		/* stop bind mounts from expiring */
 		spin_lock(&vfsmount_lock);
-		list_del_init(&mnt->mnt_fslink);
+		list_del_init(&mnt->mnt_expire);
 		spin_unlock(&vfsmount_lock);
 
 		err = graft_tree(mnt, nd);
@@ -744,7 +744,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name)
 
 	/* if the mount is moved, it should no longer be expire
 	 * automatically */
-	list_del_init(&old_nd.mnt->mnt_fslink);
+	list_del_init(&old_nd.mnt->mnt_expire);
 out2:
 	spin_unlock(&vfsmount_lock);
 out1:
@@ -814,7 +814,7 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
 	if (err == 0 && fslist) {
 		/* add to the specified expiration list */
 		spin_lock(&vfsmount_lock);
-		list_add_tail(&newmnt->mnt_fslink, fslist);
+		list_add_tail(&newmnt->mnt_expire, fslist);
 		spin_unlock(&vfsmount_lock);
 	}
 
@@ -869,7 +869,7 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts)
 		 * Someone brought it back to life whilst we didn't have any
 		 * locks held so return it to the expiration list
 		 */
-		list_add_tail(&mnt->mnt_fslink, mounts);
+		list_add_tail(&mnt->mnt_expire, mounts);
 		spin_unlock(&vfsmount_lock);
 	}
 }
@@ -896,13 +896,13 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 	 * - still marked for expiry (marked on the last call here; marks are
 	 *   cleared by mntput())
 	 */
-	list_for_each_entry_safe(mnt, next, mounts, mnt_fslink) {
+	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
 		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
 		    atomic_read(&mnt->mnt_count) != 1)
 			continue;
 
 		mntget(mnt);
-		list_move(&mnt->mnt_fslink, &graveyard);
+		list_move(&mnt->mnt_expire, &graveyard);
 	}
 
 	/*
@@ -912,8 +912,8 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 	 * - dispose of the corpse
 	 */
 	while (!list_empty(&graveyard)) {
-		mnt = list_entry(graveyard.next, struct vfsmount, mnt_fslink);
-		list_del_init(&mnt->mnt_fslink);
+		mnt = list_entry(graveyard.next, struct vfsmount, mnt_expire);
+		list_del_init(&mnt->mnt_expire);
 
 		/* don't do anything if the namespace is dead - all the
 		 * vfsmounts from it are going away anyway */
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 8b8d3b9beefd..196d2d6de4a3 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -34,7 +34,7 @@ struct vfsmount
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	char *mnt_devname;		/* Name of device e.g. /dev/dsk/hda1 */
 	struct list_head mnt_list;
-	struct list_head mnt_fslink;	/* link in fs-specific expiry list */
+	struct list_head mnt_expire;	/* link in fs-specific expiry list */
 	struct namespace *mnt_namespace; /* containing namespace */
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 751c404b8f63e8199d5f2f8f2bcfd69b41d11caa Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Thu, 7 Jul 2005 17:57:30 -0700
Subject: [PATCH] namespace: rename _mntput to mntput_no_expire

This patch renames _mntput() to something a little more descriptive:
mntput_no_expire().

Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namei.c            | 2 +-
 include/linux/mount.h | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index fa8df81ce8ca..1d93cb4f7c5f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -314,7 +314,7 @@ void path_release(struct nameidata *nd)
 void path_release_on_umount(struct nameidata *nd)
 {
 	dput(nd->dentry);
-	_mntput(nd->mnt);
+	mntput_no_expire(nd->mnt);
 }
 
 /*
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 196d2d6de4a3..74b4727a4e30 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -47,7 +47,7 @@ static inline struct vfsmount *mntget(struct vfsmount *mnt)
 
 extern void __mntput(struct vfsmount *mnt);
 
-static inline void _mntput(struct vfsmount *mnt)
+static inline void mntput_no_expire(struct vfsmount *mnt)
 {
 	if (mnt) {
 		if (atomic_dec_and_test(&mnt->mnt_count))
@@ -59,7 +59,7 @@ static inline void mntput(struct vfsmount *mnt)
 {
 	if (mnt) {
 		mnt->mnt_expiry_mark = 0;
-		_mntput(mnt);
+		mntput_no_expire(mnt);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From a6ccbbb8865101d83c2e716f08feae1da1c48584 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 7 Jul 2005 17:59:11 -0700
Subject: [PATCH] nfsd4: fix sync'ing of recovery directory

We need to fsync the recovery directory after writing to it, but we weren't
doing this correctly.  (For example, we weren't taking the i_sem when calling
->fsync().)

Just reuse the existing nfsd fsync code instead.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4recover.c     | 29 ++++++++---------------------
 fs/nfsd/vfs.c             |  2 +-
 include/linux/nfsd/nfsd.h |  1 +
 3 files changed, 10 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 095f1740f3ae..bb40083b6b7d 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -119,25 +119,12 @@ out:
 	return status;
 }
 
-static int
-nfsd4_rec_fsync(struct dentry *dentry)
+static void
+nfsd4_sync_rec_dir(void)
 {
-	struct file *filp;
-	int status = nfs_ok;
-
-	dprintk("NFSD: nfs4_fsync_rec_dir\n");
-	filp = dentry_open(dget(dentry), mntget(rec_dir.mnt), O_RDWR);
-	if (IS_ERR(filp)) {
-		status = PTR_ERR(filp);
-		goto out;
-	}
-	if (filp->f_op && filp->f_op->fsync)
-		status = filp->f_op->fsync(filp, filp->f_dentry, 0);
-	fput(filp);
-out:
-	if (status)
-		printk("nfsd4: unable to sync recovery directory\n");
-	return status;
+	down(&rec_dir.dentry->d_inode->i_sem);
+	nfsd_sync_dir(rec_dir.dentry);
+	up(&rec_dir.dentry->d_inode->i_sem);
 }
 
 int
@@ -176,7 +163,7 @@ out_unlock:
 	up(&rec_dir.dentry->d_inode->i_sem);
 	if (status == 0) {
 		clp->cl_firststate = 1;
-		status = nfsd4_rec_fsync(rec_dir.dentry);
+		nfsd4_sync_rec_dir();
 	}
 	nfs4_reset_user(uid, gid);
 	dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status);
@@ -331,7 +318,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
 	status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
 	nfs4_reset_user(uid, gid);
 	if (status == 0)
-		status = nfsd4_rec_fsync(rec_dir.dentry);
+		nfsd4_sync_rec_dir();
 	if (status)
 		printk("NFSD: Failed to remove expired client state directory"
 				" %.*s\n", HEXDIR_LEN, clp->cl_recdir);
@@ -362,7 +349,7 @@ nfsd4_recdir_purge_old(void) {
 		return;
 	status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old);
 	if (status == 0)
-		status = nfsd4_rec_fsync(rec_dir.dentry);
+		nfsd4_sync_rec_dir();
 	if (status)
 		printk("nfsd4: failed to purge old clients from recovery"
 			" directory %s\n", rec_dir.dentry->d_name.name);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index be24ead89d94..5e0bf3917607 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -733,7 +733,7 @@ nfsd_sync(struct file *filp)
 	up(&inode->i_sem);
 }
 
-static void
+void
 nfsd_sync_dir(struct dentry *dp)
 {
 	nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 5791dfd30dd0..c2da1b62d416 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -124,6 +124,7 @@ int		nfsd_statfs(struct svc_rqst *, struct svc_fh *,
 
 int		nfsd_notify_change(struct inode *, struct iattr *);
 int		nfsd_permission(struct svc_export *, struct dentry *, int);
+void		nfsd_sync_dir(struct dentry *dp);
 
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
 #ifdef CONFIG_NFSD_V2_ACL
-- 
cgit v1.2.3-59-g8ed1b


From 7fb64cee34f5dc743f697041717cafda8a94b5ac Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 7 Jul 2005 17:59:20 -0700
Subject: [PATCH] nfsd4: seqid comments

Add some comments on the use of so_seqid, in an attempt to avoid some of the
confusion outlined in the previous patch....

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4xdr.c          | 8 ++++----
 include/linux/nfsd/state.h | 4 +++-
 2 files changed, 7 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 5207068cde1a..1515c5b8096f 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1210,10 +1210,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 	save = resp->p;
 
 /*
- * Routine for encoding the result of a
- * "seqid-mutating" NFSv4 operation.  This is
- * where seqids are incremented, and the
- * replay cache is filled.
+ * Routine for encoding the result of a "seqid-mutating" NFSv4 operation.  This
+ * is where sequence id's are incremented, and the replay cache is filled.
+ * Note that we increment sequence id's here, at the last moment, so we're sure
+ * we know whether the error to be returned is a sequence id mutating error.
  */
 
 #define ENCODE_SEQID_OP_TAIL(stateowner) do {			\
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index a84a3fa99be1..2d19431f47ea 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -203,7 +203,9 @@ struct nfs4_stateowner {
 	int			so_is_open_owner; /* 1=openowner,0=lockowner */
 	u32                     so_id;
 	struct nfs4_client *    so_client;
-	u32                     so_seqid;    
+	/* after increment in ENCODE_SEQID_OP_TAIL, represents the next
+	 * sequence id expected from the client: */
+	u32                     so_seqid;
 	struct xdr_netobj       so_owner;     /* open owner name */
 	int                     so_confirmed; /* successful OPEN_CONFIRM? */
 	struct nfs4_replay	so_replay;
-- 
cgit v1.2.3-59-g8ed1b


From b700949b781480819e53bdc38a53f053226dd75e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 7 Jul 2005 17:59:23 -0700
Subject: [PATCH] nfsd4: return better error on io incompatible with open mode

from RFC 3530:
"Share reservations are established by OPEN operations and by their
nature are mandatory in that when the OPEN denies READ or WRITE
operations, that denial results in such operations being rejected
with error NFS4ERR_LOCKED."

(Note that share_denied is really only a legal error for OPEN.)

Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4state.c       | 2 +-
 include/linux/nfsd/nfsd.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b96714ae3dd7..3647c942915e 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1295,7 +1295,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
 	fp = find_file(ino);
 	if (!fp)
 		return nfs_ok;
-	ret = nfserr_share_denied;
+	ret = nfserr_locked;
 	/* Search for conflicting share reservations */
 	list_for_each_entry(stp, &fp->fi_stateids, st_perfile) {
 		if (test_bit(deny_type, &stp->st_deny_bmap) ||
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index c2da1b62d416..6d5a24f3fc6d 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -231,6 +231,7 @@ void		nfsd_lockd_shutdown(void);
 #define	nfserr_reclaim_bad	__constant_htonl(NFSERR_RECLAIM_BAD)
 #define	nfserr_badname		__constant_htonl(NFSERR_BADNAME)
 #define	nfserr_cb_path_down	__constant_htonl(NFSERR_CB_PATH_DOWN)
+#define	nfserr_locked		__constant_htonl(NFSERR_LOCKED)
 
 /* error codes for internal use */
 /* if a request fails due to kmalloc failure, it gets dropped.
-- 
cgit v1.2.3-59-g8ed1b


From 4c4cd222ee329025840bc2f8cebf71d36c62440c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 7 Jul 2005 17:59:27 -0700
Subject: [PATCH] nfsd4: check lock type against openmode.

We shouldn't be allowing, e.g., write locks on files not open for write.  To
enforce this, we add a pointer from the lock stateid back to the open stateid
it came from, so that the check will continue to be correct even after the
open is upgraded or downgraded.

Signed-off-by: Andy Adamson <andros@citi.umich.edu>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfsd/nfs4state.c        | 49 +++++++++++++++++++++++++++++++---------------
 include/linux/nfsd/state.h |  5 +++++
 2 files changed, 38 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 59b214f01b6d..b83f8fb441e1 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1160,6 +1160,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
 	stp->st_deny_bmap = 0;
 	__set_bit(open->op_share_access, &stp->st_access_bmap);
 	__set_bit(open->op_share_deny, &stp->st_deny_bmap);
+	stp->st_openstp = NULL;
 }
 
 static void
@@ -2158,12 +2159,18 @@ out:
 	return status;
 }
 
+static inline int
+setlkflg (int type)
+{
+	return (type == NFS4_READW_LT || type == NFS4_READ_LT) ?
+		RD_STATE : WR_STATE;
+}
 
 /* 
  * Checks for sequence id mutating operations. 
  */
 static int
-nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, clientid_t *lockclid)
+nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
 {
 	struct nfs4_stateid *stp;
 	struct nfs4_stateowner *sop;
@@ -2201,21 +2208,31 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
 		goto check_replay;
 	}
 
-	/* for new lock stateowners:
-	 * check that the lock->v.new.open_stateid
-	 * refers to an open stateowner
-	 *
-	 * check that the lockclid (nfs4_lock->v.new.clientid) is the same
-	 * as the open_stateid->st_stateowner->so_client->clientid
-	 */
-	if (lockclid) {
+	if (lock) {
 		struct nfs4_stateowner *sop = stp->st_stateowner;
+		clientid_t *lockclid = &lock->v.new.clientid;
 		struct nfs4_client *clp = sop->so_client;
+		int lkflg = 0;
+		int status;
+
+		lkflg = setlkflg(lock->lk_type);
+
+		if (lock->lk_is_new) {
+                       if (!sop->so_is_open_owner)
+			       return nfserr_bad_stateid;
+                       if (!cmp_clid(&clp->cl_clientid, lockclid))
+			       return nfserr_bad_stateid;
+                       /* stp is the open stateid */
+                       status = nfs4_check_openmode(stp, lkflg);
+                       if (status)
+			       return status;
+               } else {
+                       /* stp is the lock stateid */
+                       status = nfs4_check_openmode(stp->st_openstp, lkflg);
+                       if (status)
+			       return status;
+               }
 
-		if (!sop->so_is_open_owner)
-			return nfserr_bad_stateid;
-		if (!cmp_clid(&clp->cl_clientid, lockclid))
-			return nfserr_bad_stateid;
 	}
 
 	if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp)) {
@@ -2642,6 +2659,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc
 	stp->st_vfs_file = open_stp->st_vfs_file; /* FIXME refcount?? */
 	stp->st_access_bmap = open_stp->st_access_bmap;
 	stp->st_deny_bmap = open_stp->st_deny_bmap;
+	stp->st_openstp = open_stp;
 
 out:
 	return stp;
@@ -2697,8 +2715,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 				        lock->lk_new_open_seqid,
 		                        &lock->lk_new_open_stateid,
 		                        CHECK_FH | OPEN_STATE,
-		                        &open_sop, &open_stp,
-					&lock->v.new.clientid);
+		                        &open_sop, &open_stp, lock);
 		if (status)
 			goto out;
 		/* create lockowner and lock stateid */
@@ -2726,7 +2743,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock
 				       lock->lk_old_lock_seqid, 
 				       &lock->lk_old_lock_stateid, 
 				       CHECK_FH | LOCK_STATE, 
-				       &lock->lk_stateowner, &lock_stp, NULL);
+				       &lock->lk_stateowner, &lock_stp, lock);
 		if (status)
 			goto out;
 	}
diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h
index 2d19431f47ea..8bf23cf8b603 100644
--- a/include/linux/nfsd/state.h
+++ b/include/linux/nfsd/state.h
@@ -237,6 +237,10 @@ struct nfs4_file {
 *       st_perlockowner: (open stateid) list of lock nfs4_stateowners
 * 	st_access_bmap: used only for open stateid
 * 	st_deny_bmap: used only for open stateid
+*	st_openstp: open stateid lock stateid was derived from
+*
+* XXX: open stateids and lock stateids have diverged sufficiently that
+* we should consider defining separate structs for the two cases.
 */
 
 struct nfs4_stateid {
@@ -250,6 +254,7 @@ struct nfs4_stateid {
 	struct file                 * st_vfs_file;
 	unsigned long                 st_access_bmap;
 	unsigned long                 st_deny_bmap;
+	struct nfs4_stateid         * st_openstp;
 };
 
 /* flags for preprocess_seqid_op() */
-- 
cgit v1.2.3-59-g8ed1b


From 86a76caf8705e3524e15f343f3c4806939a06dc8 Mon Sep 17 00:00:00 2001
From: Victor Fusco <victor@cetuc.puc-rio.br>
Date: Fri, 8 Jul 2005 14:57:47 -0700
Subject: [NET]: Fix sparse warnings

From: Victor Fusco <victor@cetuc.puc-rio.br>

Fix the sparse warning "implicit cast to nocast type"

Signed-off-by: Victor Fusco <victor@cetuc.puc-rio.br>
Signed-off-by: Domen Puncer <domen@coderock.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 29 ++++++++++++++++++-----------
 include/net/sock.h     | 18 +++++++++++-------
 include/net/tcp.h      |  3 ++-
 net/core/dev.c         |  2 +-
 net/core/skbuff.c      | 17 ++++++++++-------
 net/core/sock.c        | 11 +++++++----
 net/ipv4/tcp_output.c  |  2 +-
 7 files changed, 50 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 14b950413495..5d4a990d5577 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -300,20 +300,26 @@ struct sk_buff {
 #include <asm/system.h>
 
 extern void	       __kfree_skb(struct sk_buff *skb);
-extern struct sk_buff *alloc_skb(unsigned int size, int priority);
+extern struct sk_buff *alloc_skb(unsigned int size,
+				 unsigned int __nocast priority);
 extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
-					    unsigned int size, int priority);
+					    unsigned int size,
+					    unsigned int __nocast priority);
 extern void	       kfree_skbmem(struct sk_buff *skb);
-extern struct sk_buff *skb_clone(struct sk_buff *skb, int priority);
-extern struct sk_buff *skb_copy(const struct sk_buff *skb, int priority);
-extern struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask);
+extern struct sk_buff *skb_clone(struct sk_buff *skb,
+				 unsigned int __nocast priority);
+extern struct sk_buff *skb_copy(const struct sk_buff *skb,
+				unsigned int __nocast priority);
+extern struct sk_buff *pskb_copy(struct sk_buff *skb,
+				 unsigned int __nocast gfp_mask);
 extern int	       pskb_expand_head(struct sk_buff *skb,
-					int nhead, int ntail, int gfp_mask);
+					int nhead, int ntail,
+					unsigned int __nocast gfp_mask);
 extern struct sk_buff *skb_realloc_headroom(struct sk_buff *skb,
 					    unsigned int headroom);
 extern struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
 				       int newheadroom, int newtailroom,
-				       int priority);
+				       unsigned int __nocast priority);
 extern struct sk_buff *		skb_pad(struct sk_buff *skb, int pad);
 #define dev_kfree_skb(a)	kfree_skb(a)
 extern void	      skb_over_panic(struct sk_buff *skb, int len,
@@ -464,7 +470,8 @@ static inline int skb_shared(const struct sk_buff *skb)
  *
  *	NULL is returned on a memory allocation failure.
  */
-static inline struct sk_buff *skb_share_check(struct sk_buff *skb, int pri)
+static inline struct sk_buff *skb_share_check(struct sk_buff *skb,
+					      unsigned int __nocast pri)
 {
 	might_sleep_if(pri & __GFP_WAIT);
 	if (skb_shared(skb)) {
@@ -1001,7 +1008,7 @@ static inline void __skb_queue_purge(struct sk_buff_head *list)
  *	%NULL is returned in there is no free memory.
  */
 static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
-					      int gfp_mask)
+					      unsigned int __nocast gfp_mask)
 {
 	struct sk_buff *skb = alloc_skb(length + 16, gfp_mask);
 	if (likely(skb))
@@ -1114,8 +1121,8 @@ static inline int skb_can_coalesce(struct sk_buff *skb, int i,
  *	If there is no free memory -ENOMEM is returned, otherwise zero
  *	is returned and the old skb data released.
  */
-extern int __skb_linearize(struct sk_buff *skb, int gfp);
-static inline int skb_linearize(struct sk_buff *skb, int gfp)
+extern int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp);
+static inline int skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp)
 {
 	return __skb_linearize(skb, gfp);
 }
diff --git a/include/net/sock.h b/include/net/sock.h
index 7b76f891ae2d..a1042d08becd 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -684,16 +684,17 @@ extern void FASTCALL(release_sock(struct sock *sk));
 #define bh_lock_sock(__sk)	spin_lock(&((__sk)->sk_lock.slock))
 #define bh_unlock_sock(__sk)	spin_unlock(&((__sk)->sk_lock.slock))
 
-extern struct sock		*sk_alloc(int family, int priority,
+extern struct sock		*sk_alloc(int family,
+					  unsigned int __nocast priority,
 					  struct proto *prot, int zero_it);
 extern void			sk_free(struct sock *sk);
 
 extern struct sk_buff		*sock_wmalloc(struct sock *sk,
 					      unsigned long size, int force,
-					      int priority);
+					      unsigned int __nocast priority);
 extern struct sk_buff		*sock_rmalloc(struct sock *sk,
 					      unsigned long size, int force,
-					      int priority);
+					      unsigned int __nocast priority);
 extern void			sock_wfree(struct sk_buff *skb);
 extern void			sock_rfree(struct sk_buff *skb);
 
@@ -708,7 +709,8 @@ extern struct sk_buff 		*sock_alloc_send_skb(struct sock *sk,
 						     unsigned long size,
 						     int noblock,
 						     int *errcode);
-extern void *sock_kmalloc(struct sock *sk, int size, int priority);
+extern void *sock_kmalloc(struct sock *sk, int size,
+			  unsigned int __nocast priority);
 extern void sock_kfree_s(struct sock *sk, void *mem, int size);
 extern void sk_send_sigurg(struct sock *sk);
 
@@ -1132,7 +1134,8 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
 }
 
 static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk,
-						   int size, int mem, int gfp)
+						   int size, int mem,
+						   unsigned int __nocast gfp)
 {
 	struct sk_buff *skb;
 	int hdr_len;
@@ -1155,7 +1158,8 @@ static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk,
 }
 
 static inline struct sk_buff *sk_stream_alloc_skb(struct sock *sk,
-						  int size, int gfp)
+						  int size,
+						  unsigned int __nocast gfp)
 {
 	return sk_stream_alloc_pskb(sk, size, 0, gfp);
 }
@@ -1188,7 +1192,7 @@ static inline int sock_writeable(const struct sock *sk)
 	return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf / 2);
 }
 
-static inline int gfp_any(void)
+static inline unsigned int __nocast gfp_any(void)
 {
 	return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
 }
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4d5b12e4dc11..f4f9aba07ac2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -860,7 +860,8 @@ extern void tcp_send_probe0(struct sock *);
 extern void tcp_send_partial(struct sock *);
 extern int  tcp_write_wakeup(struct sock *);
 extern void tcp_send_fin(struct sock *sk);
-extern void tcp_send_active_reset(struct sock *sk, int priority);
+extern void tcp_send_active_reset(struct sock *sk,
+                                  unsigned int __nocast priority);
 extern int  tcp_send_synack(struct sock *);
 extern void tcp_push_one(struct sock *, unsigned int mss_now);
 extern void tcp_send_ack(struct sock *sk);
diff --git a/net/core/dev.c b/net/core/dev.c
index 7f5f62c65115..ff9dc029233a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1127,7 +1127,7 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 extern void skb_release_data(struct sk_buff *);
 
 /* Keep head the same: replace data */
-int __skb_linearize(struct sk_buff *skb, int gfp_mask)
+int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
 	unsigned int size;
 	u8 *data;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 733deee24b9f..d9f7b06fe886 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -129,7 +129,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  *	Buffers may only be allocated from interrupts using a @gfp_mask of
  *	%GFP_ATOMIC.
  */
-struct sk_buff *alloc_skb(unsigned int size, int gfp_mask)
+struct sk_buff *alloc_skb(unsigned int size, unsigned int __nocast gfp_mask)
 {
 	struct sk_buff *skb;
 	u8 *data;
@@ -182,7 +182,8 @@ nodata:
  *	%GFP_ATOMIC.
  */
 struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
-				     unsigned int size, int gfp_mask)
+				     unsigned int size,
+				     unsigned int __nocast gfp_mask)
 {
 	struct sk_buff *skb;
 	u8 *data;
@@ -322,7 +323,7 @@ void __kfree_skb(struct sk_buff *skb)
  *	%GFP_ATOMIC.
  */
 
-struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
+struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
 	struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
 
@@ -460,7 +461,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
  *	header is going to be modified. Use pskb_copy() instead.
  */
 
-struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask)
+struct sk_buff *skb_copy(const struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
 	int headerlen = skb->data - skb->head;
 	/*
@@ -499,7 +500,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask)
  *	The returned buffer has a reference count of 1.
  */
 
-struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask)
+struct sk_buff *pskb_copy(struct sk_buff *skb, unsigned int __nocast gfp_mask)
 {
 	/*
 	 *	Allocate the copy buffer
@@ -557,7 +558,8 @@ out:
  *	reloaded after call to this function.
  */
 
-int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask)
+int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
+		     unsigned int __nocast gfp_mask)
 {
 	int i;
 	u8 *data;
@@ -647,7 +649,8 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
  *	only by netfilter in the cases when checksum is recalculated? --ANK
  */
 struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
-				int newheadroom, int newtailroom, int gfp_mask)
+				int newheadroom, int newtailroom,
+				unsigned int __nocast gfp_mask)
 {
 	/*
 	 *	Allocate the copy buffer
diff --git a/net/core/sock.c b/net/core/sock.c
index a6ec3ada7f9e..8b35ccdc2b3b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -622,7 +622,8 @@ lenout:
  *	@prot: struct proto associated with this new sock instance
  *	@zero_it: if we should zero the newly allocated sock
  */
-struct sock *sk_alloc(int family, int priority, struct proto *prot, int zero_it)
+struct sock *sk_alloc(int family, unsigned int __nocast priority,
+		      struct proto *prot, int zero_it)
 {
 	struct sock *sk = NULL;
 	kmem_cache_t *slab = prot->slab;
@@ -750,7 +751,8 @@ unsigned long sock_i_ino(struct sock *sk)
 /*
  * Allocate a skb from the socket's send buffer.
  */
-struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority)
+struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
+			     unsigned int __nocast priority)
 {
 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
 		struct sk_buff * skb = alloc_skb(size, priority);
@@ -765,7 +767,8 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int
 /*
  * Allocate a skb from the socket's receive buffer.
  */ 
-struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority)
+struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
+			     unsigned int __nocast priority)
 {
 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
 		struct sk_buff *skb = alloc_skb(size, priority);
@@ -780,7 +783,7 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int
 /* 
  * Allocate a memory block from the socket's option memory buffer.
  */ 
-void *sock_kmalloc(struct sock *sk, int size, int priority)
+void *sock_kmalloc(struct sock *sk, int size, unsigned int __nocast priority)
 {
 	if ((unsigned)size <= sysctl_optmem_max &&
 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e041d057ec86..e3f8ea1bfa9c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1613,7 +1613,7 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, int priority)
+void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-- 
cgit v1.2.3-59-g8ed1b


From ca9b907d140a5f249250d19f956129dbbbf84f73 Mon Sep 17 00:00:00 2001
From: David L Stevens <dlstevens@us.ibm.com>
Date: Fri, 8 Jul 2005 17:38:07 -0700
Subject: [IPV4]: multicast API "join" issues

        This patch corrects a few problems with the IP_ADD_MEMBERSHIP
socket option:

1) The existing code makes an attempt at reference counting joins when
   using the ip_mreqn/imr_ifindex interface. Joining the same group
   on the same socket is an error, whatever the API. This leads to
   unexpected results when mixing ip_mreqn by index with ip_mreqn by
   address, ip_mreq, or other APIs. For example, ip_mreq followed by
   ip_mreqn of the same group will "work" while the same two reversed
   will not.
           Fixed to always return EADDRINUSE on a duplicate join and
   removed the (now unused) reference count in ip_mc_socklist.

2) The group-search list in ip_mc_join_group() is comparing a full
   ip_mreqn structure and all of it must match for it to find the
   group. This doesn't correctly match a group that was joined with
   ip_mreq or ip_mreqn with an address (with or without an index). It
   also doesn't match groups that are joined by different addresses on
   the same interface. All of these are the same multicast group,
   which is identified by group address and interface index.
           Fixed the check to correctly match groups so we don't get
   duplicate group entries on the ip_mc_socklist.

3) The old code allocates a multicast address entry before searching
   for duplicates, requiring it to be freed in various error cases.
   This patch moves the allocation until after the search and the
   igmp_max_memberships check, so there is never a need to allocate
   and then free an entry.

Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/igmp.h |  1 -
 net/ipv4/igmp.c      | 35 ++++++++++++-----------------------
 2 files changed, 12 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/igmp.h b/include/linux/igmp.h
index 390e760a96d3..0c31ef0b5bad 100644
--- a/include/linux/igmp.h
+++ b/include/linux/igmp.h
@@ -148,7 +148,6 @@ struct ip_sf_socklist
 struct ip_mc_socklist
 {
 	struct ip_mc_socklist	*next;
-	int			count;
 	struct ip_mreqn		multi;
 	unsigned int		sfmode;		/* MCAST_{INCLUDE,EXCLUDE} */
 	struct ip_sf_socklist	*sflist;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 1f3183168a90..111eb678cbac 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1615,9 +1615,10 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
 {
 	int err;
 	u32 addr = imr->imr_multiaddr.s_addr;
-	struct ip_mc_socklist *iml, *i;
+	struct ip_mc_socklist *iml=NULL, *i;
 	struct in_device *in_dev;
 	struct inet_sock *inet = inet_sk(sk);
+	int ifindex;
 	int count = 0;
 
 	if (!MULTICAST(addr))
@@ -1633,37 +1634,30 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
 		goto done;
 	}
 
-	iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
-
 	err = -EADDRINUSE;
+	ifindex = imr->imr_ifindex;
 	for (i = inet->mc_list; i; i = i->next) {
-		if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) {
-			/* New style additions are reference counted */
-			if (imr->imr_address.s_addr == 0) {
-				i->count++;
-				err = 0;
-			}
+		if (i->multi.imr_multiaddr.s_addr == addr &&
+		    i->multi.imr_ifindex == ifindex)
 			goto done;
-		}
 		count++;
 	}
 	err = -ENOBUFS;
-	if (iml == NULL || count >= sysctl_igmp_max_memberships)
+	if (count >= sysctl_igmp_max_memberships)
+		goto done;
+	iml = (struct ip_mc_socklist *)sock_kmalloc(sk,sizeof(*iml),GFP_KERNEL);
+	if (iml == NULL)
 		goto done;
+
 	memcpy(&iml->multi, imr, sizeof(*imr));
 	iml->next = inet->mc_list;
-	iml->count = 1;
 	iml->sflist = NULL;
 	iml->sfmode = MCAST_EXCLUDE;
 	inet->mc_list = iml;
 	ip_mc_inc_group(in_dev, addr);
-	iml = NULL;
 	err = 0;
-
 done:
 	rtnl_shunlock();
-	if (iml)
-		sock_kfree_s(sk, iml, sizeof(*iml));
 	return err;
 }
 
@@ -1704,12 +1698,6 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
 			in_dev = inetdev_by_index(iml->multi.imr_ifindex);
 			if (in_dev)
 				(void) ip_mc_leave_src(sk, iml, in_dev);
-			if (--iml->count) {
-				rtnl_unlock();
-				if (in_dev)
-					in_dev_put(in_dev);
-				return 0;
-			}
 
 			*imlp = iml->next;
 
@@ -1755,7 +1743,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
 	err = -EADDRNOTAVAIL;
 
 	for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
-		if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0)
+		if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr
+		    && pmc->multi.imr_ifindex == imr.imr_ifindex)
 			break;
 	}
 	if (!pmc)		/* must have a prior join */
-- 
cgit v1.2.3-59-g8ed1b


From d369ddd2fc00fc3f46e9052d1017cbf407e3cdf7 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 10 Jul 2005 15:45:11 -0700
Subject: [SPARC64]: Add __read_mostly support.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc64/kernel/smp.c         | 10 +++++-----
 arch/sparc64/kernel/time.c        | 24 +++++++++---------------
 arch/sparc64/kernel/vmlinux.lds.S |  2 ++
 include/linux/cache.h             |  2 +-
 4 files changed, 17 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index e5b9c7a27789..441fc2e52ce6 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -45,8 +45,8 @@ extern void calibrate_delay(void);
 /* Please don't make this stuff initdata!!!  --DaveM */
 static unsigned char boot_cpu_id;
 
-cpumask_t cpu_online_map = CPU_MASK_NONE;
-cpumask_t phys_cpu_present_map = CPU_MASK_NONE;
+cpumask_t cpu_online_map __read_mostly = CPU_MASK_NONE;
+cpumask_t phys_cpu_present_map __read_mostly = CPU_MASK_NONE;
 static cpumask_t smp_commenced_mask;
 static cpumask_t cpu_callout_map;
 
@@ -155,7 +155,7 @@ void cpu_panic(void)
 	panic("SMP bolixed\n");
 }
 
-static unsigned long current_tick_offset;
+static unsigned long current_tick_offset __read_mostly;
 
 /* This tick register synchronization scheme is taken entirely from
  * the ia64 port, see arch/ia64/kernel/smpboot.c for details and credit.
@@ -1193,8 +1193,8 @@ void smp_send_stop(void)
 {
 }
 
-unsigned long __per_cpu_base;
-unsigned long __per_cpu_shift;
+unsigned long __per_cpu_base __read_mostly;
+unsigned long __per_cpu_shift __read_mostly;
 
 EXPORT_SYMBOL(__per_cpu_base);
 EXPORT_SYMBOL(__per_cpu_shift);
diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c
index b40db389f90b..362b9c26871b 100644
--- a/arch/sparc64/kernel/time.c
+++ b/arch/sparc64/kernel/time.c
@@ -73,7 +73,7 @@ static __initdata struct sparc64_tick_ops dummy_tick_ops = {
 	.get_tick	= dummy_get_tick,
 };
 
-struct sparc64_tick_ops *tick_ops = &dummy_tick_ops;
+struct sparc64_tick_ops *tick_ops __read_mostly = &dummy_tick_ops;
 
 #define TICK_PRIV_BIT	(1UL << 63)
 
@@ -195,7 +195,7 @@ static unsigned long tick_add_tick(unsigned long adj, unsigned long offset)
 	return new_tick;
 }
 
-static struct sparc64_tick_ops tick_operations = {
+static struct sparc64_tick_ops tick_operations __read_mostly = {
 	.init_tick	=	tick_init_tick,
 	.get_tick	=	tick_get_tick,
 	.get_compare	=	tick_get_compare,
@@ -276,7 +276,7 @@ static unsigned long stick_add_compare(unsigned long adj)
 	return new_compare;
 }
 
-static struct sparc64_tick_ops stick_operations = {
+static struct sparc64_tick_ops stick_operations __read_mostly = {
 	.init_tick	=	stick_init_tick,
 	.get_tick	=	stick_get_tick,
 	.get_compare	=	stick_get_compare,
@@ -422,7 +422,7 @@ static unsigned long hbtick_add_compare(unsigned long adj)
 	return val;
 }
 
-static struct sparc64_tick_ops hbtick_operations = {
+static struct sparc64_tick_ops hbtick_operations __read_mostly = {
 	.init_tick	=	hbtick_init_tick,
 	.get_tick	=	hbtick_get_tick,
 	.get_compare	=	hbtick_get_compare,
@@ -437,10 +437,9 @@ static struct sparc64_tick_ops hbtick_operations = {
  * NOTE: On SUN5 systems the ticker interrupt comes in using 2
  *       interrupts, one at level14 and one with softint bit 0.
  */
-unsigned long timer_tick_offset;
-unsigned long timer_tick_compare;
+unsigned long timer_tick_offset __read_mostly;
 
-static unsigned long timer_ticks_per_nsec_quotient;
+static unsigned long timer_ticks_per_nsec_quotient __read_mostly;
 
 #define TICK_SIZE (tick_nsec / 1000)
 
@@ -464,7 +463,7 @@ static inline void timer_check_rtc(void)
 
 static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs * regs)
 {
-	unsigned long ticks, pstate;
+	unsigned long ticks, compare, pstate;
 
 	write_seqlock(&xtime_lock);
 
@@ -483,14 +482,14 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs * regs)
 				     : "=r" (pstate)
 				     : "i" (PSTATE_IE));
 
-		timer_tick_compare = tick_ops->add_compare(timer_tick_offset);
+		compare = tick_ops->add_compare(timer_tick_offset);
 		ticks = tick_ops->get_tick();
 
 		/* Restore PSTATE_IE. */
 		__asm__ __volatile__("wrpr	%0, 0x0, %%pstate"
 				     : /* no outputs */
 				     : "r" (pstate));
-	} while (time_after_eq(ticks, timer_tick_compare));
+	} while (time_after_eq(ticks, compare));
 
 	timer_check_rtc();
 
@@ -506,11 +505,6 @@ void timer_tick_interrupt(struct pt_regs *regs)
 
 	do_timer(regs);
 
-	/*
-	 * Only keep timer_tick_offset uptodate, but don't set TICK_CMPR.
-	 */
-	timer_tick_compare = tick_ops->get_compare() + timer_tick_offset;
-
 	timer_check_rtc();
 
 	write_sequnlock(&xtime_lock);
diff --git a/arch/sparc64/kernel/vmlinux.lds.S b/arch/sparc64/kernel/vmlinux.lds.S
index 382fd6798bb9..950423da8a6a 100644
--- a/arch/sparc64/kernel/vmlinux.lds.S
+++ b/arch/sparc64/kernel/vmlinux.lds.S
@@ -32,6 +32,8 @@ SECTIONS
   .data1   : { *(.data1) }
   . = ALIGN(64);
   .data.cacheline_aligned : { *(.data.cacheline_aligned) }
+  . = ALIGN(64);
+  .data.read_mostly : { *(.data.read_mostly) }
   _edata  =  .;
   PROVIDE (edata = .);
   .fixup   : { *(.fixup) }
diff --git a/include/linux/cache.h b/include/linux/cache.h
index 2b66a36d85f0..f6b5a46c5f82 100644
--- a/include/linux/cache.h
+++ b/include/linux/cache.h
@@ -13,7 +13,7 @@
 #define SMP_CACHE_BYTES L1_CACHE_BYTES
 #endif
 
-#ifdef CONFIG_X86
+#if defined(CONFIG_X86) || defined(CONFIG_SPARC64)
 #define __read_mostly __attribute__((__section__(".data.read_mostly")))
 #else
 #define __read_mostly
-- 
cgit v1.2.3-59-g8ed1b


From f7ceba360cce9af3fbc4e5a5b1bd40b570b7021c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 10 Jul 2005 19:29:45 -0700
Subject: [SPARC64]: Add syscall auditing support.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc64/kernel/entry.S       | 10 +++++-----
 arch/sparc64/kernel/ptrace.c      | 32 +++++++++++++++++++++++++++++---
 include/asm-sparc64/thread_info.h |  8 +++++---
 include/linux/audit.h             |  2 +-
 init/Kconfig                      |  2 +-
 5 files changed, 41 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sparc64/kernel/entry.S b/arch/sparc64/kernel/entry.S
index 8b7ed760c50e..d781f10adc52 100644
--- a/arch/sparc64/kernel/entry.S
+++ b/arch/sparc64/kernel/entry.S
@@ -1552,7 +1552,7 @@ sys_ptrace:	add		%sp, PTREGS_OFF, %o0
 		nop
 		.align		32
 1:		ldx		[%curptr + TI_FLAGS], %l5
-		andcc		%l5, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP), %g0
+		andcc		%l5, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT), %g0
 		be,pt		%icc, rtrap
 		 clr		%l6
 		add		%sp, PTREGS_OFF, %o0
@@ -1679,7 +1679,7 @@ linux_sparc_syscall32:
 
 	srl		%i5, 0, %o5				! IEU1
 	srl		%i2, 0, %o2				! IEU0	Group
-	andcc		%l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP), %g0 ! IEU0	Group
+	andcc		%l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT), %g0
 	bne,pn		%icc, linux_syscall_trace32		! CTI
 	 mov		%i0, %l5				! IEU1
 	call		%l7					! CTI	Group brk forced
@@ -1702,7 +1702,7 @@ linux_sparc_syscall:
 
 	mov		%i3, %o3				! IEU1
 	mov		%i4, %o4				! IEU0	Group
-	andcc		%l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP), %g0 ! IEU1	Group+1 bubble
+	andcc		%l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT), %g0
 	bne,pn		%icc, linux_syscall_trace		! CTI	Group
 	 mov		%i0, %l5				! IEU0
 2:	call		%l7					! CTI	Group brk forced
@@ -1730,7 +1730,7 @@ ret_sys_call:
 1:
 	cmp		%o0, -ERESTART_RESTARTBLOCK
 	bgeu,pn		%xcc, 1f
-	 andcc		%l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP), %l6
+	 andcc		%l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT), %l6
 80:
 	/* System call success, clear Carry condition code. */
 	andn		%g3, %g2, %g3
@@ -1745,7 +1745,7 @@ ret_sys_call:
 	/* System call failure, set Carry condition code.
 	 * Also, get abs(errno) to return to the process.
 	 */
-	andcc		%l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP), %l6	
+	andcc		%l0, (_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT), %l6	
 	sub		%g0, %o0, %o0
 	or		%g3, %g2, %g3
 	stx		%o0, [%sp + PTREGS_OFF + PT_V9_I0]
diff --git a/arch/sparc64/kernel/ptrace.c b/arch/sparc64/kernel/ptrace.c
index c57dc9ea731b..23ad839d113f 100644
--- a/arch/sparc64/kernel/ptrace.c
+++ b/arch/sparc64/kernel/ptrace.c
@@ -19,6 +19,8 @@
 #include <linux/smp.h>
 #include <linux/smp_lock.h>
 #include <linux/security.h>
+#include <linux/seccomp.h>
+#include <linux/audit.h>
 #include <linux/signal.h>
 
 #include <asm/asi.h>
@@ -633,10 +635,22 @@ asmlinkage void syscall_trace(struct pt_regs *regs, int syscall_exit_p)
 	/* do the secure computing check first */
 	secure_computing(regs->u_regs[UREG_G1]);
 
-	if (!test_thread_flag(TIF_SYSCALL_TRACE))
-		return;
+	if (unlikely(current->audit_context) && syscall_exit_p) {
+		unsigned long tstate = regs->tstate;
+		int result = AUDITSC_SUCCESS;
+
+		if (unlikely(tstate & (TSTATE_XCARRY | TSTATE_ICARRY)))
+			result = AUDITSC_FAILURE;
+
+		audit_syscall_exit(current, result, regs->u_regs[UREG_I0]);
+	}
+
 	if (!(current->ptrace & PT_PTRACED))
-		return;
+		goto out;
+
+	if (!test_thread_flag(TIF_SYSCALL_TRACE))
+		goto out;
+
 	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
 				 ? 0x80 : 0));
 
@@ -649,4 +663,16 @@ asmlinkage void syscall_trace(struct pt_regs *regs, int syscall_exit_p)
 		send_sig(current->exit_code, current, 1);
 		current->exit_code = 0;
 	}
+
+out:
+	if (unlikely(current->audit_context) && !syscall_exit_p)
+		audit_syscall_entry(current,
+				    (test_thread_flag(TIF_32BIT) ?
+				     AUDIT_ARCH_SPARC :
+				     AUDIT_ARCH_SPARC64),
+				    regs->u_regs[UREG_G1],
+				    regs->u_regs[UREG_I0],
+				    regs->u_regs[UREG_I1],
+				    regs->u_regs[UREG_I2],
+				    regs->u_regs[UREG_I3]);
 }
diff --git a/include/asm-sparc64/thread_info.h b/include/asm-sparc64/thread_info.h
index 6b2fbb89bb63..a1d25c06f92a 100644
--- a/include/asm-sparc64/thread_info.h
+++ b/include/asm-sparc64/thread_info.h
@@ -221,7 +221,7 @@ register struct thread_info *current_thread_info_reg asm("g6");
 #define TIF_32BIT		7	/* 32-bit binary */
 #define TIF_NEWCHILD		8	/* just-spawned child process */
 #define TIF_SECCOMP		9	/* secure computing */
-#define TIF_POLLING_NRFLAG	10
+#define TIF_SYSCALL_AUDIT	10	/* syscall auditing active */
 #define TIF_SYSCALL_SUCCESS	11
 /* NOTE: Thread flags >= 12 should be ones we have no interest
  *       in using in assembly, else we can't use the mask as
@@ -229,6 +229,7 @@ register struct thread_info *current_thread_info_reg asm("g6");
  */
 #define TIF_ABI_PENDING		12
 #define TIF_MEMDIE		13
+#define TIF_POLLING_NRFLAG	14
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
@@ -240,9 +241,10 @@ register struct thread_info *current_thread_info_reg asm("g6");
 #define _TIF_32BIT		(1<<TIF_32BIT)
 #define _TIF_NEWCHILD		(1<<TIF_NEWCHILD)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
-#define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
-#define _TIF_ABI_PENDING	(1<<TIF_ABI_PENDING)
+#define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SYSCALL_SUCCESS	(1<<TIF_SYSCALL_SUCCESS)
+#define _TIF_ABI_PENDING	(1<<TIF_ABI_PENDING)
+#define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 
 #define _TIF_USER_WORK_MASK	((0xff << TI_FLAG_WSAVED_SHIFT) | \
 				 (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \
diff --git a/include/linux/audit.h b/include/linux/audit.h
index bf2ad3ba72eb..68aba0c02e49 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -165,7 +165,7 @@
 #define AUDIT_ARCH_SH64		(EM_SH|__AUDIT_ARCH_64BIT)
 #define AUDIT_ARCH_SHEL64	(EM_SH|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_SPARC	(EM_SPARC)
-#define AUDIT_ARCH_SPARC64	(EM_SPARC64|__AUDIT_ARCH_64BIT)
+#define AUDIT_ARCH_SPARC64	(EM_SPARCV9|__AUDIT_ARCH_64BIT)
 #define AUDIT_ARCH_V850		(EM_V850|__AUDIT_ARCH_LE)
 #define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
 
diff --git a/init/Kconfig b/init/Kconfig
index b1091d7542ce..75755ef50c89 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -174,7 +174,7 @@ config AUDIT
 
 config AUDITSYSCALL
 	bool "Enable system-call auditing support"
-	depends on AUDIT && (X86 || PPC || PPC64 || ARCH_S390 || IA64 || UML)
+	depends on AUDIT && (X86 || PPC || PPC64 || ARCH_S390 || IA64 || UML || SPARC64)
 	default y if SECURITY_SELINUX
 	help
 	  Enable low-overhead system-call auditing infrastructure that
-- 
cgit v1.2.3-59-g8ed1b


From 63522f7fdb624adef20cb9d90c7effcd5b6301b2 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 11 Jul 2005 14:29:11 -0700
Subject: [NETLINK]: Reserve NETLINK_NETFILTER.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 27e4d164a108..2f0c085f2c7d 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -16,6 +16,7 @@
 #define NETLINK_AUDIT		9	/* auditing */
 #define NETLINK_FIB_LOOKUP	10	
 #define NETLINK_ROUTE6		11	/* af_inet6 route comm channel */
+#define NETLINK_NETFILTER	12	/* netfilter subsystem */
 #define NETLINK_IP6_FW		13
 #define NETLINK_DNRTMSG		14	/* DECnet routing messages */
 #define NETLINK_KOBJECT_UEVENT	15	/* Kernel messages to userspace */
-- 
cgit v1.2.3-59-g8ed1b


From e2a5b420f716cd1a46674b1a90389612eced916f Mon Sep 17 00:00:00 2001
From: Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>
Date: Fri, 18 Mar 2005 16:20:46 -0500
Subject: [ACPI] ACPI poweroff fix

Register an "acpi" system device to be notified of shutdown preparation.
This depends on CONFIG_PM.

http://bugzilla.kernel.org/show_bug.cgi?id=4041

Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/sleep/main.c     | 74 +++++++++++++++++----------------------
 drivers/acpi/sleep/poweroff.c | 81 ++++++++++++++++++++++++++++++++++++++-----
 drivers/base/sys.c            |  1 -
 include/linux/pm.h            |  2 +-
 kernel/power/main.c           |  2 +-
 5 files changed, 107 insertions(+), 53 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c
index 0a5d2a94131e..7249ba2b7a27 100644
--- a/drivers/acpi/sleep/main.c
+++ b/drivers/acpi/sleep/main.c
@@ -1,6 +1,7 @@
 /*
  * sleep.c - ACPI sleep support.
  *
+ * Copyright (c) 2005 Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>
  * Copyright (c) 2004 David Shaohua Li <shaohua.li@intel.com>
  * Copyright (c) 2000-2003 Patrick Mochel
  * Copyright (c) 2003 Open Source Development Lab
@@ -14,7 +15,6 @@
 #include <linux/dmi.h>
 #include <linux/device.h>
 #include <linux/suspend.h>
-#include <asm/io.h>
 #include <acpi/acpi_bus.h>
 #include <acpi/acpi_drivers.h>
 #include "sleep.h"
@@ -27,10 +27,11 @@ extern void do_suspend_lowlevel_s4bios(void);
 extern void do_suspend_lowlevel(void);
 
 static u32 acpi_suspend_states[] = {
-	[PM_SUSPEND_ON]		= ACPI_STATE_S0,
-	[PM_SUSPEND_STANDBY]	= ACPI_STATE_S1,
-	[PM_SUSPEND_MEM]	= ACPI_STATE_S3,
-	[PM_SUSPEND_DISK]	= ACPI_STATE_S4,
+	[PM_SUSPEND_ON] = ACPI_STATE_S0,
+	[PM_SUSPEND_STANDBY] = ACPI_STATE_S1,
+	[PM_SUSPEND_MEM] = ACPI_STATE_S3,
+	[PM_SUSPEND_DISK] = ACPI_STATE_S4,
+	[PM_SUSPEND_MAX] = ACPI_STATE_S5
 };
 
 static int init_8259A_after_S1;
@@ -44,30 +45,20 @@ static int init_8259A_after_S1;
  *	wakeup code to the waking vector. 
  */
 
+extern int acpi_sleep_prepare(u32 acpi_state);
+extern void acpi_power_off(void);
+
 static int acpi_pm_prepare(suspend_state_t pm_state)
 {
 	u32 acpi_state = acpi_suspend_states[pm_state];
 
-	if (!sleep_states[acpi_state])
+	if (!sleep_states[acpi_state]) {
+		printk("acpi_pm_prepare does not support %d \n", pm_state);
 		return -EPERM;
-
-	/* do we have a wakeup address for S2 and S3? */
-	/* Here, we support only S4BIOS, those we set the wakeup address */
-	/* S4OS is only supported for now via swsusp.. */
-	if (pm_state == PM_SUSPEND_MEM || pm_state == PM_SUSPEND_DISK) {
-		if (!acpi_wakeup_address)
-			return -EFAULT;
-		acpi_set_firmware_waking_vector(
-			(acpi_physical_address) virt_to_phys(
-				(void *)acpi_wakeup_address));
 	}
-	ACPI_FLUSH_CPU_CACHE();
-	acpi_enable_wakeup_device_prep(acpi_state);
-	acpi_enter_sleep_state_prep(acpi_state);
-	return 0;
+	return acpi_sleep_prepare(acpi_state);
 }
 
-
 /**
  *	acpi_pm_enter - Actually enter a sleep state.
  *	@pm_state:		State we're entering.
@@ -92,11 +83,9 @@ static int acpi_pm_enter(suspend_state_t pm_state)
 			return error;
 	}
 
-
 	local_irq_save(flags);
 	acpi_enable_wakeup_device(acpi_state);
-	switch (pm_state)
-	{
+	switch (pm_state) {
 	case PM_SUSPEND_STANDBY:
 		barrier();
 		status = acpi_enter_sleep_state(acpi_state);
@@ -112,6 +101,10 @@ static int acpi_pm_enter(suspend_state_t pm_state)
 		else
 			do_suspend_lowlevel_s4bios();
 		break;
+	case PM_SUSPEND_MAX:
+		acpi_power_off();
+		break;
+
 	default:
 		return -EINVAL;
 	}
@@ -126,11 +119,9 @@ static int acpi_pm_enter(suspend_state_t pm_state)
 	if (pm_state > PM_SUSPEND_STANDBY)
 		acpi_restore_state_mem();
 
-
 	return ACPI_SUCCESS(status) ? 0 : -EFAULT;
 }
 
-
 /**
  *	acpi_pm_finish - Finish up suspend sequence.
  *	@pm_state:		State we're coming out of.
@@ -156,27 +147,26 @@ static int acpi_pm_finish(suspend_state_t pm_state)
 	return 0;
 }
 
-
 int acpi_suspend(u32 acpi_state)
 {
 	suspend_state_t states[] = {
-		[1]	= PM_SUSPEND_STANDBY,
-		[3]	= PM_SUSPEND_MEM,
-		[4]	= PM_SUSPEND_DISK,
+		[1] = PM_SUSPEND_STANDBY,
+		[3] = PM_SUSPEND_MEM,
+		[4] = PM_SUSPEND_DISK,
+		[5] = PM_SUSPEND_MAX
 	};
 
-	if (acpi_state <= 4 && states[acpi_state])
+	if (acpi_state < 6 && states[acpi_state])
 		return pm_suspend(states[acpi_state]);
 	return -EINVAL;
 }
 
 static struct pm_ops acpi_pm_ops = {
-	.prepare	= acpi_pm_prepare,
-	.enter		= acpi_pm_enter,
-	.finish		= acpi_pm_finish,
+	.prepare = acpi_pm_prepare,
+	.enter = acpi_pm_enter,
+	.finish = acpi_pm_finish,
 };
 
-
 /*
  * Toshiba fails to preserve interrupts over S1, reinitialization
  * of 8259 is needed after S1 resume.
@@ -190,16 +180,16 @@ static int __init init_ints_after_s1(struct dmi_system_id *d)
 
 static struct dmi_system_id __initdata acpisleep_dmi_table[] = {
 	{
-		.callback = init_ints_after_s1,
-		.ident = "Toshiba Satellite 4030cdt",
-		.matches = { DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"), },
-	},
-	{ },
+	 .callback = init_ints_after_s1,
+	 .ident = "Toshiba Satellite 4030cdt",
+	 .matches = {DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),},
+	 },
+	{},
 };
 
 static int __init acpi_sleep_init(void)
 {
-	int			i = 0;
+	int i = 0;
 
 	dmi_check_system(acpisleep_dmi_table);
 
@@ -207,7 +197,7 @@ static int __init acpi_sleep_init(void)
 		return 0;
 
 	printk(KERN_INFO PREFIX "(supports");
-	for (i=0; i < ACPI_S_STATE_COUNT; i++) {
+	for (i = 0; i < ACPI_S_STATE_COUNT; i++) {
 		acpi_status status;
 		u8 type_a, type_b;
 		status = acpi_get_sleep_type_data(i, &type_a, &type_b);
diff --git a/drivers/acpi/sleep/poweroff.c b/drivers/acpi/sleep/poweroff.c
index da237754ded9..1fc86e6b5ab9 100644
--- a/drivers/acpi/sleep/poweroff.c
+++ b/drivers/acpi/sleep/poweroff.c
@@ -3,35 +3,100 @@
  *
  * AKA S5, but it is independent of whether or not the kernel supports
  * any other sleep support in the system.
+ *
+ * Copyright (c) 2005 Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>
+ *
+ * This file is released under the GPLv2.
  */
 
 #include <linux/pm.h>
 #include <linux/init.h>
 #include <acpi/acpi_bus.h>
 #include <linux/sched.h>
+#include <linux/sysdev.h>
+#include <asm/io.h>
 #include "sleep.h"
 
-static void
-acpi_power_off (void)
+int acpi_sleep_prepare(u32 acpi_state)
+{
+	/* Flag to do not allow second time invocation for S5 state */
+	static int shutdown_prepared = 0;
+#ifdef CONFIG_ACPI_SLEEP
+	/* do we have a wakeup address for S2 and S3? */
+	/* Here, we support only S4BIOS, those we set the wakeup address */
+	/* S4OS is only supported for now via swsusp.. */
+	if (acpi_state == ACPI_STATE_S3 || acpi_state == ACPI_STATE_S4) {
+		if (!acpi_wakeup_address) {
+			return -EFAULT;
+		}
+		acpi_set_firmware_waking_vector((acpi_physical_address)
+						virt_to_phys((void *)
+							     acpi_wakeup_address));
+
+	}
+	ACPI_FLUSH_CPU_CACHE();
+	acpi_enable_wakeup_device_prep(acpi_state);
+#endif
+	if (acpi_state == ACPI_STATE_S5) {
+		/* Check if we were already called */
+		if (shutdown_prepared)
+			return 0;
+		acpi_wakeup_gpe_poweroff_prepare();
+		shutdown_prepared = 1;
+	}
+	acpi_enter_sleep_state_prep(acpi_state);
+	return 0;
+}
+
+void acpi_power_off(void)
 {
-	printk("%s called\n",__FUNCTION__);
+	printk("%s called\n", __FUNCTION__);
+	acpi_sleep_prepare(ACPI_STATE_S5);
+	local_irq_disable();
 	/* Some SMP machines only can poweroff in boot CPU */
 	set_cpus_allowed(current, cpumask_of_cpu(0));
-	acpi_wakeup_gpe_poweroff_prepare();
-	acpi_enter_sleep_state_prep(ACPI_STATE_S5);
-	ACPI_DISABLE_IRQS();
 	acpi_enter_sleep_state(ACPI_STATE_S5);
 }
 
+#ifdef CONFIG_PM
+
+static int acpi_shutdown(struct sys_device *x)
+{
+	return acpi_sleep_prepare(ACPI_STATE_S5);
+}
+
+static struct sysdev_class acpi_sysclass = {
+	set_kset_name("acpi"),
+	.shutdown = acpi_shutdown
+};
+
+static struct sys_device device_acpi = {
+	.id = 0,
+	.cls = &acpi_sysclass,
+};
+
+#endif
+
 static int acpi_poweroff_init(void)
 {
 	if (!acpi_disabled) {
 		u8 type_a, type_b;
 		acpi_status status;
 
-		status = acpi_get_sleep_type_data(ACPI_STATE_S5, &type_a, &type_b);
-		if (ACPI_SUCCESS(status))
+		status =
+		    acpi_get_sleep_type_data(ACPI_STATE_S5, &type_a, &type_b);
+		if (ACPI_SUCCESS(status)) {
 			pm_power_off = acpi_power_off;
+#ifdef CONFIG_PM
+			{
+				int error;
+				error = sysdev_class_register(&acpi_sysclass);
+				if (!error)
+					error = sysdev_register(&device_acpi);
+				return error;
+			}
+#endif
+		}
 	}
 	return 0;
 }
diff --git a/drivers/base/sys.c b/drivers/base/sys.c
index 9102e3756f95..5474bf9622d5 100644
--- a/drivers/base/sys.c
+++ b/drivers/base/sys.c
@@ -22,7 +22,6 @@
 #include <linux/string.h>
 #include <linux/pm.h>
 
-
 extern struct subsystem devices_subsys;
 
 #define to_sysdev(k) container_of(k, struct sys_device, kobj)
diff --git a/include/linux/pm.h b/include/linux/pm.h
index ed2b76e75199..da88851266b8 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -175,7 +175,7 @@ struct pm_ops {
 };
 
 extern void pm_set_ops(struct pm_ops *);
-
+extern struct pm_ops *pm_ops;
 extern int pm_suspend(suspend_state_t state);
 
 
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 4cdebc972ff2..c7eb4a833db5 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -190,7 +190,7 @@ int software_suspend(void)
 
 int pm_suspend(suspend_state_t state)
 {
-	if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX)
+	if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
 		return enter_state(state);
 	return -EINVAL;
 }
-- 
cgit v1.2.3-59-g8ed1b


From 4e10d12a3d88c88fba3258809aa42d14fd8cf1d1 Mon Sep 17 00:00:00 2001
From: David Shaohua Li <shaohua.li@intel.com>
Date: Fri, 18 Mar 2005 18:45:35 -0500
Subject: [ACPI] Bind PCI devices with ACPI devices

Implement the framework for binding physical devices
with ACPI devices. A physical bus like the PCI bus
should create an 'acpi_bus_type', with:

.find_device:
        For devices which have a parent, such as normal PCI devices.

.find_bridge:
        For special devices, such as a PCI root bridge
	or IDE controller.  Such devices generally don't have a
	parent or ->bus. We use this special method
	to get an ACPI handle.

Uses new field in struct device: firmware_data

http://bugzilla.kernel.org/show_bug.cgi?id=4277

Signed-off-by: David Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/Makefile   |   2 +-
 drivers/acpi/glue.c     | 362 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/acpi/ibm_acpi.c |   4 +-
 include/acpi/acpi_bus.h |  21 +++
 include/linux/device.h  |   6 +-
 5 files changed, 390 insertions(+), 5 deletions(-)
 create mode 100644 drivers/acpi/glue.c

(limited to 'include/linux')

diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 24eb397e17b8..ad67e8f61e6c 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -36,7 +36,7 @@ processor-objs	+= processor_perflib.o
 endif
 
 obj-$(CONFIG_ACPI_BUS)		+= sleep/
-obj-$(CONFIG_ACPI_BUS)		+= bus.o
+obj-$(CONFIG_ACPI_BUS)		+= bus.o glue.o
 obj-$(CONFIG_ACPI_AC) 		+= ac.o
 obj-$(CONFIG_ACPI_BATTERY)	+= battery.o
 obj-$(CONFIG_ACPI_BUTTON)	+= button.o
diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c
new file mode 100644
index 000000000000..b6d2045caf3e
--- /dev/null
+++ b/drivers/acpi/glue.c
@@ -0,0 +1,362 @@
+/*
+ * Link physical devices with ACPI devices support
+ *
+ * Copyright (c) 2005 David Shaohua Li <shaohua.li@intel.com>
+ * Copyright (c) 2005 Intel Corp.
+ *
+ * This file is released under the GPLv2.
+ */
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/device.h>
+#include <linux/rwsem.h>
+#include <linux/acpi.h>
+
+#define ACPI_GLUE_DEBUG	0
+#if ACPI_GLUE_DEBUG
+#define DBG(x...) printk(PREFIX x)
+#else
+#define DBG(x...)
+#endif
+static LIST_HEAD(bus_type_list);
+static DECLARE_RWSEM(bus_type_sem);
+
+int register_acpi_bus_type(struct acpi_bus_type *type)
+{
+	if (acpi_disabled)
+		return -ENODEV;
+	if (type && type->bus && type->find_device) {
+		down_write(&bus_type_sem);
+		list_add_tail(&type->list, &bus_type_list);
+		up_write(&bus_type_sem);
+		DBG("ACPI bus type %s registered\n", type->bus->name);
+		return 0;
+	}
+	return -ENODEV;
+}
+
+EXPORT_SYMBOL(register_acpi_bus_type);
+
+int unregister_acpi_bus_type(struct acpi_bus_type *type)
+{
+	if (acpi_disabled)
+		return 0;
+	if (type) {
+		down_write(&bus_type_sem);
+		list_del_init(&type->list);
+		up_write(&bus_type_sem);
+		DBG("ACPI bus type %s unregistered\n", type->bus->name);
+		return 0;
+	}
+	return -ENODEV;
+}
+
+EXPORT_SYMBOL(unregister_acpi_bus_type);
+
+static struct acpi_bus_type *acpi_get_bus_type(struct bus_type *type)
+{
+	struct acpi_bus_type *tmp, *ret = NULL;
+
+	down_read(&bus_type_sem);
+	list_for_each_entry(tmp, &bus_type_list, list) {
+		if (tmp->bus == type) {
+			ret = tmp;
+			break;
+		}
+	}
+	up_read(&bus_type_sem);
+	return ret;
+}
+
+static int acpi_find_bridge_device(struct device *dev, acpi_handle * handle)
+{
+	struct acpi_bus_type *tmp;
+	int ret = -ENODEV;
+
+	down_read(&bus_type_sem);
+	list_for_each_entry(tmp, &bus_type_list, list) {
+		if (tmp->find_bridge && !tmp->find_bridge(dev, handle)) {
+			ret = 0;
+			break;
+		}
+	}
+	up_read(&bus_type_sem);
+	return ret;
+}
+
+/* Get PCI root bridge's handle from its segment and bus number */
+struct acpi_find_pci_root {
+	unsigned int seg;
+	unsigned int bus;
+	acpi_handle handle;
+};
+
+static acpi_status
+do_root_bridge_busnr_callback(struct acpi_resource *resource, void *data)
+{
+	int *busnr = (int *)data;
+	struct acpi_resource_address64 address;
+
+	if (resource->id != ACPI_RSTYPE_ADDRESS16 &&
+	    resource->id != ACPI_RSTYPE_ADDRESS32 &&
+	    resource->id != ACPI_RSTYPE_ADDRESS64)
+		return AE_OK;
+
+	acpi_resource_to_address64(resource, &address);
+	if ((address.address_length > 0) &&
+	    (address.resource_type == ACPI_BUS_NUMBER_RANGE))
+		*busnr = address.min_address_range;
+
+	return AE_OK;
+}
+
+static int get_root_bridge_busnr(acpi_handle handle)
+{
+	acpi_status status;
+	int bus, bbn;
+	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+
+	acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer);
+
+	status = acpi_evaluate_integer(handle, METHOD_NAME__BBN, NULL,
+				       (unsigned long *)&bbn);
+	if (status == AE_NOT_FOUND) {
+		/* Assume bus = 0 */
+		printk(KERN_INFO PREFIX
+		       "Assume root bridge [%s] bus is 0\n",
+		       (char *)buffer.pointer);
+		status = AE_OK;
+		bbn = 0;
+	}
+	if (ACPI_FAILURE(status)) {
+		bbn = -ENODEV;
+		goto exit;
+	}
+	if (bbn > 0)
+		goto exit;
+
+	/* _BBN in some systems return 0 for all root bridges */
+	bus = -1;
+	status = acpi_walk_resources(handle, METHOD_NAME__CRS,
+				     do_root_bridge_busnr_callback, &bus);
+	/* If _CRS failed, we just use _BBN */
+	if (ACPI_FAILURE(status) || (bus == -1))
+		goto exit;
+	/* We select _CRS */
+	if (bbn != bus) {
+		printk(KERN_INFO PREFIX
+		       "_BBN and _CRS returns different value for %s. Select _CRS\n",
+		       (char *)buffer.pointer);
+		bbn = bus;
+	}
+      exit:
+	acpi_os_free(buffer.pointer);
+	return bbn;
+}
+
+static acpi_status
+find_pci_rootbridge(acpi_handle handle, u32 lvl, void *context, void **rv)
+{
+	struct acpi_find_pci_root *find = (struct acpi_find_pci_root *)context;
+	unsigned long seg, bus;
+	acpi_status status;
+	int tmp;
+	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+
+	acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer);
+
+	status = acpi_evaluate_integer(handle, METHOD_NAME__SEG, NULL, &seg);
+	if (status == AE_NOT_FOUND) {
+		/* Assume seg = 0 */
+		printk(KERN_INFO PREFIX
+		       "Assume root bridge [%s] segment is 0\n",
+		       (char *)buffer.pointer);
+		status = AE_OK;
+		seg = 0;
+	}
+	if (ACPI_FAILURE(status)) {
+		status = AE_CTRL_DEPTH;
+		goto exit;
+	}
+
+	tmp = get_root_bridge_busnr(handle);
+	if (tmp < 0) {
+		printk(KERN_ERR PREFIX
+		       "Find root bridge failed for %s\n",
+		       (char *)buffer.pointer);
+		status = AE_CTRL_DEPTH;
+		goto exit;
+	}
+	bus = tmp;
+
+	if (seg == find->seg && bus == find->bus)
+		find->handle = handle;
+	status = AE_OK;
+      exit:
+	acpi_os_free(buffer.pointer);
+	return status;
+}
+
+acpi_handle acpi_get_pci_rootbridge_handle(unsigned int seg, unsigned int bus)
+{
+	struct acpi_find_pci_root find = { seg, bus, NULL };
+
+	acpi_get_devices(PCI_ROOT_HID_STRING, find_pci_rootbridge, &find, NULL);
+	return find.handle;
+}
+
+/* Get a device's handle by its address under its parent */
+struct acpi_find_child {
+	acpi_handle handle;
+	acpi_integer address;
+};
+
+static acpi_status
+do_acpi_find_child(acpi_handle handle, u32 lvl, void *context, void **rv)
+{
+	acpi_status status;
+	struct acpi_device_info *info;
+	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+	struct acpi_find_child *find = (struct acpi_find_child *)context;
+
+	status = acpi_get_object_info(handle, &buffer);
+	if (ACPI_SUCCESS(status)) {
+		info = buffer.pointer;
+		if (info->address == find->address)
+			find->handle = handle;
+		acpi_os_free(buffer.pointer);
+	}
+	return AE_OK;
+}
+
+acpi_handle acpi_get_child(acpi_handle parent, acpi_integer address)
+{
+	struct acpi_find_child find = { NULL, address };
+
+	if (!parent)
+		return NULL;
+	acpi_walk_namespace(ACPI_TYPE_DEVICE, parent,
+			    1, do_acpi_find_child, &find, NULL);
+	return find.handle;
+}
+
+EXPORT_SYMBOL(acpi_get_child);
+
+/* Link ACPI devices with physical devices */
+static void acpi_glue_data_handler(acpi_handle handle,
+				   u32 function, void *context)
+{
+	/* we provide an empty handler */
+}
+
+/* Note: a successful call increases the device's reference count by one */
+struct device *acpi_get_physical_device(acpi_handle handle)
+{
+	acpi_status status;
+	struct device *dev;
+
+	status = acpi_get_data(handle, acpi_glue_data_handler, (void **)&dev);
+	if (ACPI_SUCCESS(status))
+		return get_device(dev);
+	return NULL;
+}
+
+EXPORT_SYMBOL(acpi_get_physical_device);
+
+static int acpi_bind_one(struct device *dev, acpi_handle handle)
+{
+	acpi_status status;
+
+	if (dev->firmware_data) {
+		printk(KERN_WARNING PREFIX
+		       "Drivers changed 'firmware_data' for %s\n", dev->bus_id);
+		return -EINVAL;
+	}
+	get_device(dev);
+	status = acpi_attach_data(handle, acpi_glue_data_handler, dev);
+	if (ACPI_FAILURE(status)) {
+		put_device(dev);
+		return -EINVAL;
+	}
+	dev->firmware_data = handle;
+
+	return 0;
+}
+
+static int acpi_unbind_one(struct device *dev)
+{
+	if (!dev->firmware_data)
+		return 0;
+	if (dev == acpi_get_physical_device(dev->firmware_data)) {
+		/* acpi_get_physical_device increase refcnt by one */
+		put_device(dev);
+		acpi_detach_data(dev->firmware_data, acpi_glue_data_handler);
+		dev->firmware_data = NULL;
+		/* acpi_bind_one increase refcnt by one */
+		put_device(dev);
+	} else {
+		printk(KERN_ERR PREFIX
+		       "Oops, 'firmware_data' corrupt for %s\n", dev->bus_id);
+	}
+	return 0;
+}
+
+static int acpi_platform_notify(struct device *dev)
+{
+	struct acpi_bus_type *type;
+	acpi_handle handle;
+	int ret = -EINVAL;
+
+	if (!dev->bus || !dev->parent) {
+		/* bridge devices genernally haven't bus or parent */
+		ret = acpi_find_bridge_device(dev, &handle);
+		goto end;
+	}
+	type = acpi_get_bus_type(dev->bus);
+	if (!type) {
+		printk(KERN_INFO PREFIX "No ACPI bus support for %s\n",
+		       dev->bus_id);
+		ret = -EINVAL;
+		goto end;
+	}
+	if ((ret = type->find_device(dev, &handle)) != 0)
+		printk(KERN_INFO PREFIX "Can't get handler for %s\n",
+		       dev->bus_id);
+      end:
+	if (!ret)
+		acpi_bind_one(dev, handle);
+
+#if ACPI_GLUE_DEBUG
+	if (!ret) {
+		struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
+
+		acpi_get_name(dev->firmware_data, ACPI_FULL_PATHNAME, &buffer);
+		DBG("Device %s -> %s\n", dev->bus_id, (char *)buffer.pointer);
+		acpi_os_free(buffer.pointer);
+	} else
+		DBG("Device %s -> No ACPI support\n", dev->bus_id);
+#endif
+
+	return ret;
+}
+
+static int acpi_platform_notify_remove(struct device *dev)
+{
+	acpi_unbind_one(dev);
+	return 0;
+}
+
+static int __init init_acpi_device_notify(void)
+{
+	if (acpi_disabled)
+		return 0;
+	if (platform_notify || platform_notify_remove) {
+		printk(KERN_ERR PREFIX "Can't use platform_notify\n");
+		return 0;
+	}
+	platform_notify = acpi_platform_notify;
+	platform_notify_remove = acpi_platform_notify_remove;
+	return 0;
+}
+
+arch_initcall(init_acpi_device_notify);
diff --git a/drivers/acpi/ibm_acpi.c b/drivers/acpi/ibm_acpi.c
index 6c8291c3e774..ad85e10001f4 100644
--- a/drivers/acpi/ibm_acpi.c
+++ b/drivers/acpi/ibm_acpi.c
@@ -1025,7 +1025,7 @@ static int setup_notify(struct ibm_struct *ibm)
 	return 0;
 }
 
-static int device_add(struct acpi_device *device)
+static int ibmacpi_device_add(struct acpi_device *device)
 {
 	return 0;
 }
@@ -1043,7 +1043,7 @@ static int register_driver(struct ibm_struct *ibm)
 	memset(ibm->driver, 0, sizeof(struct acpi_driver));
 	sprintf(ibm->driver->name, "%s/%s", IBM_NAME, ibm->name);
 	ibm->driver->ids = ibm->hid;
-	ibm->driver->ops.add = &device_add;
+	ibm->driver->ops.add = &ibmacpi_device_add;
 
 	ret = acpi_bus_register_driver(ibm->driver);
 	if (ret < 0) {
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index c627bc408a6b..53b821d7b8a8 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -336,6 +336,27 @@ int acpi_match_ids (struct acpi_device	*device, char	*ids);
 int acpi_create_dir(struct acpi_device *);
 void acpi_remove_dir(struct acpi_device *);
 
+
+/*
+ * Bind physical devices with ACPI devices
+ */
+#include <linux/device.h>
+struct acpi_bus_type {
+	struct list_head	list;
+	struct bus_type		*bus;
+	/* For general devices under the bus*/
+	int (*find_device)(struct device *, acpi_handle*);
+	/* For bridges, such as PCI root bridge, IDE controller */
+	int (*find_bridge)(struct device *, acpi_handle *);
+};
+int register_acpi_bus_type(struct acpi_bus_type *);
+int unregister_acpi_bus_type(struct acpi_bus_type *);
+struct device *acpi_get_physical_device(acpi_handle);
+/* helper */
+acpi_handle acpi_get_child(acpi_handle, acpi_integer);
+acpi_handle acpi_get_pci_rootbridge_handle(unsigned int, unsigned int);
+#define DEVICE_ACPI_HANDLE(dev) ((acpi_handle)((dev)->firmware_data))
+
 #endif /*CONFIG_ACPI_BUS*/
 
 #endif /*__ACPI_BUS_H__*/
diff --git a/include/linux/device.h b/include/linux/device.h
index df94c0de53f2..de2d6fe349de 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -269,8 +269,10 @@ struct device {
 	struct device_driver *driver;	/* which driver has allocated this
 					   device */
 	void		*driver_data;	/* data private to the driver */
-	void		*platform_data;	/* Platform specific data (e.g. ACPI,
-					   BIOS data relevant to device) */
+	void		*platform_data;	/* Platform specific data, device
+					   core doesn't touch it */
+	void		*firmware_data; /* Firmware specific data (e.g. ACPI,
+					   BIOS data),reserved for device core*/
 	struct dev_pm_info	power;
 
 	u64		*dma_mask;	/* dma mask (if dma'able device) */
-- 
cgit v1.2.3-59-g8ed1b


From 55e59c511cea3c6c721971467c707e9955922bc2 Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Thu, 31 Mar 2005 22:51:10 -0500
Subject: [ACPI] Evaluate CPEI Processor Override flag

ACPI 3.0 added a Correctable Platform Error Interrupt (CPEI)
Processor Override flag to MADT.Platform_Interrupt_Source.
Record the processor that was provided as a hint from ACPI.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/ia64/kernel/acpi.c     | 54 +++++++++++++++++++++++++++++++++++++++++++++
 arch/ia64/kernel/mca.c      |  2 +-
 arch/ia64/kernel/topology.c |  7 ++++++
 include/asm-ia64/acpi.h     |  9 ++++++++
 include/linux/acpi.h        |  5 ++++-
 5 files changed, 75 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 72dfd9e7de0f..1c118b72df3c 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -11,6 +11,7 @@
  *  Copyright (C) 2001 Jenna Hall <jenna.s.hall@intel.com>
  *  Copyright (C) 2001 Takayoshi Kochi <t-kochi@bq.jp.nec.com>
  *  Copyright (C) 2002 Erich Focht <efocht@ess.nec.de>
+ *  Copyright (C) 2004 Ashok Raj <ashok.raj@intel.com>
  *
  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  *
@@ -67,6 +68,11 @@ EXPORT_SYMBOL(pm_power_off);
 unsigned char acpi_kbd_controller_present = 1;
 unsigned char acpi_legacy_devices;
 
+static unsigned int __initdata acpi_madt_rev;
+
+unsigned int acpi_cpei_override;
+unsigned int acpi_cpei_phys_cpuid;
+
 #define MAX_SAPICS 256
 u16 ia64_acpiid_to_sapicid[MAX_SAPICS] =
 	{ [0 ... MAX_SAPICS - 1] = -1 };
@@ -267,10 +273,56 @@ acpi_parse_plat_int_src (
 						(plintsrc->flags.trigger == 1) ? IOSAPIC_EDGE : IOSAPIC_LEVEL);
 
 	platform_intr_list[plintsrc->type] = vector;
+	if (acpi_madt_rev > 1) {
+		acpi_cpei_override = plintsrc->plint_flags.cpei_override_flag;
+	}
+
+	/*
+	 * Save the physical id, so we can check when its being removed
+	 */
+	acpi_cpei_phys_cpuid = ((plintsrc->id << 8) | (plintsrc->eid)) & 0xffff;
+
 	return 0;
 }
 
 
+unsigned int can_cpei_retarget(void)
+{
+	extern int cpe_vector;
+
+	/*
+	 * Only if CPEI is supported and the override flag
+	 * is present, otherwise return that its re-targettable
+	 * if we are in polling mode.
+	 */
+	if (cpe_vector > 0 && !acpi_cpei_override)
+		return 0;
+	else
+		return 1;
+}
+
+unsigned int is_cpu_cpei_target(unsigned int cpu)
+{
+	unsigned int logical_id;
+
+	logical_id = cpu_logical_id(acpi_cpei_phys_cpuid);
+
+	if (logical_id == cpu)
+		return 1;
+	else
+		return 0;
+}
+
+void set_cpei_target_cpu(unsigned int cpu)
+{
+	acpi_cpei_phys_cpuid = cpu_physical_id(cpu);
+}
+
+unsigned int get_cpei_target_cpu(void)
+{
+	return acpi_cpei_phys_cpuid;
+}
+
 static int __init
 acpi_parse_int_src_ovr (
 	acpi_table_entry_header *header, const unsigned long end)
@@ -328,6 +380,8 @@ acpi_parse_madt (unsigned long phys_addr, unsigned long size)
 
 	acpi_madt = (struct acpi_table_madt *) __va(phys_addr);
 
+	acpi_madt_rev = acpi_madt->header.revision;
+
 	/* remember the value for reference after free_initmem() */
 #ifdef CONFIG_ITANIUM
 	has_8259 = 1; /* Firmware on old Itanium systems is broken */
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 736e328b5e61..4ebbf3974381 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -271,7 +271,7 @@ ia64_mca_log_sal_error_record(int sal_info_type)
 
 #ifdef CONFIG_ACPI
 
-static int cpe_vector = -1;
+int cpe_vector = -1;
 
 static irqreturn_t
 ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs)
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index f1aafd4c05f9..d8030f3bd865 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -36,6 +36,13 @@ int arch_register_cpu(int num)
 	parent = &sysfs_nodes[cpu_to_node(num)];
 #endif /* CONFIG_NUMA */
 
+	/*
+	 * If CPEI cannot be re-targetted, and this is
+	 * CPEI target, then dont create the control file
+	 */
+	if (!can_cpei_retarget() && is_cpu_cpei_target(num))
+		sysfs_cpus[num].cpu.no_control = 1;
+
 	return register_cpu(&sysfs_cpus[num].cpu, num, parent);
 }
 
diff --git a/include/asm-ia64/acpi.h b/include/asm-ia64/acpi.h
index 6a26a977f253..4c06d455139c 100644
--- a/include/asm-ia64/acpi.h
+++ b/include/asm-ia64/acpi.h
@@ -98,6 +98,15 @@ const char *acpi_get_sysname (void);
 int acpi_request_vector (u32 int_type);
 int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
 
+/*
+ * Record the cpei override flag and current logical cpu. This is
+ * useful for CPU removal.
+ */
+extern unsigned int can_cpei_retarget(void);
+extern unsigned int is_cpu_cpei_target(unsigned int cpu);
+extern void set_cpei_target_cpu(unsigned int cpu);
+extern unsigned int get_cpei_target_cpu(void);
+
 #ifdef CONFIG_ACPI_NUMA
 /* Proximity bitmap length; _PXM is at most 255 (8 bit)*/
 #define MAX_PXM_DOMAINS (256)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index b123cc08773d..70b3c52b75d7 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -206,7 +206,10 @@ struct acpi_table_plat_int_src {
 	u8			eid;
 	u8			iosapic_vector;
 	u32			global_irq;
-	u32			reserved;
+	struct {
+		u32			cpei_override_flag:1;
+		u32			reserved:31;
+	}			plint_flags;
 } __attribute__ ((packed));
 
 enum acpi_interrupt_id {
-- 
cgit v1.2.3-59-g8ed1b


From c9c3e457de24cca2ca688fa397d93a241f472048 Mon Sep 17 00:00:00 2001
From: David Shaohua Li <shaohua.li@intel.com>
Date: Fri, 1 Apr 2005 00:07:31 -0500
Subject: [ACPI] PNPACPI vs sound IRQ

http://bugme.osdl.org/show_bug.cgi?id=4016

Written-by: David Shaohua Li <shaohua.li@intel.com>
Acked-by: Adam Belay <abelay@novell.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/frv/mb93090-mb00/pci-irq.c |  2 +-
 arch/i386/pci/irq.c             | 16 ++++++++++------
 arch/i386/pci/visws.c           |  2 +-
 drivers/acpi/pci_link.c         |  7 +++++--
 drivers/pnp/pnpacpi/rsparser.c  |  4 ++--
 drivers/pnp/pnpbios/rsparser.c  |  2 +-
 drivers/pnp/resource.c          |  2 +-
 include/asm-alpha/pci.h         |  2 +-
 include/asm-arm/pci.h           |  2 +-
 include/asm-h8300/pci.h         |  2 +-
 include/asm-i386/pci.h          |  2 +-
 include/asm-ia64/pci.h          |  2 +-
 include/asm-m68k/pci.h          |  2 +-
 include/asm-mips/pci.h          |  2 +-
 include/asm-ppc/pci.h           |  2 +-
 include/asm-ppc64/pci.h         |  2 +-
 include/asm-sh/pci.h            |  2 +-
 include/asm-sh64/pci.h          |  2 +-
 include/asm-sparc/pci.h         |  2 +-
 include/asm-sparc64/pci.h       |  2 +-
 include/asm-x86_64/pci.h        |  2 +-
 include/linux/acpi.h            |  2 +-
 22 files changed, 36 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/arch/frv/mb93090-mb00/pci-irq.c b/arch/frv/mb93090-mb00/pci-irq.c
index 24622d89b1ca..af981bda015c 100644
--- a/arch/frv/mb93090-mb00/pci-irq.c
+++ b/arch/frv/mb93090-mb00/pci-irq.c
@@ -60,7 +60,7 @@ void __init pcibios_fixup_irqs(void)
 	}
 }
 
-void __init pcibios_penalize_isa_irq(int irq)
+void __init pcibios_penalize_isa_irq(int irq, int active)
 {
 }
 
diff --git a/arch/i386/pci/irq.c b/arch/i386/pci/irq.c
index da21b1d07c15..d21b3a2dc978 100644
--- a/arch/i386/pci/irq.c
+++ b/arch/i386/pci/irq.c
@@ -1006,24 +1006,28 @@ static int __init pcibios_irq_init(void)
 subsys_initcall(pcibios_irq_init);
 
 
-static void pirq_penalize_isa_irq(int irq)
+static void pirq_penalize_isa_irq(int irq, int active)
 {
 	/*
 	 *  If any ISAPnP device reports an IRQ in its list of possible
 	 *  IRQ's, we try to avoid assigning it to PCI devices.
 	 */
-	if (irq < 16)
-		pirq_penalty[irq] += 100;
+	if (irq < 16) {
+		if (active)
+			pirq_penalty[irq] += 1000;
+		else
+			pirq_penalty[irq] += 100;
+	}
 }
 
-void pcibios_penalize_isa_irq(int irq)
+void pcibios_penalize_isa_irq(int irq, int active)
 {
 #ifdef CONFIG_ACPI_PCI
 	if (!acpi_noirq)
-		acpi_penalize_isa_irq(irq);
+		acpi_penalize_isa_irq(irq, active);
 	else
 #endif
-		pirq_penalize_isa_irq(irq);
+		pirq_penalize_isa_irq(irq, active);
 }
 
 static int pirq_enable_irq(struct pci_dev *dev)
diff --git a/arch/i386/pci/visws.c b/arch/i386/pci/visws.c
index 6a9248784439..314c933b6b8e 100644
--- a/arch/i386/pci/visws.c
+++ b/arch/i386/pci/visws.c
@@ -21,7 +21,7 @@ static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; }
 
 int (*pcibios_enable_irq)(struct pci_dev *dev) = &pci_visws_enable_irq;
 
-void __init pcibios_penalize_isa_irq(int irq) {}
+void __init pcibios_penalize_isa_irq(int irq, int active) {}
 
 
 unsigned int pci_bus0, pci_bus1;
diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c
index f2271173bbd5..6ad0e77df9b3 100644
--- a/drivers/acpi/pci_link.c
+++ b/drivers/acpi/pci_link.c
@@ -804,9 +804,12 @@ static int __init acpi_irq_penalty_update(char *str, int used)
  * There is no ISA_POSSIBLE weight, so we simply use
  * the (small) PCI_USING penalty.
  */
-void acpi_penalize_isa_irq(int irq)
+void acpi_penalize_isa_irq(int irq, int active)
 {
-	acpi_irq_penalty[irq] += PIRQ_PENALTY_PCI_USING;
+	if (active)
+		acpi_irq_penalty[irq] += PIRQ_PENALTY_ISA_USED;
+	else
+		acpi_irq_penalty[irq] += PIRQ_PENALTY_PCI_USING;
 }
 
 /*
diff --git a/drivers/pnp/pnpacpi/rsparser.c b/drivers/pnp/pnpacpi/rsparser.c
index ae3819ad7cf4..75575f6c349c 100644
--- a/drivers/pnp/pnpacpi/rsparser.c
+++ b/drivers/pnp/pnpacpi/rsparser.c
@@ -160,7 +160,7 @@ static acpi_status pnpacpi_allocated_resource(struct acpi_resource *res,
 				acpi_register_gsi(res->data.irq.interrupts[0],
 					res->data.irq.edge_level,
 					res->data.irq.active_high_low));
-			pcibios_penalize_isa_irq(res->data.irq.interrupts[0]);
+			pcibios_penalize_isa_irq(res->data.irq.interrupts[0], 1);
 		}
 		break;
 
@@ -171,7 +171,7 @@ static acpi_status pnpacpi_allocated_resource(struct acpi_resource *res,
 				acpi_register_gsi(res->data.extended_irq.interrupts[0],
 					res->data.extended_irq.edge_level,
 					res->data.extended_irq.active_high_low));
-			pcibios_penalize_isa_irq(res->data.extended_irq.interrupts[0]);
+			pcibios_penalize_isa_irq(res->data.extended_irq.interrupts[0], 1);
 		}
 		break;
 	case ACPI_RSTYPE_DMA:
diff --git a/drivers/pnp/pnpbios/rsparser.c b/drivers/pnp/pnpbios/rsparser.c
index 79bce7b75740..9001b6f0204d 100644
--- a/drivers/pnp/pnpbios/rsparser.c
+++ b/drivers/pnp/pnpbios/rsparser.c
@@ -64,7 +64,7 @@ pnpbios_parse_allocated_irqresource(struct pnp_resource_table * res, int irq)
 		}
 		res->irq_resource[i].start =
 		res->irq_resource[i].end = (unsigned long) irq;
-		pcibios_penalize_isa_irq(irq);
+		pcibios_penalize_isa_irq(irq, 1);
 	}
 }
 
diff --git a/drivers/pnp/resource.c b/drivers/pnp/resource.c
index 2d1322dd7e19..887ad8939349 100644
--- a/drivers/pnp/resource.c
+++ b/drivers/pnp/resource.c
@@ -102,7 +102,7 @@ int pnp_register_irq_resource(struct pnp_option *option, struct pnp_irq *data)
 
 		for (i = 0; i < 16; i++)
 			if (test_bit(i, data->map))
-				pcibios_penalize_isa_irq(i);
+				pcibios_penalize_isa_irq(i, 0);
 	}
 #endif
 	return 0;
diff --git a/include/asm-alpha/pci.h b/include/asm-alpha/pci.h
index 0c7b57bc043a..7109860f98ec 100644
--- a/include/asm-alpha/pci.h
+++ b/include/asm-alpha/pci.h
@@ -58,7 +58,7 @@ struct pci_controller {
 
 extern void pcibios_set_master(struct pci_dev *dev);
 
-extern inline void pcibios_penalize_isa_irq(int irq)
+extern inline void pcibios_penalize_isa_irq(int irq, int active)
 {
 	/* We don't do dynamic PCI IRQ allocation */
 }
diff --git a/include/asm-arm/pci.h b/include/asm-arm/pci.h
index 40ffaefbeb1a..0f437e262314 100644
--- a/include/asm-arm/pci.h
+++ b/include/asm-arm/pci.h
@@ -14,7 +14,7 @@ static inline void pcibios_set_master(struct pci_dev *dev)
 	/* No special bus mastering setup handling */
 }
 
-static inline void pcibios_penalize_isa_irq(int irq)
+static inline void pcibios_penalize_isa_irq(int irq, int active)
 {
 	/* We don't do dynamic PCI IRQ allocation */
 }
diff --git a/include/asm-h8300/pci.h b/include/asm-h8300/pci.h
index d032729b19df..5edad5b70fd5 100644
--- a/include/asm-h8300/pci.h
+++ b/include/asm-h8300/pci.h
@@ -15,7 +15,7 @@ extern inline void pcibios_set_master(struct pci_dev *dev)
 	/* No special bus mastering setup handling */
 }
 
-extern inline void pcibios_penalize_isa_irq(int irq)
+extern inline void pcibios_penalize_isa_irq(int irq, int active)
 {
 	/* We don't do dynamic PCI IRQ allocation */
 }
diff --git a/include/asm-i386/pci.h b/include/asm-i386/pci.h
index fb749b85a739..e0dc1cea0b7c 100644
--- a/include/asm-i386/pci.h
+++ b/include/asm-i386/pci.h
@@ -27,7 +27,7 @@ void pcibios_config_init(void);
 struct pci_bus * pcibios_scan_root(int bus);
 
 void pcibios_set_master(struct pci_dev *dev);
-void pcibios_penalize_isa_irq(int irq);
+void pcibios_penalize_isa_irq(int irq, int active);
 struct irq_routing_table *pcibios_get_irq_routing_table(void);
 int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
 
diff --git a/include/asm-ia64/pci.h b/include/asm-ia64/pci.h
index a8314ee4e7d2..1cbd10b96b3a 100644
--- a/include/asm-ia64/pci.h
+++ b/include/asm-ia64/pci.h
@@ -47,7 +47,7 @@ pcibios_set_master (struct pci_dev *dev)
 }
 
 static inline void
-pcibios_penalize_isa_irq (int irq)
+pcibios_penalize_isa_irq (int irq, int active)
 {
 	/* We don't do dynamic PCI IRQ allocation */
 }
diff --git a/include/asm-m68k/pci.h b/include/asm-m68k/pci.h
index 9e7d79ab5d13..9d2c07abe44f 100644
--- a/include/asm-m68k/pci.h
+++ b/include/asm-m68k/pci.h
@@ -43,7 +43,7 @@ static inline void pcibios_set_master(struct pci_dev *dev)
 	/* No special bus mastering setup handling */
 }
 
-static inline void pcibios_penalize_isa_irq(int irq)
+static inline void pcibios_penalize_isa_irq(int irq, int active)
 {
 	/* We don't do dynamic PCI IRQ allocation */
 }
diff --git a/include/asm-mips/pci.h b/include/asm-mips/pci.h
index c9c576b48556..3bf1cb5cd548 100644
--- a/include/asm-mips/pci.h
+++ b/include/asm-mips/pci.h
@@ -69,7 +69,7 @@ extern unsigned long PCIBIOS_MIN_MEM;
 
 extern void pcibios_set_master(struct pci_dev *dev);
 
-static inline void pcibios_penalize_isa_irq(int irq)
+static inline void pcibios_penalize_isa_irq(int irq, int active)
 {
 	/* We don't do dynamic PCI IRQ allocation */
 }
diff --git a/include/asm-ppc/pci.h b/include/asm-ppc/pci.h
index ce5ae6d048f5..ebd34fffc730 100644
--- a/include/asm-ppc/pci.h
+++ b/include/asm-ppc/pci.h
@@ -37,7 +37,7 @@ extern inline void pcibios_set_master(struct pci_dev *dev)
 	/* No special bus mastering setup handling */
 }
 
-extern inline void pcibios_penalize_isa_irq(int irq)
+extern inline void pcibios_penalize_isa_irq(int irq, int active)
 {
 	/* We don't do dynamic PCI IRQ allocation */
 }
diff --git a/include/asm-ppc64/pci.h b/include/asm-ppc64/pci.h
index 6cd593f660a0..7c11687df3bb 100644
--- a/include/asm-ppc64/pci.h
+++ b/include/asm-ppc64/pci.h
@@ -37,7 +37,7 @@ static inline void pcibios_set_master(struct pci_dev *dev)
 	/* No special bus mastering setup handling */
 }
 
-static inline void pcibios_penalize_isa_irq(int irq)
+static inline void pcibios_penalize_isa_irq(int irq, int active)
 {
 	/* We don't do dynamic PCI IRQ allocation */
 }
diff --git a/include/asm-sh/pci.h b/include/asm-sh/pci.h
index 9c3b63d0105e..92bcb03426fe 100644
--- a/include/asm-sh/pci.h
+++ b/include/asm-sh/pci.h
@@ -36,7 +36,7 @@ struct pci_dev;
 
 extern void pcibios_set_master(struct pci_dev *dev);
 
-static inline void pcibios_penalize_isa_irq(int irq)
+static inline void pcibios_penalize_isa_irq(int irq, int active)
 {
 	/* We don't do dynamic PCI IRQ allocation */
 }
diff --git a/include/asm-sh64/pci.h b/include/asm-sh64/pci.h
index 8cc14e139750..ea711108f0e7 100644
--- a/include/asm-sh64/pci.h
+++ b/include/asm-sh64/pci.h
@@ -26,7 +26,7 @@ extern void pcibios_set_master(struct pci_dev *dev);
 /*
  * Set penalize isa irq function
  */
-static inline void pcibios_penalize_isa_irq(int irq)
+static inline void pcibios_penalize_isa_irq(int irq, int active)
 {
 	/* We don't do dynamic PCI IRQ allocation */
 }
diff --git a/include/asm-sparc/pci.h b/include/asm-sparc/pci.h
index d200a25a7373..d875d9496a8c 100644
--- a/include/asm-sparc/pci.h
+++ b/include/asm-sparc/pci.h
@@ -20,7 +20,7 @@ extern inline void pcibios_set_master(struct pci_dev *dev)
 	/* No special bus mastering setup handling */
 }
 
-extern inline void pcibios_penalize_isa_irq(int irq)
+extern inline void pcibios_penalize_isa_irq(int irq, int active)
 {
 	/* We don't do dynamic PCI IRQ allocation */
 }
diff --git a/include/asm-sparc64/pci.h b/include/asm-sparc64/pci.h
index 2a0c85cd1c11..e38d6598d626 100644
--- a/include/asm-sparc64/pci.h
+++ b/include/asm-sparc64/pci.h
@@ -23,7 +23,7 @@ static inline void pcibios_set_master(struct pci_dev *dev)
 	/* No special bus mastering setup handling */
 }
 
-static inline void pcibios_penalize_isa_irq(int irq)
+static inline void pcibios_penalize_isa_irq(int irq, int active)
 {
 	/* We don't do dynamic PCI IRQ allocation */
 }
diff --git a/include/asm-x86_64/pci.h b/include/asm-x86_64/pci.h
index 8712520ca47f..9e8c273b785e 100644
--- a/include/asm-x86_64/pci.h
+++ b/include/asm-x86_64/pci.h
@@ -33,7 +33,7 @@ extern int (*pci_config_read)(int seg, int bus, int dev, int fn, int reg, int le
 extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value);
 
 void pcibios_set_master(struct pci_dev *dev);
-void pcibios_penalize_isa_irq(int irq);
+void pcibios_penalize_isa_irq(int irq, int active);
 struct irq_routing_table *pcibios_get_irq_routing_table(void);
 int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 70b3c52b75d7..9c14959bcfa0 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -465,7 +465,7 @@ struct acpi_prt_list {
 struct pci_dev;
 
 int acpi_pci_irq_enable (struct pci_dev *dev);
-void acpi_penalize_isa_irq(int irq);
+void acpi_penalize_isa_irq(int irq, int active);
 
 #ifdef CONFIG_ACPI_DEALLOCATE_IRQ
 void acpi_pci_irq_disable (struct pci_dev *dev);
-- 
cgit v1.2.3-59-g8ed1b


From 5db539e49fc7471e23bf3c94ca304f008cb7b7f3 Mon Sep 17 00:00:00 2001
From: Olav Kongas <ok@artecdesign.ee>
Date: Thu, 23 Jun 2005 20:25:36 +0300
Subject: [PATCH] USB: Fix kmalloc's flags type in USB

Greg,

This patch fixes the kmalloc() flags argument type in the USB
subsystem; hopefully all of its occurrences. The patch was
made against patch-2.6.12-git2 from Jun 20.

Cleanup of flags for kmalloc() in USB subsystem.

Signed-off-by: Olav Kongas <ok@artecdesign.ee>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 drivers/usb/core/buffer.c        |  2 +-
 drivers/usb/core/hcd.c           |  2 +-
 drivers/usb/core/hcd.h           |  8 ++++----
 drivers/usb/core/message.c       |  2 +-
 drivers/usb/core/urb.c           |  4 ++--
 drivers/usb/core/usb.c           |  2 +-
 drivers/usb/gadget/dummy_hcd.c   |  9 +++++----
 drivers/usb/gadget/ether.c       | 18 +++++++++---------
 drivers/usb/gadget/goku_udc.c    |  6 +++---
 drivers/usb/gadget/lh7a40x_udc.c |  6 +++---
 drivers/usb/gadget/net2280.c     |  6 +++---
 drivers/usb/gadget/omap_udc.c    |  6 +++---
 drivers/usb/gadget/pxa2xx_udc.c  |  6 +++---
 drivers/usb/gadget/zero.c        |  8 ++++----
 drivers/usb/host/ehci-hcd.c      |  2 +-
 drivers/usb/host/ehci-q.c        |  2 +-
 drivers/usb/host/ehci-sched.c    | 19 +++++++++++--------
 drivers/usb/host/hc_crisv10.c    | 10 ++++++----
 drivers/usb/host/isp116x-hcd.c   |  4 ++--
 drivers/usb/host/ohci-hcd.c      |  2 +-
 drivers/usb/host/ohci-mem.c      |  4 ++--
 drivers/usb/host/sl811-hcd.c     |  2 +-
 drivers/usb/host/uhci-q.c        |  2 +-
 drivers/usb/net/kaweth.c         |  4 ++--
 include/linux/usb.h              |  8 ++++----
 include/linux/usb_gadget.h       | 12 ++++++------
 26 files changed, 81 insertions(+), 75 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/usb/core/buffer.c b/drivers/usb/core/buffer.c
index b7827df21f48..fc15b4acc8af 100644
--- a/drivers/usb/core/buffer.c
+++ b/drivers/usb/core/buffer.c
@@ -106,7 +106,7 @@ void hcd_buffer_destroy (struct usb_hcd *hcd)
 void *hcd_buffer_alloc (
 	struct usb_bus 		*bus,
 	size_t			size,
-	int			mem_flags,
+	unsigned		mem_flags,
 	dma_addr_t		*dma
 )
 {
diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index 83e732a0d64a..8616356f55e8 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -1112,7 +1112,7 @@ static void urb_unlink (struct urb *urb)
  * expects usb_submit_urb() to have sanity checked and conditioned all
  * inputs in the urb
  */
-static int hcd_submit_urb (struct urb *urb, int mem_flags)
+static int hcd_submit_urb (struct urb *urb, unsigned mem_flags)
 {
 	int			status;
 	struct usb_hcd		*hcd = urb->dev->bus->hcpriv;
diff --git a/drivers/usb/core/hcd.h b/drivers/usb/core/hcd.h
index 8dc13cde2f73..67db4a999b93 100644
--- a/drivers/usb/core/hcd.h
+++ b/drivers/usb/core/hcd.h
@@ -142,12 +142,12 @@ struct hcd_timeout {	/* timeouts we allocate */
 
 struct usb_operations {
 	int (*get_frame_number) (struct usb_device *usb_dev);
-	int (*submit_urb) (struct urb *urb, int mem_flags);
+	int (*submit_urb) (struct urb *urb, unsigned mem_flags);
 	int (*unlink_urb) (struct urb *urb, int status);
 
 	/* allocate dma-consistent buffer for URB_DMA_NOMAPPING */
 	void *(*buffer_alloc)(struct usb_bus *bus, size_t size,
-			int mem_flags,
+			unsigned mem_flags,
 			dma_addr_t *dma);
 	void (*buffer_free)(struct usb_bus *bus, size_t size,
 			void *addr, dma_addr_t dma);
@@ -200,7 +200,7 @@ struct hc_driver {
 	int	(*urb_enqueue) (struct usb_hcd *hcd,
 					struct usb_host_endpoint *ep,
 					struct urb *urb,
-					int mem_flags);
+					unsigned mem_flags);
 	int	(*urb_dequeue) (struct usb_hcd *hcd, struct urb *urb);
 
 	/* hw synch, freeing endpoint resources that urb_dequeue can't */
@@ -247,7 +247,7 @@ int hcd_buffer_create (struct usb_hcd *hcd);
 void hcd_buffer_destroy (struct usb_hcd *hcd);
 
 void *hcd_buffer_alloc (struct usb_bus *bus, size_t size,
-	int mem_flags, dma_addr_t *dma);
+	unsigned mem_flags, dma_addr_t *dma);
 void hcd_buffer_free (struct usb_bus *bus, size_t size,
 	void *addr, dma_addr_t dma);
 
diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index f50aaf25c98e..a428ef479bd7 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -320,7 +320,7 @@ int usb_sg_init (
 	struct scatterlist	*sg,
 	int			nents,
 	size_t			length,
-	int			mem_flags
+	unsigned		mem_flags
 )
 {
 	int			i;
diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c
index 0faf18d511de..c0feee25ff0a 100644
--- a/drivers/usb/core/urb.c
+++ b/drivers/usb/core/urb.c
@@ -60,7 +60,7 @@ void usb_init_urb(struct urb *urb)
  *
  * The driver must call usb_free_urb() when it is finished with the urb.
  */
-struct urb *usb_alloc_urb(int iso_packets, int mem_flags)
+struct urb *usb_alloc_urb(int iso_packets, unsigned mem_flags)
 {
 	struct urb *urb;
 
@@ -224,7 +224,7 @@ struct urb * usb_get_urb(struct urb *urb)
  *      GFP_NOIO, unless b) or c) apply
  *
  */
-int usb_submit_urb(struct urb *urb, int mem_flags)
+int usb_submit_urb(struct urb *urb, unsigned mem_flags)
 {
 	int			pipe, temp, max;
 	struct usb_device	*dev;
diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
index a3c42203213a..7713a605fce7 100644
--- a/drivers/usb/core/usb.c
+++ b/drivers/usb/core/usb.c
@@ -1129,7 +1129,7 @@ int __usb_get_extra_descriptor(char *buffer, unsigned size,
 void *usb_buffer_alloc (
 	struct usb_device *dev,
 	size_t size,
-	int mem_flags,
+	unsigned mem_flags,
 	dma_addr_t *dma
 )
 {
diff --git a/drivers/usb/gadget/dummy_hcd.c b/drivers/usb/gadget/dummy_hcd.c
index 4d692670f288..583db7c38cf1 100644
--- a/drivers/usb/gadget/dummy_hcd.c
+++ b/drivers/usb/gadget/dummy_hcd.c
@@ -470,7 +470,7 @@ static int dummy_disable (struct usb_ep *_ep)
 }
 
 static struct usb_request *
-dummy_alloc_request (struct usb_ep *_ep, int mem_flags)
+dummy_alloc_request (struct usb_ep *_ep, unsigned mem_flags)
 {
 	struct dummy_ep		*ep;
 	struct dummy_request	*req;
@@ -507,7 +507,7 @@ dummy_alloc_buffer (
 	struct usb_ep *_ep,
 	unsigned bytes,
 	dma_addr_t *dma,
-	int mem_flags
+	unsigned mem_flags
 ) {
 	char			*retval;
 	struct dummy_ep		*ep;
@@ -540,7 +540,8 @@ fifo_complete (struct usb_ep *ep, struct usb_request *req)
 }
 
 static int
-dummy_queue (struct usb_ep *_ep, struct usb_request *_req, int mem_flags)
+dummy_queue (struct usb_ep *_ep, struct usb_request *_req,
+		unsigned mem_flags)
 {
 	struct dummy_ep		*ep;
 	struct dummy_request	*req;
@@ -998,7 +999,7 @@ static int dummy_urb_enqueue (
 	struct usb_hcd			*hcd,
 	struct usb_host_endpoint	*ep,
 	struct urb			*urb,
-	int				mem_flags
+	unsigned			mem_flags
 ) {
 	struct dummy	*dum;
 	struct urbp	*urbp;
diff --git a/drivers/usb/gadget/ether.c b/drivers/usb/gadget/ether.c
index 5bb53ae88969..00a5d2566265 100644
--- a/drivers/usb/gadget/ether.c
+++ b/drivers/usb/gadget/ether.c
@@ -945,11 +945,11 @@ config_buf (enum usb_device_speed speed,
 
 /*-------------------------------------------------------------------------*/
 
-static void eth_start (struct eth_dev *dev, int gfp_flags);
-static int alloc_requests (struct eth_dev *dev, unsigned n, int gfp_flags);
+static void eth_start (struct eth_dev *dev, unsigned gfp_flags);
+static int alloc_requests (struct eth_dev *dev, unsigned n, unsigned gfp_flags);
 
 static int
-set_ether_config (struct eth_dev *dev, int gfp_flags)
+set_ether_config (struct eth_dev *dev, unsigned gfp_flags)
 {
 	int					result = 0;
 	struct usb_gadget			*gadget = dev->gadget;
@@ -1079,7 +1079,7 @@ static void eth_reset_config (struct eth_dev *dev)
  * that returns config descriptors, and altsetting code.
  */
 static int
-eth_set_config (struct eth_dev *dev, unsigned number, int gfp_flags)
+eth_set_config (struct eth_dev *dev, unsigned number, unsigned gfp_flags)
 {
 	int			result = 0;
 	struct usb_gadget	*gadget = dev->gadget;
@@ -1596,7 +1596,7 @@ static void defer_kevent (struct eth_dev *dev, int flag)
 static void rx_complete (struct usb_ep *ep, struct usb_request *req);
 
 static int
-rx_submit (struct eth_dev *dev, struct usb_request *req, int gfp_flags)
+rx_submit (struct eth_dev *dev, struct usb_request *req, unsigned gfp_flags)
 {
 	struct sk_buff		*skb;
 	int			retval = -ENOMEM;
@@ -1722,7 +1722,7 @@ clean:
 }
 
 static int prealloc (struct list_head *list, struct usb_ep *ep,
-			unsigned n, int gfp_flags)
+			unsigned n, unsigned gfp_flags)
 {
 	unsigned		i;
 	struct usb_request	*req;
@@ -1761,7 +1761,7 @@ extra:
 	return 0;
 }
 
-static int alloc_requests (struct eth_dev *dev, unsigned n, int gfp_flags)
+static int alloc_requests (struct eth_dev *dev, unsigned n, unsigned gfp_flags)
 {
 	int status;
 
@@ -1777,7 +1777,7 @@ fail:
 	return status;
 }
 
-static void rx_fill (struct eth_dev *dev, int gfp_flags)
+static void rx_fill (struct eth_dev *dev, unsigned gfp_flags)
 {
 	struct usb_request	*req;
 	unsigned long		flags;
@@ -2022,7 +2022,7 @@ static int rndis_control_ack (struct net_device *net)
 
 #endif	/* RNDIS */
 
-static void eth_start (struct eth_dev *dev, int gfp_flags)
+static void eth_start (struct eth_dev *dev, unsigned gfp_flags)
 {
 	DEBUG (dev, "%s\n", __FUNCTION__);
 
diff --git a/drivers/usb/gadget/goku_udc.c b/drivers/usb/gadget/goku_udc.c
index ed773a9111de..eaab26f4ed37 100644
--- a/drivers/usb/gadget/goku_udc.c
+++ b/drivers/usb/gadget/goku_udc.c
@@ -269,7 +269,7 @@ static int goku_ep_disable(struct usb_ep *_ep)
 /*-------------------------------------------------------------------------*/
 
 static struct usb_request *
-goku_alloc_request(struct usb_ep *_ep, int gfp_flags)
+goku_alloc_request(struct usb_ep *_ep, unsigned gfp_flags)
 {
 	struct goku_request	*req;
 
@@ -327,7 +327,7 @@ goku_free_request(struct usb_ep *_ep, struct usb_request *_req)
  */
 static void *
 goku_alloc_buffer(struct usb_ep *_ep, unsigned bytes,
-			dma_addr_t *dma, int  gfp_flags)
+			dma_addr_t *dma, unsigned gfp_flags)
 {
 	void		*retval;
 	struct goku_ep	*ep;
@@ -789,7 +789,7 @@ finished:
 /*-------------------------------------------------------------------------*/
 
 static int
-goku_queue(struct usb_ep *_ep, struct usb_request *_req, int gfp_flags)
+goku_queue(struct usb_ep *_ep, struct usb_request *_req, unsigned gfp_flags)
 {
 	struct goku_request	*req;
 	struct goku_ep		*ep;
diff --git a/drivers/usb/gadget/lh7a40x_udc.c b/drivers/usb/gadget/lh7a40x_udc.c
index df75ab65a5ec..4842577789c9 100644
--- a/drivers/usb/gadget/lh7a40x_udc.c
+++ b/drivers/usb/gadget/lh7a40x_udc.c
@@ -1106,7 +1106,7 @@ static int lh7a40x_ep_disable(struct usb_ep *_ep)
 }
 
 static struct usb_request *lh7a40x_alloc_request(struct usb_ep *ep,
-						 int gfp_flags)
+						 unsigned gfp_flags)
 {
 	struct lh7a40x_request *req;
 
@@ -1134,7 +1134,7 @@ static void lh7a40x_free_request(struct usb_ep *ep, struct usb_request *_req)
 }
 
 static void *lh7a40x_alloc_buffer(struct usb_ep *ep, unsigned bytes,
-				  dma_addr_t * dma, int gfp_flags)
+				  dma_addr_t * dma, unsigned gfp_flags)
 {
 	char *retval;
 
@@ -1158,7 +1158,7 @@ static void lh7a40x_free_buffer(struct usb_ep *ep, void *buf, dma_addr_t dma,
  *  NOTE: Sets INDEX register
  */
 static int lh7a40x_queue(struct usb_ep *_ep, struct usb_request *_req,
-			 int gfp_flags)
+			 unsigned gfp_flags)
 {
 	struct lh7a40x_request *req;
 	struct lh7a40x_ep *ep;
diff --git a/drivers/usb/gadget/net2280.c b/drivers/usb/gadget/net2280.c
index 13a3dbc9949b..234a1a97b84e 100644
--- a/drivers/usb/gadget/net2280.c
+++ b/drivers/usb/gadget/net2280.c
@@ -376,7 +376,7 @@ static int net2280_disable (struct usb_ep *_ep)
 /*-------------------------------------------------------------------------*/
 
 static struct usb_request *
-net2280_alloc_request (struct usb_ep *_ep, int gfp_flags)
+net2280_alloc_request (struct usb_ep *_ep, unsigned gfp_flags)
 {
 	struct net2280_ep	*ep;
 	struct net2280_request	*req;
@@ -463,7 +463,7 @@ net2280_alloc_buffer (
 	struct usb_ep		*_ep,
 	unsigned		bytes,
 	dma_addr_t		*dma,
-	int			gfp_flags
+	unsigned		gfp_flags
 )
 {
 	void			*retval;
@@ -897,7 +897,7 @@ done (struct net2280_ep *ep, struct net2280_request *req, int status)
 /*-------------------------------------------------------------------------*/
 
 static int
-net2280_queue (struct usb_ep *_ep, struct usb_request *_req, int gfp_flags)
+net2280_queue (struct usb_ep *_ep, struct usb_request *_req, unsigned gfp_flags)
 {
 	struct net2280_request	*req;
 	struct net2280_ep	*ep;
diff --git a/drivers/usb/gadget/omap_udc.c b/drivers/usb/gadget/omap_udc.c
index a2b812af6e66..c906d675ef4b 100644
--- a/drivers/usb/gadget/omap_udc.c
+++ b/drivers/usb/gadget/omap_udc.c
@@ -269,7 +269,7 @@ static int omap_ep_disable(struct usb_ep *_ep)
 /*-------------------------------------------------------------------------*/
 
 static struct usb_request *
-omap_alloc_request(struct usb_ep *ep, int gfp_flags)
+omap_alloc_request(struct usb_ep *ep, unsigned gfp_flags)
 {
 	struct omap_req	*req;
 
@@ -298,7 +298,7 @@ omap_alloc_buffer(
 	struct usb_ep	*_ep,
 	unsigned	bytes,
 	dma_addr_t	*dma,
-	int		gfp_flags
+	unsigned	gfp_flags
 )
 {
 	void		*retval;
@@ -937,7 +937,7 @@ static void dma_channel_release(struct omap_ep *ep)
 /*-------------------------------------------------------------------------*/
 
 static int
-omap_ep_queue(struct usb_ep *_ep, struct usb_request *_req, int gfp_flags)
+omap_ep_queue(struct usb_ep *_ep, struct usb_request *_req, unsigned gfp_flags)
 {
 	struct omap_ep	*ep = container_of(_ep, struct omap_ep, ep);
 	struct omap_req	*req = container_of(_req, struct omap_req, req);
diff --git a/drivers/usb/gadget/pxa2xx_udc.c b/drivers/usb/gadget/pxa2xx_udc.c
index 6a0b957af335..1507738337c4 100644
--- a/drivers/usb/gadget/pxa2xx_udc.c
+++ b/drivers/usb/gadget/pxa2xx_udc.c
@@ -332,7 +332,7 @@ static int pxa2xx_ep_disable (struct usb_ep *_ep)
  * 	pxa2xx_ep_alloc_request - allocate a request data structure
  */
 static struct usb_request *
-pxa2xx_ep_alloc_request (struct usb_ep *_ep, int gfp_flags)
+pxa2xx_ep_alloc_request (struct usb_ep *_ep, unsigned gfp_flags)
 {
 	struct pxa2xx_request *req;
 
@@ -367,7 +367,7 @@ pxa2xx_ep_free_request (struct usb_ep *_ep, struct usb_request *_req)
  */
 static void *
 pxa2xx_ep_alloc_buffer(struct usb_ep *_ep, unsigned bytes,
-	dma_addr_t *dma, int gfp_flags)
+	dma_addr_t *dma, unsigned gfp_flags)
 {
 	char			*retval;
 
@@ -874,7 +874,7 @@ done:
 /*-------------------------------------------------------------------------*/
 
 static int
-pxa2xx_ep_queue(struct usb_ep *_ep, struct usb_request *_req, int gfp_flags)
+pxa2xx_ep_queue(struct usb_ep *_ep, struct usb_request *_req, unsigned gfp_flags)
 {
 	struct pxa2xx_request	*req;
 	struct pxa2xx_ep	*ep;
diff --git a/drivers/usb/gadget/zero.c b/drivers/usb/gadget/zero.c
index a6e035e24479..bb9b2d94eed5 100644
--- a/drivers/usb/gadget/zero.c
+++ b/drivers/usb/gadget/zero.c
@@ -612,7 +612,7 @@ static void source_sink_complete (struct usb_ep *ep, struct usb_request *req)
 }
 
 static struct usb_request *
-source_sink_start_ep (struct usb_ep *ep, int gfp_flags)
+source_sink_start_ep (struct usb_ep *ep, unsigned gfp_flags)
 {
 	struct usb_request	*req;
 	int			status;
@@ -640,7 +640,7 @@ source_sink_start_ep (struct usb_ep *ep, int gfp_flags)
 }
 
 static int
-set_source_sink_config (struct zero_dev *dev, int gfp_flags)
+set_source_sink_config (struct zero_dev *dev, unsigned gfp_flags)
 {
 	int			result = 0;
 	struct usb_ep		*ep;
@@ -744,7 +744,7 @@ static void loopback_complete (struct usb_ep *ep, struct usb_request *req)
 }
 
 static int
-set_loopback_config (struct zero_dev *dev, int gfp_flags)
+set_loopback_config (struct zero_dev *dev, unsigned gfp_flags)
 {
 	int			result = 0;
 	struct usb_ep		*ep;
@@ -845,7 +845,7 @@ static void zero_reset_config (struct zero_dev *dev)
  * by limiting configuration choices (like the pxa2xx).
  */
 static int
-zero_set_config (struct zero_dev *dev, unsigned number, int gfp_flags)
+zero_set_config (struct zero_dev *dev, unsigned number, unsigned gfp_flags)
 {
 	int			result = 0;
 	struct usb_gadget	*gadget = dev->gadget;
diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
index 35248a37b717..149b13fc0a71 100644
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c
@@ -960,7 +960,7 @@ static int ehci_urb_enqueue (
 	struct usb_hcd	*hcd,
 	struct usb_host_endpoint *ep,
 	struct urb	*urb,
-	int		mem_flags
+	unsigned	mem_flags
 ) {
 	struct ehci_hcd		*ehci = hcd_to_ehci (hcd);
 	struct list_head	qtd_list;
diff --git a/drivers/usb/host/ehci-q.c b/drivers/usb/host/ehci-q.c
index 45d89a7083b1..d74b2d68a50e 100644
--- a/drivers/usb/host/ehci-q.c
+++ b/drivers/usb/host/ehci-q.c
@@ -898,7 +898,7 @@ submit_async (
 	struct usb_host_endpoint *ep,
 	struct urb		*urb,
 	struct list_head	*qtd_list,
-	int			mem_flags
+	unsigned		mem_flags
 ) {
 	struct ehci_qtd		*qtd;
 	int			epnum;
diff --git a/drivers/usb/host/ehci-sched.c b/drivers/usb/host/ehci-sched.c
index c2104cad4033..9af4f64532a9 100644
--- a/drivers/usb/host/ehci-sched.c
+++ b/drivers/usb/host/ehci-sched.c
@@ -588,7 +588,7 @@ static int intr_submit (
 	struct usb_host_endpoint *ep,
 	struct urb		*urb,
 	struct list_head	*qtd_list,
-	int			mem_flags
+	unsigned		mem_flags
 ) {
 	unsigned		epnum;
 	unsigned long		flags;
@@ -633,7 +633,7 @@ done:
 /* ehci_iso_stream ops work with both ITD and SITD */
 
 static struct ehci_iso_stream *
-iso_stream_alloc (int mem_flags)
+iso_stream_alloc (unsigned mem_flags)
 {
 	struct ehci_iso_stream *stream;
 
@@ -846,7 +846,7 @@ iso_stream_find (struct ehci_hcd *ehci, struct urb *urb)
 /* ehci_iso_sched ops can be ITD-only or SITD-only */
 
 static struct ehci_iso_sched *
-iso_sched_alloc (unsigned packets, int mem_flags)
+iso_sched_alloc (unsigned packets, unsigned mem_flags)
 {
 	struct ehci_iso_sched	*iso_sched;
 	int			size = sizeof *iso_sched;
@@ -919,7 +919,7 @@ itd_urb_transaction (
 	struct ehci_iso_stream	*stream,
 	struct ehci_hcd		*ehci,
 	struct urb		*urb,
-	int			mem_flags
+	unsigned		mem_flags
 )
 {
 	struct ehci_itd		*itd;
@@ -1412,7 +1412,8 @@ itd_complete (
 
 /*-------------------------------------------------------------------------*/
 
-static int itd_submit (struct ehci_hcd *ehci, struct urb *urb, int mem_flags)
+static int itd_submit (struct ehci_hcd *ehci, struct urb *urb,
+	unsigned mem_flags)
 {
 	int			status = -EINVAL;
 	unsigned long		flags;
@@ -1523,7 +1524,7 @@ sitd_urb_transaction (
 	struct ehci_iso_stream	*stream,
 	struct ehci_hcd		*ehci,
 	struct urb		*urb,
-	int			mem_flags
+	unsigned		mem_flags
 )
 {
 	struct ehci_sitd	*sitd;
@@ -1772,7 +1773,8 @@ sitd_complete (
 }
 
 
-static int sitd_submit (struct ehci_hcd *ehci, struct urb *urb, int mem_flags)
+static int sitd_submit (struct ehci_hcd *ehci, struct urb *urb,
+	unsigned mem_flags)
 {
 	int			status = -EINVAL;
 	unsigned long		flags;
@@ -1822,7 +1824,8 @@ done:
 #else
 
 static inline int
-sitd_submit (struct ehci_hcd *ehci, struct urb *urb, int mem_flags)
+sitd_submit (struct ehci_hcd *ehci, struct urb *urb,
+	unsigned mem_flags)
 {
 	ehci_dbg (ehci, "split iso support is disabled\n");
 	return -ENOSYS;
diff --git a/drivers/usb/host/hc_crisv10.c b/drivers/usb/host/hc_crisv10.c
index d9883d774d3a..81f8f6b7fdce 100644
--- a/drivers/usb/host/hc_crisv10.c
+++ b/drivers/usb/host/hc_crisv10.c
@@ -463,7 +463,8 @@ static void etrax_usb_free_epid(int epid);
 
 static int etrax_remove_from_sb_list(struct urb *urb);
 
-static void* etrax_usb_buffer_alloc(struct usb_bus* bus, size_t size, int mem_flags, dma_addr_t *dma);
+static void* etrax_usb_buffer_alloc(struct usb_bus* bus, size_t size,
+	unsigned mem_flags, dma_addr_t *dma);
 static void etrax_usb_buffer_free(struct usb_bus *bus, size_t size, void *addr, dma_addr_t dma);
 
 static void etrax_usb_add_to_bulk_sb_list(struct urb *urb, int epid);
@@ -476,7 +477,7 @@ static int etrax_usb_submit_ctrl_urb(struct urb *urb);
 static int etrax_usb_submit_intr_urb(struct urb *urb);
 static int etrax_usb_submit_isoc_urb(struct urb *urb);
 
-static int etrax_usb_submit_urb(struct urb *urb, int mem_flags);
+static int etrax_usb_submit_urb(struct urb *urb, unsigned mem_flags);
 static int etrax_usb_unlink_urb(struct urb *urb, int status);
 static int etrax_usb_get_frame_number(struct usb_device *usb_dev);
 
@@ -1262,7 +1263,7 @@ static int etrax_usb_allocate_epid(void)
 	return -1;
 }
 
-static int etrax_usb_submit_urb(struct urb *urb, int mem_flags)
+static int etrax_usb_submit_urb(struct urb *urb, unsigned mem_flags)
 {
 	etrax_hc_t *hc;
 	int ret = -EINVAL;
@@ -4277,7 +4278,8 @@ etrax_usb_bulk_eot_timer_func(unsigned long dummy)
 }
 
 static void*
-etrax_usb_buffer_alloc(struct usb_bus* bus, size_t size, int mem_flags, dma_addr_t *dma)
+etrax_usb_buffer_alloc(struct usb_bus* bus, size_t size,
+	unsigned mem_flags, dma_addr_t *dma)
 {
   return kmalloc(size, mem_flags);
 }
diff --git a/drivers/usb/host/isp116x-hcd.c b/drivers/usb/host/isp116x-hcd.c
index 3f2cea21efc5..50b1970fe6b6 100644
--- a/drivers/usb/host/isp116x-hcd.c
+++ b/drivers/usb/host/isp116x-hcd.c
@@ -693,7 +693,7 @@ static int balance(struct isp116x *isp116x, u16 period, u16 load)
 
 static int isp116x_urb_enqueue(struct usb_hcd *hcd,
 			       struct usb_host_endpoint *hep, struct urb *urb,
-			       int mem_flags)
+			       unsigned mem_flags)
 {
 	struct isp116x *isp116x = hcd_to_isp116x(hcd);
 	struct usb_device *udev = urb->dev;
@@ -715,7 +715,7 @@ static int isp116x_urb_enqueue(struct usb_hcd *hcd,
 	}
 	/* avoid all allocations within spinlocks: request or endpoint */
 	if (!hep->hcpriv) {
-		ep = kcalloc(1, sizeof *ep, (__force unsigned)mem_flags);
+		ep = kcalloc(1, sizeof *ep, mem_flags);
 		if (!ep)
 			return -ENOMEM;
 	}
diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c
index 13cd2177b557..0375097850ee 100644
--- a/drivers/usb/host/ohci-hcd.c
+++ b/drivers/usb/host/ohci-hcd.c
@@ -180,7 +180,7 @@ static int ohci_urb_enqueue (
 	struct usb_hcd	*hcd,
 	struct usb_host_endpoint *ep,
 	struct urb	*urb,
-	int		mem_flags
+	unsigned	mem_flags
 ) {
 	struct ohci_hcd	*ohci = hcd_to_ohci (hcd);
 	struct ed	*ed;
diff --git a/drivers/usb/host/ohci-mem.c b/drivers/usb/host/ohci-mem.c
index 23735a36af00..fd3c4d3714bd 100644
--- a/drivers/usb/host/ohci-mem.c
+++ b/drivers/usb/host/ohci-mem.c
@@ -84,7 +84,7 @@ dma_to_td (struct ohci_hcd *hc, dma_addr_t td_dma)
 
 /* TDs ... */
 static struct td *
-td_alloc (struct ohci_hcd *hc, int mem_flags)
+td_alloc (struct ohci_hcd *hc, unsigned mem_flags)
 {
 	dma_addr_t	dma;
 	struct td	*td;
@@ -118,7 +118,7 @@ td_free (struct ohci_hcd *hc, struct td *td)
 
 /* EDs ... */
 static struct ed *
-ed_alloc (struct ohci_hcd *hc, int mem_flags)
+ed_alloc (struct ohci_hcd *hc, unsigned mem_flags)
 {
 	dma_addr_t	dma;
 	struct ed	*ed;
diff --git a/drivers/usb/host/sl811-hcd.c b/drivers/usb/host/sl811-hcd.c
index 6c3f910bc307..7a890a65f55d 100644
--- a/drivers/usb/host/sl811-hcd.c
+++ b/drivers/usb/host/sl811-hcd.c
@@ -815,7 +815,7 @@ static int sl811h_urb_enqueue(
 	struct usb_hcd		*hcd,
 	struct usb_host_endpoint *hep,
 	struct urb		*urb,
-	int			mem_flags
+	unsigned		mem_flags
 ) {
 	struct sl811		*sl811 = hcd_to_sl811(hcd);
 	struct usb_device	*udev = urb->dev;
diff --git a/drivers/usb/host/uhci-q.c b/drivers/usb/host/uhci-q.c
index 5f18084a116d..bbb36cd6ed61 100644
--- a/drivers/usb/host/uhci-q.c
+++ b/drivers/usb/host/uhci-q.c
@@ -1164,7 +1164,7 @@ static struct urb *uhci_find_urb_ep(struct uhci_hcd *uhci, struct urb *urb)
 
 static int uhci_urb_enqueue(struct usb_hcd *hcd,
 		struct usb_host_endpoint *ep,
-		struct urb *urb, int mem_flags)
+		struct urb *urb, unsigned mem_flags)
 {
 	int ret;
 	struct uhci_hcd *uhci = hcd_to_uhci(hcd);
diff --git a/drivers/usb/net/kaweth.c b/drivers/usb/net/kaweth.c
index fd6ff4cb2c62..7ffa99b9760f 100644
--- a/drivers/usb/net/kaweth.c
+++ b/drivers/usb/net/kaweth.c
@@ -477,7 +477,7 @@ static int kaweth_reset(struct kaweth_device *kaweth)
 }
 
 static void kaweth_usb_receive(struct urb *, struct pt_regs *regs);
-static int kaweth_resubmit_rx_urb(struct kaweth_device *, int);
+static int kaweth_resubmit_rx_urb(struct kaweth_device *, unsigned);
 
 /****************************************************************
 	int_callback
@@ -550,7 +550,7 @@ static void kaweth_resubmit_tl(void *d)
  *     kaweth_resubmit_rx_urb
  ****************************************************************/
 static int kaweth_resubmit_rx_urb(struct kaweth_device *kaweth,
-						int mem_flags)
+						unsigned mem_flags)
 {
 	int result;
 
diff --git a/include/linux/usb.h b/include/linux/usb.h
index eb282b581546..724637792996 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -938,17 +938,17 @@ static inline void usb_fill_int_urb (struct urb *urb,
 }
 
 extern void usb_init_urb(struct urb *urb);
-extern struct urb *usb_alloc_urb(int iso_packets, int mem_flags);
+extern struct urb *usb_alloc_urb(int iso_packets, unsigned mem_flags);
 extern void usb_free_urb(struct urb *urb);
 #define usb_put_urb usb_free_urb
 extern struct urb *usb_get_urb(struct urb *urb);
-extern int usb_submit_urb(struct urb *urb, int mem_flags);
+extern int usb_submit_urb(struct urb *urb, unsigned mem_flags);
 extern int usb_unlink_urb(struct urb *urb);
 extern void usb_kill_urb(struct urb *urb);
 
 #define HAVE_USB_BUFFERS
 void *usb_buffer_alloc (struct usb_device *dev, size_t size,
-	int mem_flags, dma_addr_t *dma);
+	unsigned mem_flags, dma_addr_t *dma);
 void usb_buffer_free (struct usb_device *dev, size_t size,
 	void *addr, dma_addr_t dma);
 
@@ -1055,7 +1055,7 @@ int usb_sg_init (
 	struct scatterlist	*sg,
 	int			nents,
 	size_t			length,
-	int			mem_flags
+	unsigned		mem_flags
 );
 void usb_sg_cancel (struct usb_sg_request *io);
 void usb_sg_wait (struct usb_sg_request *io);
diff --git a/include/linux/usb_gadget.h b/include/linux/usb_gadget.h
index b00f127cb447..71e608607324 100644
--- a/include/linux/usb_gadget.h
+++ b/include/linux/usb_gadget.h
@@ -107,18 +107,18 @@ struct usb_ep_ops {
 	int (*disable) (struct usb_ep *ep);
 
 	struct usb_request *(*alloc_request) (struct usb_ep *ep,
-		int gfp_flags);
+		unsigned gfp_flags);
 	void (*free_request) (struct usb_ep *ep, struct usb_request *req);
 
 	void *(*alloc_buffer) (struct usb_ep *ep, unsigned bytes,
-		dma_addr_t *dma, int gfp_flags);
+		dma_addr_t *dma, unsigned gfp_flags);
 	void (*free_buffer) (struct usb_ep *ep, void *buf, dma_addr_t dma,
 		unsigned bytes);
 	// NOTE:  on 2.6, drivers may also use dma_map() and
 	// dma_sync_single_*() to directly manage dma overhead. 
 
 	int (*queue) (struct usb_ep *ep, struct usb_request *req,
-		int gfp_flags);
+		unsigned gfp_flags);
 	int (*dequeue) (struct usb_ep *ep, struct usb_request *req);
 
 	int (*set_halt) (struct usb_ep *ep, int value);
@@ -214,7 +214,7 @@ usb_ep_disable (struct usb_ep *ep)
  * Returns the request, or null if one could not be allocated.
  */
 static inline struct usb_request *
-usb_ep_alloc_request (struct usb_ep *ep, int gfp_flags)
+usb_ep_alloc_request (struct usb_ep *ep, unsigned gfp_flags)
 {
 	return ep->ops->alloc_request (ep, gfp_flags);
 }
@@ -254,7 +254,7 @@ usb_ep_free_request (struct usb_ep *ep, struct usb_request *req)
  */
 static inline void *
 usb_ep_alloc_buffer (struct usb_ep *ep, unsigned len, dma_addr_t *dma,
-	int gfp_flags)
+	unsigned gfp_flags)
 {
 	return ep->ops->alloc_buffer (ep, len, dma, gfp_flags);
 }
@@ -330,7 +330,7 @@ usb_ep_free_buffer (struct usb_ep *ep, void *buf, dma_addr_t dma, unsigned len)
  * reported when the usb peripheral is disconnected.
  */
 static inline int
-usb_ep_queue (struct usb_ep *ep, struct usb_request *req, int gfp_flags)
+usb_ep_queue (struct usb_ep *ep, struct usb_request *req, unsigned gfp_flags)
 {
 	return ep->ops->queue (ep, req, gfp_flags);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 00ab997dd24fff82900665449f859e23a78ad5f4 Mon Sep 17 00:00:00 2001
From: "david-b@pacbell.net" <david-b@pacbell.net>
Date: Wed, 29 Jun 2005 07:04:14 -0700
Subject: [PATCH] USB: another cdc descriptor

This adds another CDC descriptor type to <linux/usb_cdc.h>; the main claim
to fame for this is that some Motorola phones include it.  It's not currently
needed by any driver code; included for completeness.

Signed-off-by: David Brownell <dbrownell@users.sourceforge.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 include/linux/usb_cdc.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/usb_cdc.h b/include/linux/usb_cdc.h
index f22d6beecc73..ba617c372455 100644
--- a/include/linux/usb_cdc.h
+++ b/include/linux/usb_cdc.h
@@ -34,6 +34,7 @@
 #define USB_CDC_ACM_TYPE		0x02		/* acm_descriptor */
 #define USB_CDC_UNION_TYPE		0x06		/* union_desc */
 #define USB_CDC_COUNTRY_TYPE		0x07
+#define USB_CDC_NETWORK_TERMINAL_TYPE	0x0a		/* network_terminal_desc */
 #define USB_CDC_ETHERNET_TYPE		0x0f		/* ether_desc */
 #define USB_CDC_WHCM_TYPE		0x11
 #define USB_CDC_MDLM_TYPE		0x12		/* mdlm_desc */
@@ -83,6 +84,18 @@ struct usb_cdc_union_desc {
 	/* ... and there could be other slave interfaces */
 } __attribute__ ((packed));
 
+/* "Network Channel Terminal Functional Descriptor" from CDC spec 5.2.3.11 */
+struct usb_cdc_network_terminal_desc {
+	__u8	bLength;
+	__u8	bDescriptorType;
+	__u8	bDescriptorSubType;
+
+	__u8	bEntityId;
+	__u8	iName;
+	__u8	bChannelIndex;
+	__u8	bPhysicalInterface;
+} __attribute__ ((packed));
+
 /* "Ethernet Networking Functional Descriptor" from CDC spec 5.2.3.16 */
 struct usb_cdc_ether_desc {
 	__u8	bLength;
-- 
cgit v1.2.3-59-g8ed1b


From ab611487d8ada506e511d2b8f22fb8e7be9939b9 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 12 Jul 2005 12:08:43 -0700
Subject: [NET]: __be'ify *_type_trans()

Change the return type of the *_type_trans() helpers from
'unsigned short' to '__be16'.

tr_type_trans() and hippi_type_trans() are left as-is.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/myri_sbus.c      | 2 +-
 drivers/net/plip.c           | 2 +-
 drivers/net/wan/farsync.c    | 3 +--
 drivers/net/wan/hdlc_cisco.c | 3 +--
 drivers/net/wan/hdlc_ppp.c   | 3 +--
 drivers/net/wan/hdlc_raw.c   | 3 +--
 drivers/s390/net/qeth_main.c | 2 +-
 include/linux/etherdevice.h  | 2 +-
 include/linux/fddidevice.h   | 2 +-
 include/linux/hdlc.h         | 4 ++--
 include/linux/wanrouter.h    | 3 +--
 include/net/x25device.h      | 3 +--
 net/802/fddi.c               | 4 ++--
 net/atm/br2684.c             | 3 +--
 net/ethernet/eth.c           | 2 +-
 net/wanrouter/wanmain.c      | 6 +++---
 16 files changed, 20 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/myri_sbus.c b/drivers/net/myri_sbus.c
index aad5494c83cf..f0996ce5c268 100644
--- a/drivers/net/myri_sbus.c
+++ b/drivers/net/myri_sbus.c
@@ -369,7 +369,7 @@ static void myri_tx(struct myri_eth *mp, struct net_device *dev)
  * assume 802.3 if the type field is short enough to be a length.
  * This is normal practice and works for any 'now in use' protocol.
  */
-static unsigned short myri_type_trans(struct sk_buff *skb, struct net_device *dev)
+static __be16 myri_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ethhdr *eth;
 	unsigned char *rawp;
diff --git a/drivers/net/plip.c b/drivers/net/plip.c
index f4b62405d2e5..21537ee3a6a7 100644
--- a/drivers/net/plip.c
+++ b/drivers/net/plip.c
@@ -540,7 +540,7 @@ plip_receive(unsigned short nibble_timeout, struct net_device *dev,
  *	in far too many old systems not all even running Linux.
  */
  
-static unsigned short plip_type_trans(struct sk_buff *skb, struct net_device *dev)
+static __be16 plip_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ethhdr *eth;
 	unsigned char *rawp;
diff --git a/drivers/net/wan/farsync.c b/drivers/net/wan/farsync.c
index 7217d44e8854..2c83cca34b86 100644
--- a/drivers/net/wan/farsync.c
+++ b/drivers/net/wan/farsync.c
@@ -861,8 +861,7 @@ fst_tx_dma_complete(struct fst_card_info *card, struct fst_port_info *port,
 /*
  * Mark it for our own raw sockets interface
  */
-static unsigned short farsync_type_trans(struct sk_buff *skb,
-					 struct net_device *dev)
+static __be16 farsync_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	skb->dev = dev;
 	skb->mac.raw = skb->data;
diff --git a/drivers/net/wan/hdlc_cisco.c b/drivers/net/wan/hdlc_cisco.c
index 87496843681a..48c03c11cd9a 100644
--- a/drivers/net/wan/hdlc_cisco.c
+++ b/drivers/net/wan/hdlc_cisco.c
@@ -91,8 +91,7 @@ static void cisco_keepalive_send(struct net_device *dev, u32 type,
 
 
-static unsigned short cisco_type_trans(struct sk_buff *skb,
-				       struct net_device *dev)
+static __be16 cisco_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	hdlc_header *data = (hdlc_header*)skb->data;
 
diff --git a/drivers/net/wan/hdlc_ppp.c b/drivers/net/wan/hdlc_ppp.c
index 7cd6195a2e46..b81263eaede0 100644
--- a/drivers/net/wan/hdlc_ppp.c
+++ b/drivers/net/wan/hdlc_ppp.c
@@ -66,8 +66,7 @@ static void ppp_close(struct net_device *dev)
 
 
-static unsigned short ppp_type_trans(struct sk_buff *skb,
-				     struct net_device *dev)
+static __be16 ppp_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	return __constant_htons(ETH_P_WAN_PPP);
 }
diff --git a/drivers/net/wan/hdlc_raw.c b/drivers/net/wan/hdlc_raw.c
index c41fb70b6929..9456d31cb1c1 100644
--- a/drivers/net/wan/hdlc_raw.c
+++ b/drivers/net/wan/hdlc_raw.c
@@ -24,8 +24,7 @@
 #include <linux/hdlc.h>
 
 
-static unsigned short raw_type_trans(struct sk_buff *skb,
-				     struct net_device *dev)
+static __be16 raw_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	return __constant_htons(ETH_P_IP);
 }
diff --git a/drivers/s390/net/qeth_main.c b/drivers/s390/net/qeth_main.c
index 3cb88c770037..8f4d2999af8e 100644
--- a/drivers/s390/net/qeth_main.c
+++ b/drivers/s390/net/qeth_main.c
@@ -2210,7 +2210,7 @@ no_mem:
 	return NULL;
 }
 
-static inline unsigned short
+static inline __be16
 qeth_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	struct qeth_card *card;
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index cf3847edc50f..ce8518e658b6 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -33,7 +33,7 @@ extern int		eth_header(struct sk_buff *skb, struct net_device *dev,
 				   unsigned short type, void *daddr,
 				   void *saddr, unsigned len);
 extern int		eth_rebuild_header(struct sk_buff *skb);
-extern unsigned short	eth_type_trans(struct sk_buff *skb, struct net_device *dev);
+extern __be16		eth_type_trans(struct sk_buff *skb, struct net_device *dev);
 extern void		eth_header_cache_update(struct hh_cache *hh, struct net_device *dev,
 						unsigned char * haddr);
 extern int		eth_header_cache(struct neighbour *neigh,
diff --git a/include/linux/fddidevice.h b/include/linux/fddidevice.h
index 002f6367697d..e61e42dfd317 100644
--- a/include/linux/fddidevice.h
+++ b/include/linux/fddidevice.h
@@ -25,7 +25,7 @@
 #include <linux/if_fddi.h>
 
 #ifdef __KERNEL__
-extern unsigned short	fddi_type_trans(struct sk_buff *skb,
+extern __be16	fddi_type_trans(struct sk_buff *skb,
 				struct net_device *dev);
 extern struct net_device *alloc_fddidev(int sizeof_priv);
 #endif
diff --git a/include/linux/hdlc.h b/include/linux/hdlc.h
index ed2927ef1ff7..df695e9ae327 100644
--- a/include/linux/hdlc.h
+++ b/include/linux/hdlc.h
@@ -242,8 +242,8 @@ static __inline__ struct net_device_stats *hdlc_stats(struct net_device *dev)
 }
 
 
-static __inline__ unsigned short hdlc_type_trans(struct sk_buff *skb,
-						 struct net_device *dev)
+static __inline__ __be16 hdlc_type_trans(struct sk_buff *skb,
+					 struct net_device *dev)
 {
 	hdlc_device *hdlc = dev_to_hdlc(dev);
 
diff --git a/include/linux/wanrouter.h b/include/linux/wanrouter.h
index 3e89f0f15f49..1b6b76a4eb54 100644
--- a/include/linux/wanrouter.h
+++ b/include/linux/wanrouter.h
@@ -516,8 +516,7 @@ struct wan_device {
 /* Public functions available for device drivers */
 extern int register_wan_device(struct wan_device *wandev);
 extern int unregister_wan_device(char *name);
-unsigned short wanrouter_type_trans(struct sk_buff *skb,
-				    struct net_device *dev);
+__be16 wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev);
 int wanrouter_encapsulate(struct sk_buff *skb, struct net_device *dev,
 			  unsigned short type);
 
diff --git a/include/net/x25device.h b/include/net/x25device.h
index cf36a20ea3c5..d45ae883bd1d 100644
--- a/include/net/x25device.h
+++ b/include/net/x25device.h
@@ -5,8 +5,7 @@
 #include <linux/if_packet.h>
 #include <linux/skbuff.h>
 
-static inline unsigned short x25_type_trans(struct sk_buff *skb,
-					    struct net_device *dev)
+static inline __be16 x25_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	skb->mac.raw = skb->data;
 	skb->input_dev = skb->dev = dev;
diff --git a/net/802/fddi.c b/net/802/fddi.c
index ebcf4830d6f1..5ce24c4bb840 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -122,10 +122,10 @@ static int fddi_rebuild_header(struct sk_buff	*skb)
  * the proper pointer to the start of packet data (skb->data).
  */
  
-unsigned short fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
+__be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	struct fddihdr *fddi = (struct fddihdr *)skb->data;
-	unsigned short type;
+	__be16 type;
 	
 	/*
 	 * Set mac.raw field to point to FC byte, set data field to point
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index e6954cf1459d..289956c4dd3e 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -289,8 +289,7 @@ xmit will add the additional header part in that case */
  * This is similar to eth_type_trans, which cannot be used because of
  * our dev->hard_header_len
  */
-static inline unsigned short br_type_trans(struct sk_buff *skb,
-					       struct net_device *dev)
+static inline __be16 br_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ethhdr *eth;
 	unsigned char *rawp;
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index ab60ea63688e..f6dbfb99b14d 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -155,7 +155,7 @@ int eth_rebuild_header(struct sk_buff *skb)
  *	This is normal practice and works for any 'now in use' protocol.
  */
  
-unsigned short eth_type_trans(struct sk_buff *skb, struct net_device *dev)
+__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ethhdr *eth;
 	unsigned char *rawp;
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index d6844ac226f5..13b650ad22e2 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -358,10 +358,10 @@ int wanrouter_encapsulate(struct sk_buff *skb, struct net_device *dev,
  */
 
 
-unsigned short wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
+__be16 wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
 {
 	int cnt = skb->data[0] ? 0 : 1;	/* there may be a pad present */
-	unsigned short ethertype;
+	__be16 ethertype;
 
 	switch (skb->data[cnt]) {
 	case NLPID_IP:		/* IP datagramm */
@@ -379,7 +379,7 @@ unsigned short wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev)
 				skb->data[cnt+3], dev->name);
 			return 0;
 		}
-		ethertype = *((unsigned short*)&skb->data[cnt+4]);
+		ethertype = *((__be16*)&skb->data[cnt+4]);
 		cnt += 6;
 		break;
 
-- 
cgit v1.2.3-59-g8ed1b


From d53d9f16ea95a91ad4aa114809dcde486ca4000d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 12 Jul 2005 13:58:07 -0700
Subject: [PATCH] name_to_dev_t warning fix

kernel/power/disk.c needs a declaration of name_to_dev_t() in scope.  mount.h
seems like an appropriate choice.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mount.h | 2 ++
 init/do_mounts.c      | 1 +
 init/do_mounts.h      | 1 -
 kernel/power/disk.c   | 2 ++
 kernel/power/swsusp.c | 3 +--
 5 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mount.h b/include/linux/mount.h
index 74b4727a4e30..f8f39937e301 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -12,6 +12,7 @@
 #define _LINUX_MOUNT_H
 #ifdef __KERNEL__
 
+#include <linux/types.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <asm/atomic.h>
@@ -76,6 +77,7 @@ extern int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
 extern void mark_mounts_for_expiry(struct list_head *mounts);
 
 extern spinlock_t vfsmount_lock;
+extern dev_t name_to_dev_t(char *name);
 
 #endif
 #endif /* _LINUX_MOUNT_H */
diff --git a/init/do_mounts.c b/init/do_mounts.c
index b7570c074d0f..1b02be734ccc 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -7,6 +7,7 @@
 #include <linux/root_dev.h>
 #include <linux/security.h>
 #include <linux/delay.h>
+#include <linux/mount.h>
 
 #include <linux/nfs_fs.h>
 #include <linux/nfs_fs_sb.h>
diff --git a/init/do_mounts.h b/init/do_mounts.h
index de92bee4f35e..e0a7ac9649e1 100644
--- a/init/do_mounts.h
+++ b/init/do_mounts.h
@@ -9,7 +9,6 @@
 #include <linux/major.h>
 #include <linux/root_dev.h>
 
-dev_t name_to_dev_t(char *name);
 void  change_floppy(char *fmt, ...);
 void  mount_block_root(char *name, int flags);
 void  mount_root(void);
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index c51a4d96d4eb..3ec789c6b537 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -16,6 +16,8 @@
 #include <linux/device.h>
 #include <linux/delay.h>
 #include <linux/fs.h>
+#include <linux/mount.h>
+
 #include "power.h"
 
 
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 7d7801cd01f0..f2bc71b9fe8b 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -63,6 +63,7 @@
 #include <linux/console.h>
 #include <linux/highmem.h>
 #include <linux/bio.h>
+#include <linux/mount.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -1260,8 +1261,6 @@ static int data_read(struct pbe *pblist)
 	return error;
 }
 
-extern dev_t name_to_dev_t(const char *line);
-
 /**
  *	read_pagedir - Read page backup list pages from swap
  */
-- 
cgit v1.2.3-59-g8ed1b


From 08c6a96fd77836856c090ebb39beadc81cb8484d Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 12 Jul 2005 13:58:28 -0700
Subject: [PATCH] ext3: fix options parsing

Fix a problem with ext3 mount option parsing.  When a remount of a filesystem
fails, the old options are now restored.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/super.c         | 70 ++++++++++++++++++++++++++++++++++++++++++-------
 include/linux/ext3_fs.h | 14 ++++++++++
 2 files changed, 74 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index a6d1779d7de4..3c3c6e399fb3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -890,7 +890,10 @@ clear_qf_name:
 					"quota turned on.\n");
 				return 0;
 			}
-			kfree(sbi->s_qf_names[qtype]);
+			/*
+			 * The space will be released later when all options
+			 * are confirmed to be correct
+			 */
 			sbi->s_qf_names[qtype] = NULL;
 			break;
 		case Opt_jqfmt_vfsold:
@@ -939,7 +942,7 @@ clear_qf_name:
 		case Opt_ignore:
 			break;
 		case Opt_resize:
-			if (!n_blocks_count) {
+			if (!is_remount) {
 				printk("EXT3-fs: resize option only available "
 					"for remount\n");
 				return 0;
@@ -2109,14 +2112,33 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 {
 	struct ext3_super_block * es;
 	struct ext3_sb_info *sbi = EXT3_SB(sb);
-	unsigned long tmp;
 	unsigned long n_blocks_count = 0;
+	unsigned long old_sb_flags;
+	struct ext3_mount_options old_opts;
+	int err;
+#ifdef CONFIG_QUOTA
+	int i;
+#endif
+
+	/* Store the original options */
+	old_sb_flags = sb->s_flags;
+	old_opts.s_mount_opt = sbi->s_mount_opt;
+	old_opts.s_resuid = sbi->s_resuid;
+	old_opts.s_resgid = sbi->s_resgid;
+	old_opts.s_commit_interval = sbi->s_commit_interval;
+#ifdef CONFIG_QUOTA
+	old_opts.s_jquota_fmt = sbi->s_jquota_fmt;
+	for (i = 0; i < MAXQUOTAS; i++)
+		old_opts.s_qf_names[i] = sbi->s_qf_names[i];
+#endif
 
 	/*
 	 * Allow the "check" option to be passed as a remount option.
 	 */
-	if (!parse_options(data, sb, &tmp, &n_blocks_count, 1))
-		return -EINVAL;
+	if (!parse_options(data, sb, NULL, &n_blocks_count, 1)) {
+		err = -EINVAL;
+		goto restore_opts;
+	}
 
 	if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
 		ext3_abort(sb, __FUNCTION__, "Abort forced by user");
@@ -2130,8 +2152,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 
 	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
 		n_blocks_count > le32_to_cpu(es->s_blocks_count)) {
-		if (sbi->s_mount_opt & EXT3_MOUNT_ABORT)
-			return -EROFS;
+		if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) {
+			err = -EROFS;
+			goto restore_opts;
+		}
 
 		if (*flags & MS_RDONLY) {
 			/*
@@ -2158,7 +2182,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 				       "remount RDWR because of unsupported "
 				       "optional features (%x).\n",
 				       sb->s_id, le32_to_cpu(ret));
-				return -EROFS;
+				err = -EROFS;
+				goto restore_opts;
 			}
 			/*
 			 * Mounting a RDONLY partition read-write, so reread
@@ -2168,13 +2193,38 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 			 */
 			ext3_clear_journal_err(sb, es);
 			sbi->s_mount_state = le16_to_cpu(es->s_state);
-			if ((ret = ext3_group_extend(sb, es, n_blocks_count)))
-				return ret;
+			if ((ret = ext3_group_extend(sb, es, n_blocks_count))) {
+				err = ret;
+				goto restore_opts;
+			}
 			if (!ext3_setup_super (sb, es, 0))
 				sb->s_flags &= ~MS_RDONLY;
 		}
 	}
+#ifdef CONFIG_QUOTA
+	/* Release old quota file names */
+	for (i = 0; i < MAXQUOTAS; i++)
+		if (old_opts.s_qf_names[i] &&
+		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
+			kfree(old_opts.s_qf_names[i]);
+#endif
 	return 0;
+restore_opts:
+	sb->s_flags = old_sb_flags;
+	sbi->s_mount_opt = old_opts.s_mount_opt;
+	sbi->s_resuid = old_opts.s_resuid;
+	sbi->s_resgid = old_opts.s_resgid;
+	sbi->s_commit_interval = old_opts.s_commit_interval;
+#ifdef CONFIG_QUOTA
+	sbi->s_jquota_fmt = old_opts.s_jquota_fmt;
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if (sbi->s_qf_names[i] &&
+		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
+			kfree(sbi->s_qf_names[i]);
+		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
+	}
+#endif
+	return err;
 }
 
 static int ext3_statfs (struct super_block * sb, struct kstatfs * buf)
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 4b6e1ab216a5..c16662836c58 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -238,6 +238,20 @@ struct ext3_new_group_data {
 #define EXT3_IOC_GETRSVSZ		_IOR('f', 5, long)
 #define EXT3_IOC_SETRSVSZ		_IOW('f', 6, long)
 
+/*
+ *  Mount options
+ */
+struct ext3_mount_options {
+	unsigned long s_mount_opt;
+	uid_t s_resuid;
+	gid_t s_resgid;
+	unsigned long s_commit_interval;
+#ifdef CONFIG_QUOTA
+	int s_jquota_fmt;
+	char *s_qf_names[MAXQUOTAS];
+#endif
+};
+
 /*
  * Structure of an inode on the disk
  */
-- 
cgit v1.2.3-59-g8ed1b


From 542d1c88bd7f73e2e59d41b12e4a9041deea89e4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 12 Jul 2005 13:58:31 -0700
Subject: [PATCH] tlb.h warning fix

free_pages_and_swap_cache() and free_page_and_swap_cache() use release_pages()
and page_cache_release() respectively, so make sure that we have the
declarations in scope.

Cc: Olaf Hering <olh@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index c75954f2d868..239f520cc49e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -7,6 +7,8 @@
 #include <linux/mmzone.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/pagemap.h>
+
 #include <asm/atomic.h>
 #include <asm/page.h>
 
-- 
cgit v1.2.3-59-g8ed1b


From 67bc4eb0b1140a4bf364f2dcca152be659ed9057 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Tue, 12 Jul 2005 13:58:36 -0700
Subject: [PATCH] hardirq uses preempt

hardirq.h uses preempt_count() from preempt.h

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hardirq.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 8336dba18971..5912874ca83c 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -2,6 +2,7 @@
 #define LINUX_HARDIRQ_H
 
 #include <linux/config.h>
+#include <linux/preempt.h>
 #include <linux/smp_lock.h>
 #include <asm/hardirq.h>
 #include <asm/system.h>
-- 
cgit v1.2.3-59-g8ed1b


From bd4c625c061c2a38568d0add3478f59172455159 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Tue, 12 Jul 2005 20:21:28 -0700
Subject: reiserfs: run scripts/Lindent on reiserfs code

This was a pure indentation change, using:

	scripts/Lindent fs/reiserfs/*.c include/linux/reiserfs_*.h

to make reiserfs match the regular Linux indentation style.  As Jeff
Mahoney <jeffm@suse.com> writes:

 The ReiserFS code is a mix of a number of different coding styles, sometimes
 different even from line-to-line. Since the code has been relatively stable
 for quite some time and there are few outstanding patches to be applied, it
 is time to reformat the code to conform to the Linux style standard outlined
 in Documentation/CodingStyle.

 This patch contains the result of running scripts/Lindent against
 fs/reiserfs/*.c and include/linux/reiserfs_*.h. There are places where the
 code can be made to look better, but I'd rather keep those patches separate
 so that there isn't a subtle by-hand accident in the middle of a huge
 patch. To be clear: This patch is reformatting *only*.

 A number of patches may follow that continue to make the code more consistent
 with the Linux coding style.

 Hans wasn't particularly enthusiastic about these patches, but said he
 wouldn't really oppose them either.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/reiserfs/bitmap.c           | 1842 ++++++-----
 fs/reiserfs/dir.c              |  488 +--
 fs/reiserfs/do_balan.c         | 3236 +++++++++++--------
 fs/reiserfs/file.c             | 2564 ++++++++-------
 fs/reiserfs/fix_node.c         | 4051 +++++++++++------------
 fs/reiserfs/hashes.c           |  193 +-
 fs/reiserfs/ibalance.c         | 1844 +++++------
 fs/reiserfs/inode.c            | 4915 ++++++++++++++--------------
 fs/reiserfs/ioctl.c            |  197 +-
 fs/reiserfs/item_ops.c         |  977 +++---
 fs/reiserfs/journal.c          | 6891 +++++++++++++++++++++-------------------
 fs/reiserfs/lbalance.c         | 2218 ++++++-------
 fs/reiserfs/namei.c            | 2574 +++++++--------
 fs/reiserfs/objectid.c         |  303 +-
 fs/reiserfs/prints.c           | 1003 +++---
 fs/reiserfs/procfs.c           |  695 ++--
 fs/reiserfs/resize.c           |  207 +-
 fs/reiserfs/stree.c            | 3369 ++++++++++----------
 fs/reiserfs/super.c            | 3623 +++++++++++----------
 fs/reiserfs/tail_conversion.c  |  463 +--
 fs/reiserfs/xattr.c            | 2173 ++++++-------
 fs/reiserfs/xattr_acl.c        |  641 ++--
 fs/reiserfs/xattr_security.c   |   54 +-
 fs/reiserfs/xattr_trusted.c    |   70 +-
 fs/reiserfs/xattr_user.c       |   89 +-
 include/linux/reiserfs_acl.h   |   52 +-
 include/linux/reiserfs_fs.h    | 1595 +++++-----
 include/linux/reiserfs_fs_i.h  |   59 +-
 include/linux/reiserfs_fs_sb.h |  616 ++--
 include/linux/reiserfs_xattr.h |  126 +-
 30 files changed, 24447 insertions(+), 22681 deletions(-)

(limited to 'include/linux')

diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index 49c479c9454a..909f71e9a30f 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -46,1125 +46,1221 @@
 #define TEST_OPTION(optname, s) \
     test_bit(_ALLOC_ ## optname , &SB_ALLOC_OPTS(s))
 
-static inline void get_bit_address (struct super_block * s,
-				    b_blocknr_t block, int * bmap_nr, int * offset)
+static inline void get_bit_address(struct super_block *s,
+				   b_blocknr_t block, int *bmap_nr, int *offset)
 {
-    /* It is in the bitmap block number equal to the block
-     * number divided by the number of bits in a block. */
-    *bmap_nr = block / (s->s_blocksize << 3);
-    /* Within that bitmap block it is located at bit offset *offset. */
-    *offset = block & ((s->s_blocksize << 3) - 1 );
-    return;
+	/* It is in the bitmap block number equal to the block
+	 * number divided by the number of bits in a block. */
+	*bmap_nr = block / (s->s_blocksize << 3);
+	/* Within that bitmap block it is located at bit offset *offset. */
+	*offset = block & ((s->s_blocksize << 3) - 1);
+	return;
 }
 
 #ifdef CONFIG_REISERFS_CHECK
-int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value)
+int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value)
 {
-    int i, j;
+	int i, j;
 
-    if (block == 0 || block >= SB_BLOCK_COUNT (s)) {
-	reiserfs_warning (s, "vs-4010: is_reusable: block number is out of range %lu (%u)",
-			  block, SB_BLOCK_COUNT (s));
-	return 0;
-    }
-
-    /* it can't be one of the bitmap blocks */
-    for (i = 0; i < SB_BMAP_NR (s); i ++)
-	if (block == SB_AP_BITMAP (s)[i].bh->b_blocknr) {
-	    reiserfs_warning (s, "vs: 4020: is_reusable: "
-			      "bitmap block %lu(%u) can't be freed or reused",
-			      block, SB_BMAP_NR (s));
-	    return 0;
+	if (block == 0 || block >= SB_BLOCK_COUNT(s)) {
+		reiserfs_warning(s,
+				 "vs-4010: is_reusable: block number is out of range %lu (%u)",
+				 block, SB_BLOCK_COUNT(s));
+		return 0;
 	}
-  
-    get_bit_address (s, block, &i, &j);
 
-    if (i >= SB_BMAP_NR (s)) {
-	reiserfs_warning (s, "vs-4030: is_reusable: there is no so many bitmap blocks: "
-			  "block=%lu, bitmap_nr=%d", block, i);
-	return 0;
-    }
+	/* it can't be one of the bitmap blocks */
+	for (i = 0; i < SB_BMAP_NR(s); i++)
+		if (block == SB_AP_BITMAP(s)[i].bh->b_blocknr) {
+			reiserfs_warning(s, "vs: 4020: is_reusable: "
+					 "bitmap block %lu(%u) can't be freed or reused",
+					 block, SB_BMAP_NR(s));
+			return 0;
+		}
 
-    if ((bit_value == 0 && 
-         reiserfs_test_le_bit(j, SB_AP_BITMAP(s)[i].bh->b_data)) ||
-	(bit_value == 1 && 
-	 reiserfs_test_le_bit(j, SB_AP_BITMAP (s)[i].bh->b_data) == 0)) {
-	reiserfs_warning (s, "vs-4040: is_reusable: corresponding bit of block %lu does not "
-			  "match required value (i==%d, j==%d) test_bit==%d",
-		block, i, j, reiserfs_test_le_bit (j, SB_AP_BITMAP (s)[i].bh->b_data));
+	get_bit_address(s, block, &i, &j);
 
-	return 0;
-    }
+	if (i >= SB_BMAP_NR(s)) {
+		reiserfs_warning(s,
+				 "vs-4030: is_reusable: there is no so many bitmap blocks: "
+				 "block=%lu, bitmap_nr=%d", block, i);
+		return 0;
+	}
 
-    if (bit_value == 0 && block == SB_ROOT_BLOCK (s)) {
-	reiserfs_warning (s, "vs-4050: is_reusable: this is root block (%u), "
-			  "it must be busy", SB_ROOT_BLOCK (s));
-	return 0;
-    }
+	if ((bit_value == 0 &&
+	     reiserfs_test_le_bit(j, SB_AP_BITMAP(s)[i].bh->b_data)) ||
+	    (bit_value == 1 &&
+	     reiserfs_test_le_bit(j, SB_AP_BITMAP(s)[i].bh->b_data) == 0)) {
+		reiserfs_warning(s,
+				 "vs-4040: is_reusable: corresponding bit of block %lu does not "
+				 "match required value (i==%d, j==%d) test_bit==%d",
+				 block, i, j, reiserfs_test_le_bit(j,
+								   SB_AP_BITMAP
+								   (s)[i].bh->
+								   b_data));
+
+		return 0;
+	}
 
-    return 1;
+	if (bit_value == 0 && block == SB_ROOT_BLOCK(s)) {
+		reiserfs_warning(s,
+				 "vs-4050: is_reusable: this is root block (%u), "
+				 "it must be busy", SB_ROOT_BLOCK(s));
+		return 0;
+	}
+
+	return 1;
 }
-#endif /* CONFIG_REISERFS_CHECK */
+#endif				/* CONFIG_REISERFS_CHECK */
 
 /* searches in journal structures for a given block number (bmap, off). If block
    is found in reiserfs journal it suggests next free block candidate to test. */
-static inline  int is_block_in_journal (struct super_block * s, int bmap, int
-off, int *next)
+static inline int is_block_in_journal(struct super_block *s, int bmap, int
+				      off, int *next)
 {
-    b_blocknr_t tmp;
-
-    if (reiserfs_in_journal (s, bmap, off, 1, &tmp)) {
-	if (tmp) {              /* hint supplied */
-	    *next = tmp;
-	    PROC_INFO_INC( s, scan_bitmap.in_journal_hint );
-	} else {
-	    (*next) = off + 1;          /* inc offset to avoid looping. */
-	    PROC_INFO_INC( s, scan_bitmap.in_journal_nohint );
+	b_blocknr_t tmp;
+
+	if (reiserfs_in_journal(s, bmap, off, 1, &tmp)) {
+		if (tmp) {	/* hint supplied */
+			*next = tmp;
+			PROC_INFO_INC(s, scan_bitmap.in_journal_hint);
+		} else {
+			(*next) = off + 1;	/* inc offset to avoid looping. */
+			PROC_INFO_INC(s, scan_bitmap.in_journal_nohint);
+		}
+		PROC_INFO_INC(s, scan_bitmap.retry);
+		return 1;
 	}
-	PROC_INFO_INC( s, scan_bitmap.retry );
-	return 1;
-    }
-    return 0;
+	return 0;
 }
 
 /* it searches for a window of zero bits with given minimum and maximum lengths in one bitmap
  * block; */
-static int scan_bitmap_block (struct reiserfs_transaction_handle *th,
-			      int bmap_n, int *beg, int boundary, int min, int max, int unfm)
+static int scan_bitmap_block(struct reiserfs_transaction_handle *th,
+			     int bmap_n, int *beg, int boundary, int min,
+			     int max, int unfm)
 {
-    struct super_block *s = th->t_super;
-    struct reiserfs_bitmap_info *bi=&SB_AP_BITMAP(s)[bmap_n];
-    int end, next;
-    int org = *beg;
+	struct super_block *s = th->t_super;
+	struct reiserfs_bitmap_info *bi = &SB_AP_BITMAP(s)[bmap_n];
+	int end, next;
+	int org = *beg;
 
-    BUG_ON (!th->t_trans_id);
+	BUG_ON(!th->t_trans_id);
 
-    RFALSE(bmap_n >= SB_BMAP_NR (s), "Bitmap %d is out of range (0..%d)",bmap_n, SB_BMAP_NR (s) - 1);
-    PROC_INFO_INC( s, scan_bitmap.bmap );
+	RFALSE(bmap_n >= SB_BMAP_NR(s), "Bitmap %d is out of range (0..%d)",
+	       bmap_n, SB_BMAP_NR(s) - 1);
+	PROC_INFO_INC(s, scan_bitmap.bmap);
 /* this is unclear and lacks comments, explain how journal bitmaps
    work here for the reader.  Convey a sense of the design here. What
    is a window? */
 /* - I mean `a window of zero bits' as in description of this function - Zam. */
-  
-    if ( !bi ) {
-	reiserfs_warning (s, "NULL bitmap info pointer for bitmap %d", bmap_n);
-	return 0;
-    }
-    if (buffer_locked (bi->bh)) {
-       PROC_INFO_INC( s, scan_bitmap.wait );
-       __wait_on_buffer (bi->bh);
-    }
-
-    while (1) {
-	cont:
-	if (bi->free_count < min)
-		return 0; // No free blocks in this bitmap
-
-	/* search for a first zero bit -- beggining of a window */
-	*beg = reiserfs_find_next_zero_le_bit
-	        ((unsigned long*)(bi->bh->b_data), boundary, *beg);
-  
-	if (*beg + min > boundary) { /* search for a zero bit fails or the rest of bitmap block
-				      * cannot contain a zero window of minimum size */
-	    return 0;
-	}
 
-	if (unfm && is_block_in_journal(s,bmap_n, *beg, beg))
-	    continue;
-	/* first zero bit found; we check next bits */
-	for (end = *beg + 1;; end ++) {
-	    if (end >= *beg + max || end >= boundary || reiserfs_test_le_bit (end, bi->bh->b_data)) {
-		next = end;
-		break;
-	    }
-	    /* finding the other end of zero bit window requires looking into journal structures (in
-	     * case of searching for free blocks for unformatted nodes) */
-	    if (unfm && is_block_in_journal(s, bmap_n, end, &next))
-		break;
+	if (!bi) {
+		reiserfs_warning(s, "NULL bitmap info pointer for bitmap %d",
+				 bmap_n);
+		return 0;
+	}
+	if (buffer_locked(bi->bh)) {
+		PROC_INFO_INC(s, scan_bitmap.wait);
+		__wait_on_buffer(bi->bh);
 	}
 
-	/* now (*beg) points to beginning of zero bits window,
-	 * (end) points to one bit after the window end */
-	if (end - *beg >= min) { /* it seems we have found window of proper size */
-	    int i;
-	    reiserfs_prepare_for_journal (s, bi->bh, 1);
-	    /* try to set all blocks used checking are they still free */
-	    for (i = *beg; i < end; i++) {
-		/* It seems that we should not check in journal again. */
-		if (reiserfs_test_and_set_le_bit (i, bi->bh->b_data)) {
-		    /* bit was set by another process
-		     * while we slept in prepare_for_journal() */
-		    PROC_INFO_INC( s, scan_bitmap.stolen );
-		    if (i >= *beg + min)	{ /* we can continue with smaller set of allocated blocks,
-					   * if length of this set is more or equal to `min' */
-			end = i;
-			break;
-		    }
-		    /* otherwise we clear all bit were set ... */
-		    while (--i >= *beg)
-			reiserfs_test_and_clear_le_bit (i, bi->bh->b_data);
-		    reiserfs_restore_prepared_buffer (s, bi->bh);
-		    *beg = org;
-		    /* ... and search again in current block from beginning */
-		    goto cont;	
+	while (1) {
+	      cont:
+		if (bi->free_count < min)
+			return 0;	// No free blocks in this bitmap
+
+		/* search for a first zero bit -- beggining of a window */
+		*beg = reiserfs_find_next_zero_le_bit
+		    ((unsigned long *)(bi->bh->b_data), boundary, *beg);
+
+		if (*beg + min > boundary) {	/* search for a zero bit fails or the rest of bitmap block
+						 * cannot contain a zero window of minimum size */
+			return 0;
 		}
-	    }
-	    bi->free_count -= (end - *beg);
-	    journal_mark_dirty (th, s, bi->bh);
 
-	    /* free block count calculation */
-	    reiserfs_prepare_for_journal (s, SB_BUFFER_WITH_SB(s), 1);
-	    PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg));
-	    journal_mark_dirty (th, s, SB_BUFFER_WITH_SB(s));
+		if (unfm && is_block_in_journal(s, bmap_n, *beg, beg))
+			continue;
+		/* first zero bit found; we check next bits */
+		for (end = *beg + 1;; end++) {
+			if (end >= *beg + max || end >= boundary
+			    || reiserfs_test_le_bit(end, bi->bh->b_data)) {
+				next = end;
+				break;
+			}
+			/* finding the other end of zero bit window requires looking into journal structures (in
+			 * case of searching for free blocks for unformatted nodes) */
+			if (unfm && is_block_in_journal(s, bmap_n, end, &next))
+				break;
+		}
 
-	    return end - (*beg);
-	} else {
-	    *beg = next;
+		/* now (*beg) points to beginning of zero bits window,
+		 * (end) points to one bit after the window end */
+		if (end - *beg >= min) {	/* it seems we have found window of proper size */
+			int i;
+			reiserfs_prepare_for_journal(s, bi->bh, 1);
+			/* try to set all blocks used checking are they still free */
+			for (i = *beg; i < end; i++) {
+				/* It seems that we should not check in journal again. */
+				if (reiserfs_test_and_set_le_bit
+				    (i, bi->bh->b_data)) {
+					/* bit was set by another process
+					 * while we slept in prepare_for_journal() */
+					PROC_INFO_INC(s, scan_bitmap.stolen);
+					if (i >= *beg + min) {	/* we can continue with smaller set of allocated blocks,
+								 * if length of this set is more or equal to `min' */
+						end = i;
+						break;
+					}
+					/* otherwise we clear all bit were set ... */
+					while (--i >= *beg)
+						reiserfs_test_and_clear_le_bit
+						    (i, bi->bh->b_data);
+					reiserfs_restore_prepared_buffer(s,
+									 bi->
+									 bh);
+					*beg = org;
+					/* ... and search again in current block from beginning */
+					goto cont;
+				}
+			}
+			bi->free_count -= (end - *beg);
+			journal_mark_dirty(th, s, bi->bh);
+
+			/* free block count calculation */
+			reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
+						     1);
+			PUT_SB_FREE_BLOCKS(s, SB_FREE_BLOCKS(s) - (end - *beg));
+			journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s));
+
+			return end - (*beg);
+		} else {
+			*beg = next;
+		}
 	}
-    }
 }
 
-static int bmap_hash_id(struct super_block *s, u32 id) {
-    char * hash_in = NULL;
-    unsigned long hash;
-    unsigned bm;
-
-    if (id <= 2) {
-	bm = 1;
-    } else {
-        hash_in = (char *)(&id);
-        hash = keyed_hash(hash_in, 4);
-	bm = hash % SB_BMAP_NR(s);
-	if (!bm)
-	    bm = 1;
-    }
-    /* this can only be true when SB_BMAP_NR = 1 */
-    if (bm >= SB_BMAP_NR(s))
-    	bm = 0;
-    return bm;
+static int bmap_hash_id(struct super_block *s, u32 id)
+{
+	char *hash_in = NULL;
+	unsigned long hash;
+	unsigned bm;
+
+	if (id <= 2) {
+		bm = 1;
+	} else {
+		hash_in = (char *)(&id);
+		hash = keyed_hash(hash_in, 4);
+		bm = hash % SB_BMAP_NR(s);
+		if (!bm)
+			bm = 1;
+	}
+	/* this can only be true when SB_BMAP_NR = 1 */
+	if (bm >= SB_BMAP_NR(s))
+		bm = 0;
+	return bm;
 }
 
 /*
  * hashes the id and then returns > 0 if the block group for the
  * corresponding hash is full
  */
-static inline int block_group_used(struct super_block *s, u32 id) {
-    int bm;
-    bm = bmap_hash_id(s, id);
-    if (SB_AP_BITMAP(s)[bm].free_count > ((s->s_blocksize << 3) * 60 / 100) ) {
-        return 0;
-    }
-    return 1;
+static inline int block_group_used(struct super_block *s, u32 id)
+{
+	int bm;
+	bm = bmap_hash_id(s, id);
+	if (SB_AP_BITMAP(s)[bm].free_count > ((s->s_blocksize << 3) * 60 / 100)) {
+		return 0;
+	}
+	return 1;
 }
 
 /*
  * the packing is returned in disk byte order
  */
-__le32 reiserfs_choose_packing(struct inode *dir)
+__le32 reiserfs_choose_packing(struct inode * dir)
 {
-    __le32 packing;
-    if (TEST_OPTION(packing_groups, dir->i_sb)) {
-	u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id);
-	/*
-	 * some versions of reiserfsck expect packing locality 1 to be
-	 * special
-	 */
-	if (parent_dir == 1 || block_group_used(dir->i_sb,parent_dir))
-            packing = INODE_PKEY(dir)->k_objectid;
-        else
-            packing = INODE_PKEY(dir)->k_dir_id;
-    } else
-        packing = INODE_PKEY(dir)->k_objectid;
-    return packing;
+	__le32 packing;
+	if (TEST_OPTION(packing_groups, dir->i_sb)) {
+		u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id);
+		/*
+		 * some versions of reiserfsck expect packing locality 1 to be
+		 * special
+		 */
+		if (parent_dir == 1 || block_group_used(dir->i_sb, parent_dir))
+			packing = INODE_PKEY(dir)->k_objectid;
+		else
+			packing = INODE_PKEY(dir)->k_dir_id;
+	} else
+		packing = INODE_PKEY(dir)->k_objectid;
+	return packing;
 }
-  
+
 /* Tries to find contiguous zero bit window (given size) in given region of
  * bitmap and place new blocks there. Returns number of allocated blocks. */
-static int scan_bitmap (struct reiserfs_transaction_handle *th,
-			b_blocknr_t *start, b_blocknr_t finish,
-			int min, int max, int unfm, unsigned long file_block)
+static int scan_bitmap(struct reiserfs_transaction_handle *th,
+		       b_blocknr_t * start, b_blocknr_t finish,
+		       int min, int max, int unfm, unsigned long file_block)
 {
-    int nr_allocated=0;
-    struct super_block * s = th->t_super;
-    /* find every bm and bmap and bmap_nr in this file, and change them all to bitmap_blocknr
-     * - Hans, it is not a block number - Zam. */
-
-    int bm, off;
-    int end_bm, end_off;
-    int off_max = s->s_blocksize << 3;
-
-    BUG_ON (!th->t_trans_id);
-
-    PROC_INFO_INC( s, scan_bitmap.call ); 
-    if ( SB_FREE_BLOCKS(s) <= 0)
-	return 0; // No point in looking for more free blocks
-
-    get_bit_address (s, *start, &bm, &off);
-    get_bit_address (s, finish, &end_bm, &end_off);
-    if (bm > SB_BMAP_NR(s))
-        return 0;
-    if (end_bm > SB_BMAP_NR(s))
-        end_bm = SB_BMAP_NR(s);
-
-    /* When the bitmap is more than 10% free, anyone can allocate.
-     * When it's less than 10% free, only files that already use the
-     * bitmap are allowed. Once we pass 80% full, this restriction
-     * is lifted.
-     *
-     * We do this so that files that grow later still have space close to
-     * their original allocation. This improves locality, and presumably
-     * performance as a result.
-     *
-     * This is only an allocation policy and does not make up for getting a
-     * bad hint. Decent hinting must be implemented for this to work well.
-     */
-    if ( TEST_OPTION(skip_busy, s) && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s)/20 ) {
-	for (;bm < end_bm; bm++, off = 0) {
-	    if ( ( off && (!unfm || (file_block != 0))) || SB_AP_BITMAP(s)[bm].free_count > (s->s_blocksize << 3) / 10 )
-		nr_allocated = scan_bitmap_block(th, bm, &off, off_max, min, max, unfm);
-	    if (nr_allocated)
-		goto ret;
-        }
-	/* we know from above that start is a reasonable number */
-	get_bit_address (s, *start, &bm, &off);
-    }
-
-    for (;bm < end_bm; bm++, off = 0) {
-	nr_allocated = scan_bitmap_block(th, bm, &off, off_max, min, max, unfm);
-	if (nr_allocated)
-	    goto ret;
-    }
-
-    nr_allocated = scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm);
-  
- ret:
-    *start = bm * off_max + off;
-    return nr_allocated;
+	int nr_allocated = 0;
+	struct super_block *s = th->t_super;
+	/* find every bm and bmap and bmap_nr in this file, and change them all to bitmap_blocknr
+	 * - Hans, it is not a block number - Zam. */
+
+	int bm, off;
+	int end_bm, end_off;
+	int off_max = s->s_blocksize << 3;
+
+	BUG_ON(!th->t_trans_id);
+
+	PROC_INFO_INC(s, scan_bitmap.call);
+	if (SB_FREE_BLOCKS(s) <= 0)
+		return 0;	// No point in looking for more free blocks
+
+	get_bit_address(s, *start, &bm, &off);
+	get_bit_address(s, finish, &end_bm, &end_off);
+	if (bm > SB_BMAP_NR(s))
+		return 0;
+	if (end_bm > SB_BMAP_NR(s))
+		end_bm = SB_BMAP_NR(s);
+
+	/* When the bitmap is more than 10% free, anyone can allocate.
+	 * When it's less than 10% free, only files that already use the
+	 * bitmap are allowed. Once 5% or fewer of the filesystem's blocks
+	 * are free, this restriction is lifted.
+	 *
+	 * We do this so that files that grow later still have space close to
+	 * their original allocation. This improves locality, and presumably
+	 * performance as a result.
+	 *
+	 * This is only an allocation policy and does not make up for getting a
+	 * bad hint. Decent hinting must be implemented for this to work well.
+	 */
+	if (TEST_OPTION(skip_busy, s)
+	    && SB_FREE_BLOCKS(s) > SB_BLOCK_COUNT(s) / 20) {
+		for (; bm < end_bm; bm++, off = 0) {
+			if ((off && (!unfm || (file_block != 0)))
+			    || SB_AP_BITMAP(s)[bm].free_count >
+			    (s->s_blocksize << 3) / 10)
+				nr_allocated =
+				    scan_bitmap_block(th, bm, &off, off_max,
+						      min, max, unfm);
+			if (nr_allocated)
+				goto ret;
+		}
+		/* we know from above that start is a reasonable number */
+		get_bit_address(s, *start, &bm, &off);
+	}
+
+	for (; bm < end_bm; bm++, off = 0) {
+		nr_allocated =
+		    scan_bitmap_block(th, bm, &off, off_max, min, max, unfm);
+		if (nr_allocated)
+			goto ret;
+	}
+
+	nr_allocated =
+	    scan_bitmap_block(th, bm, &off, end_off + 1, min, max, unfm);
+
+      ret:
+	*start = bm * off_max + off;
+	return nr_allocated;
 
 }
 
-static void _reiserfs_free_block (struct reiserfs_transaction_handle *th,
-				  struct inode *inode, b_blocknr_t block,
-				  int for_unformatted)
+static void _reiserfs_free_block(struct reiserfs_transaction_handle *th,
+				 struct inode *inode, b_blocknr_t block,
+				 int for_unformatted)
 {
-    struct super_block * s = th->t_super;
-    struct reiserfs_super_block * rs;
-    struct buffer_head * sbh;
-    struct reiserfs_bitmap_info *apbi;
-    int nr, offset;
+	struct super_block *s = th->t_super;
+	struct reiserfs_super_block *rs;
+	struct buffer_head *sbh;
+	struct reiserfs_bitmap_info *apbi;
+	int nr, offset;
 
-    BUG_ON (!th->t_trans_id);
+	BUG_ON(!th->t_trans_id);
 
-    PROC_INFO_INC( s, free_block );
+	PROC_INFO_INC(s, free_block);
 
-    rs = SB_DISK_SUPER_BLOCK (s);
-    sbh = SB_BUFFER_WITH_SB (s);
-    apbi = SB_AP_BITMAP(s);
+	rs = SB_DISK_SUPER_BLOCK(s);
+	sbh = SB_BUFFER_WITH_SB(s);
+	apbi = SB_AP_BITMAP(s);
 
-    get_bit_address (s, block, &nr, &offset);
+	get_bit_address(s, block, &nr, &offset);
 
-    if (nr >= sb_bmap_nr (rs)) {
-	reiserfs_warning (s, "vs-4075: reiserfs_free_block: "
-			  "block %lu is out of range on %s",
-			  block, reiserfs_bdevname (s));
-	return;
-    }
-
-    reiserfs_prepare_for_journal(s, apbi[nr].bh, 1 ) ;
-
-    /* clear bit for the given block in bit map */
-    if (!reiserfs_test_and_clear_le_bit (offset, apbi[nr].bh->b_data)) {
-	reiserfs_warning (s, "vs-4080: reiserfs_free_block: "
-			  "free_block (%s:%lu)[dev:blocknr]: bit already cleared",
-			  reiserfs_bdevname (s), block);
-    }
-    apbi[nr].free_count ++;
-    journal_mark_dirty (th, s, apbi[nr].bh);
-
-    reiserfs_prepare_for_journal(s, sbh, 1) ;
-    /* update super block */
-    set_sb_free_blocks( rs, sb_free_blocks(rs) + 1 );
-
-    journal_mark_dirty (th, s, sbh);
-    if (for_unformatted)
-        DQUOT_FREE_BLOCK_NODIRTY(inode, 1);
+	if (nr >= sb_bmap_nr(rs)) {
+		reiserfs_warning(s, "vs-4075: reiserfs_free_block: "
+				 "block %lu is out of range on %s",
+				 block, reiserfs_bdevname(s));
+		return;
+	}
+
+	reiserfs_prepare_for_journal(s, apbi[nr].bh, 1);
+
+	/* clear bit for the given block in bit map */
+	if (!reiserfs_test_and_clear_le_bit(offset, apbi[nr].bh->b_data)) {
+		reiserfs_warning(s, "vs-4080: reiserfs_free_block: "
+				 "free_block (%s:%lu)[dev:blocknr]: bit already cleared",
+				 reiserfs_bdevname(s), block);
+	}
+	apbi[nr].free_count++;
+	journal_mark_dirty(th, s, apbi[nr].bh);
+
+	reiserfs_prepare_for_journal(s, sbh, 1);
+	/* update super block */
+	set_sb_free_blocks(rs, sb_free_blocks(rs) + 1);
+
+	journal_mark_dirty(th, s, sbh);
+	if (for_unformatted)
+		DQUOT_FREE_BLOCK_NODIRTY(inode, 1);
 }
 
-void reiserfs_free_block (struct reiserfs_transaction_handle *th, 
-			  struct inode *inode, b_blocknr_t block,
-			  int for_unformatted)
+void reiserfs_free_block(struct reiserfs_transaction_handle *th,
+			 struct inode *inode, b_blocknr_t block,
+			 int for_unformatted)
 {
-    struct super_block * s = th->t_super;
+	struct super_block *s = th->t_super;
 
-    BUG_ON (!th->t_trans_id);
+	BUG_ON(!th->t_trans_id);
 
-    RFALSE(!s, "vs-4061: trying to free block on nonexistent device");
-    RFALSE(is_reusable (s, block, 1) == 0, "vs-4071: can not free such block");
-    /* mark it before we clear it, just in case */
-    journal_mark_freed(th, s, block) ;
-    _reiserfs_free_block(th, inode, block, for_unformatted) ;
+	RFALSE(!s, "vs-4061: trying to free block on nonexistent device");
+	RFALSE(is_reusable(s, block, 1) == 0,
+	       "vs-4071: can not free such block");
+	/* mark it before we clear it, just in case */
+	journal_mark_freed(th, s, block);
+	_reiserfs_free_block(th, inode, block, for_unformatted);
 }
 
 /* preallocated blocks don't need to be run through journal_mark_freed */
-static void reiserfs_free_prealloc_block (struct reiserfs_transaction_handle *th,
-			  struct inode *inode, b_blocknr_t block) {
-    RFALSE(!th->t_super, "vs-4060: trying to free block on nonexistent device");
-    RFALSE(is_reusable (th->t_super, block, 1) == 0, "vs-4070: can not free such block");
-    BUG_ON (!th->t_trans_id);
-    _reiserfs_free_block(th, inode, block, 1) ;
+static void reiserfs_free_prealloc_block(struct reiserfs_transaction_handle *th,
+					 struct inode *inode, b_blocknr_t block)
+{
+	RFALSE(!th->t_super,
+	       "vs-4060: trying to free block on nonexistent device");
+	RFALSE(is_reusable(th->t_super, block, 1) == 0,
+	       "vs-4070: can not free such block");
+	BUG_ON(!th->t_trans_id);
+	_reiserfs_free_block(th, inode, block, 1);
 }
 
-static void __discard_prealloc (struct reiserfs_transaction_handle * th,
-				struct reiserfs_inode_info *ei)
+static void __discard_prealloc(struct reiserfs_transaction_handle *th,
+			       struct reiserfs_inode_info *ei)
 {
-    unsigned long save = ei->i_prealloc_block ;
-    int dirty = 0;
-    struct inode *inode = &ei->vfs_inode;
-    BUG_ON (!th->t_trans_id);
+	unsigned long save = ei->i_prealloc_block;
+	int dirty = 0;
+	struct inode *inode = &ei->vfs_inode;
+	BUG_ON(!th->t_trans_id);
 #ifdef CONFIG_REISERFS_CHECK
-    if (ei->i_prealloc_count < 0)
-	reiserfs_warning (th->t_super, "zam-4001:%s: inode has negative prealloc blocks count.", __FUNCTION__ );
+	if (ei->i_prealloc_count < 0)
+		reiserfs_warning(th->t_super,
+				 "zam-4001:%s: inode has negative prealloc blocks count.",
+				 __FUNCTION__);
 #endif
-    while (ei->i_prealloc_count > 0) {
-	reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block);
-	ei->i_prealloc_block++;
-	ei->i_prealloc_count --;
-	dirty = 1;
-    }
-    if (dirty)
-    	reiserfs_update_sd(th, inode);
-    ei->i_prealloc_block = save;
-    list_del_init(&(ei->i_prealloc_list));
+	while (ei->i_prealloc_count > 0) {
+		reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block);
+		ei->i_prealloc_block++;
+		ei->i_prealloc_count--;
+		dirty = 1;
+	}
+	if (dirty)
+		reiserfs_update_sd(th, inode);
+	ei->i_prealloc_block = save;
+	list_del_init(&(ei->i_prealloc_list));
 }
 
 /* FIXME: It should be inline function */
-void reiserfs_discard_prealloc (struct reiserfs_transaction_handle *th, 
-				struct inode *inode)
+void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th,
+			       struct inode *inode)
 {
-    struct reiserfs_inode_info *ei = REISERFS_I(inode);
-    BUG_ON (!th->t_trans_id);
-    if (ei->i_prealloc_count)
-	__discard_prealloc(th, ei);
+	struct reiserfs_inode_info *ei = REISERFS_I(inode);
+	BUG_ON(!th->t_trans_id);
+	if (ei->i_prealloc_count)
+		__discard_prealloc(th, ei);
 }
 
-void reiserfs_discard_all_prealloc (struct reiserfs_transaction_handle *th)
+void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th)
 {
-    struct list_head * plist = &SB_JOURNAL(th->t_super)->j_prealloc_list;
+	struct list_head *plist = &SB_JOURNAL(th->t_super)->j_prealloc_list;
 
-    BUG_ON (!th->t_trans_id);
+	BUG_ON(!th->t_trans_id);
 
-    while (!list_empty(plist)) {
-	struct reiserfs_inode_info *ei;
-	ei = list_entry(plist->next, struct reiserfs_inode_info, i_prealloc_list);
+	while (!list_empty(plist)) {
+		struct reiserfs_inode_info *ei;
+		ei = list_entry(plist->next, struct reiserfs_inode_info,
+				i_prealloc_list);
 #ifdef CONFIG_REISERFS_CHECK
-	if (!ei->i_prealloc_count) {
-	    reiserfs_warning (th->t_super, "zam-4001:%s: inode is in prealloc list but has no preallocated blocks.", __FUNCTION__);
-	}
+		if (!ei->i_prealloc_count) {
+			reiserfs_warning(th->t_super,
+					 "zam-4001:%s: inode is in prealloc list but has no preallocated blocks.",
+					 __FUNCTION__);
+		}
 #endif
-	__discard_prealloc(th, ei);
-    }
+		__discard_prealloc(th, ei);
+	}
 }
 
-void reiserfs_init_alloc_options (struct super_block *s)
+void reiserfs_init_alloc_options(struct super_block *s)
 {
-    set_bit (_ALLOC_skip_busy, &SB_ALLOC_OPTS(s));
-    set_bit (_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s));
-    set_bit (_ALLOC_packing_groups, &SB_ALLOC_OPTS(s));
+	set_bit(_ALLOC_skip_busy, &SB_ALLOC_OPTS(s));
+	set_bit(_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s));
+	set_bit(_ALLOC_packing_groups, &SB_ALLOC_OPTS(s));
 }
 
 /* block allocator related options are parsed here */
-int reiserfs_parse_alloc_options(struct super_block * s, char * options)
+int reiserfs_parse_alloc_options(struct super_block *s, char *options)
 {
-    char * this_char, * value;
-
-    REISERFS_SB(s)->s_alloc_options.bits = 0; /* clear default settings */
-
-    while ( (this_char = strsep (&options, ":")) != NULL ) {
-	if ((value = strchr (this_char, '=')) != NULL)
-	    *value++ = 0;
-
-	if (!strcmp(this_char, "concentrating_formatted_nodes")) {
-	    int temp;
-	    SET_OPTION(concentrating_formatted_nodes);
-	    temp = (value && *value) ? simple_strtoul (value, &value, 0) : 10;
-	    if (temp <= 0 || temp > 100) {
-		REISERFS_SB(s)->s_alloc_options.border = 10;
-	    } else {
-		REISERFS_SB(s)->s_alloc_options.border = 100 / temp;
-	   }
-	    continue;
-	}
-	if (!strcmp(this_char, "displacing_large_files")) {
-	    SET_OPTION(displacing_large_files);
-	    REISERFS_SB(s)->s_alloc_options.large_file_size =
-		(value && *value) ? simple_strtoul (value, &value, 0) : 16;
-	    continue;
-	}
-	if (!strcmp(this_char, "displacing_new_packing_localities")) {
-	    SET_OPTION(displacing_new_packing_localities);
-	    continue;
-	};
-
-	if (!strcmp(this_char, "old_hashed_relocation")) {
-	    SET_OPTION(old_hashed_relocation);
-	    continue;
-	}
+	char *this_char, *value;
+
+	REISERFS_SB(s)->s_alloc_options.bits = 0;	/* clear default settings */
+
+	while ((this_char = strsep(&options, ":")) != NULL) {
+		if ((value = strchr(this_char, '=')) != NULL)
+			*value++ = 0;
+
+		if (!strcmp(this_char, "concentrating_formatted_nodes")) {
+			int temp;
+			SET_OPTION(concentrating_formatted_nodes);
+			temp = (value
+				&& *value) ? simple_strtoul(value, &value,
+							    0) : 10;
+			if (temp <= 0 || temp > 100) {
+				REISERFS_SB(s)->s_alloc_options.border = 10;
+			} else {
+				REISERFS_SB(s)->s_alloc_options.border =
+				    100 / temp;
+			}
+			continue;
+		}
+		if (!strcmp(this_char, "displacing_large_files")) {
+			SET_OPTION(displacing_large_files);
+			REISERFS_SB(s)->s_alloc_options.large_file_size =
+			    (value
+			     && *value) ? simple_strtoul(value, &value, 0) : 16;
+			continue;
+		}
+		if (!strcmp(this_char, "displacing_new_packing_localities")) {
+			SET_OPTION(displacing_new_packing_localities);
+			continue;
+		};
+
+		if (!strcmp(this_char, "old_hashed_relocation")) {
+			SET_OPTION(old_hashed_relocation);
+			continue;
+		}
 
-	if (!strcmp(this_char, "new_hashed_relocation")) {
-	    SET_OPTION(new_hashed_relocation);
-	    continue;
-	}
+		if (!strcmp(this_char, "new_hashed_relocation")) {
+			SET_OPTION(new_hashed_relocation);
+			continue;
+		}
 
-        if (!strcmp(this_char, "dirid_groups")) {
-	    SET_OPTION(dirid_groups);
-	    continue;
-        }
-        if (!strcmp(this_char, "oid_groups")) {
-	    SET_OPTION(oid_groups);
-	    continue;
-        }
-        if (!strcmp(this_char, "packing_groups")) {
-	    SET_OPTION(packing_groups);
-	    continue;
-        }
-	if (!strcmp(this_char, "hashed_formatted_nodes")) {
-	    SET_OPTION(hashed_formatted_nodes);
-	    continue;
-	}
+		if (!strcmp(this_char, "dirid_groups")) {
+			SET_OPTION(dirid_groups);
+			continue;
+		}
+		if (!strcmp(this_char, "oid_groups")) {
+			SET_OPTION(oid_groups);
+			continue;
+		}
+		if (!strcmp(this_char, "packing_groups")) {
+			SET_OPTION(packing_groups);
+			continue;
+		}
+		if (!strcmp(this_char, "hashed_formatted_nodes")) {
+			SET_OPTION(hashed_formatted_nodes);
+			continue;
+		}
 
-	if (!strcmp(this_char, "skip_busy")) {
-	    SET_OPTION(skip_busy);
-	    continue;
-	}
+		if (!strcmp(this_char, "skip_busy")) {
+			SET_OPTION(skip_busy);
+			continue;
+		}
 
-	if (!strcmp(this_char, "hundredth_slices")) {
-	    SET_OPTION(hundredth_slices);
-	    continue;
-	}
+		if (!strcmp(this_char, "hundredth_slices")) {
+			SET_OPTION(hundredth_slices);
+			continue;
+		}
 
-	if (!strcmp(this_char, "old_way")) {
-	    SET_OPTION(old_way);
-	    continue;
-	}
+		if (!strcmp(this_char, "old_way")) {
+			SET_OPTION(old_way);
+			continue;
+		}
 
-	if (!strcmp(this_char, "displace_based_on_dirid")) {
-	    SET_OPTION(displace_based_on_dirid);
-	    continue;
-	}
+		if (!strcmp(this_char, "displace_based_on_dirid")) {
+			SET_OPTION(displace_based_on_dirid);
+			continue;
+		}
 
-	if (!strcmp(this_char, "preallocmin")) {
-	    REISERFS_SB(s)->s_alloc_options.preallocmin =
-		(value && *value) ? simple_strtoul (value, &value, 0) : 4;
-	    continue;
-	}
+		if (!strcmp(this_char, "preallocmin")) {
+			REISERFS_SB(s)->s_alloc_options.preallocmin =
+			    (value
+			     && *value) ? simple_strtoul(value, &value, 0) : 4;
+			continue;
+		}
+
+		if (!strcmp(this_char, "preallocsize")) {
+			REISERFS_SB(s)->s_alloc_options.preallocsize =
+			    (value
+			     && *value) ? simple_strtoul(value, &value,
+							 0) :
+			    PREALLOCATION_SIZE;
+			continue;
+		}
 
-	if (!strcmp(this_char, "preallocsize")) {
-	    REISERFS_SB(s)->s_alloc_options.preallocsize =
-		(value && *value) ? simple_strtoul (value, &value, 0) : PREALLOCATION_SIZE;
-	    continue;
+		reiserfs_warning(s, "zam-4001: %s : unknown option - %s",
+				 __FUNCTION__, this_char);
+		return 1;
 	}
 
-	reiserfs_warning (s, "zam-4001: %s : unknown option - %s",
-			  __FUNCTION__ , this_char);
-	return 1;
-      }
-  
-    reiserfs_warning (s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
-    return 0;
+	reiserfs_warning(s, "allocator options = [%08x]\n", SB_ALLOC_OPTS(s));
+	return 0;
 }
-  
-static inline void new_hashed_relocation (reiserfs_blocknr_hint_t * hint)
+
+static inline void new_hashed_relocation(reiserfs_blocknr_hint_t * hint)
 {
-    char * hash_in;
-    if (hint->formatted_node) {
-	    hash_in = (char*)&hint->key.k_dir_id;
-    } else {
-	if (!hint->inode) {
-	    //hint->search_start = hint->beg;
-	    hash_in = (char*)&hint->key.k_dir_id;
-	} else 
-	    if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
-		hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
-	    else
-		hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid);
-      }
+	char *hash_in;
+	if (hint->formatted_node) {
+		hash_in = (char *)&hint->key.k_dir_id;
+	} else {
+		if (!hint->inode) {
+			//hint->search_start = hint->beg;
+			hash_in = (char *)&hint->key.k_dir_id;
+		} else
+		    if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
+			hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
+		else
+			hash_in =
+			    (char *)(&INODE_PKEY(hint->inode)->k_objectid);
+	}
 
-    hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
+	hint->search_start =
+	    hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
 }
 
 /*
  * Relocation based on dirid, hashing them into a given bitmap block
  * files. Formatted nodes are unaffected, a seperate policy covers them
  */
-static void
-dirid_groups (reiserfs_blocknr_hint_t *hint)
+static void dirid_groups(reiserfs_blocknr_hint_t * hint)
 {
-    unsigned long hash;
-    __u32 dirid = 0;
-    int bm = 0;
-    struct super_block *sb = hint->th->t_super;
-    if (hint->inode)
-	dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
-    else if (hint->formatted_node)
-        dirid = hint->key.k_dir_id;
-
-    if (dirid) {
-	bm = bmap_hash_id(sb, dirid);
-	hash = bm * (sb->s_blocksize << 3);
-	/* give a portion of the block group to metadata */
+	unsigned long hash;
+	__u32 dirid = 0;
+	int bm = 0;
+	struct super_block *sb = hint->th->t_super;
 	if (hint->inode)
-	    hash += sb->s_blocksize/2;
-	hint->search_start = hash;
-    }
+		dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
+	else if (hint->formatted_node)
+		dirid = hint->key.k_dir_id;
+
+	if (dirid) {
+		bm = bmap_hash_id(sb, dirid);
+		hash = bm * (sb->s_blocksize << 3);
+		/* give a portion of the block group to metadata */
+		if (hint->inode)
+			hash += sb->s_blocksize / 2;
+		hint->search_start = hash;
+	}
 }
 
 /*
  * Relocation based on oid, hashing them into a given bitmap block
  * files. Formatted nodes are unaffected, a seperate policy covers them
  */
-static void
-oid_groups (reiserfs_blocknr_hint_t *hint)
+static void oid_groups(reiserfs_blocknr_hint_t * hint)
 {
-    if (hint->inode) {
-	unsigned long hash;
-	__u32 oid;
-	__u32 dirid;
-	int bm;
-
-	dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
-
-	/* keep the root dir and it's first set of subdirs close to
-	 * the start of the disk
-	 */
-	if (dirid <= 2)
-	    hash = (hint->inode->i_sb->s_blocksize << 3);
-	else {
-	    oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid);
-	    bm = bmap_hash_id(hint->inode->i_sb, oid);
-	    hash = bm * (hint->inode->i_sb->s_blocksize << 3);
+	if (hint->inode) {
+		unsigned long hash;
+		__u32 oid;
+		__u32 dirid;
+		int bm;
+
+		dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
+
+		/* keep the root dir and its first set of subdirs close to
+		 * the start of the disk
+		 */
+		if (dirid <= 2)
+			hash = (hint->inode->i_sb->s_blocksize << 3);
+		else {
+			oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid);
+			bm = bmap_hash_id(hint->inode->i_sb, oid);
+			hash = bm * (hint->inode->i_sb->s_blocksize << 3);
+		}
+		hint->search_start = hash;
 	}
-	hint->search_start = hash;
-    }
 }
 
 /* returns 1 if it finds an indirect item and gets valid hint info
  * from it, otherwise 0
  */
-static int get_left_neighbor(reiserfs_blocknr_hint_t *hint)
+static int get_left_neighbor(reiserfs_blocknr_hint_t * hint)
 {
-    struct path * path;
-    struct buffer_head * bh;
-    struct item_head * ih;
-    int pos_in_item;
-    __le32 * item;
-    int ret = 0;
-
-    if (!hint->path)		/* reiserfs code can call this function w/o pointer to path
+	struct path *path;
+	struct buffer_head *bh;
+	struct item_head *ih;
+	int pos_in_item;
+	__le32 *item;
+	int ret = 0;
+
+	if (!hint->path)	/* reiserfs code can call this function w/o pointer to path
 				 * structure supplied; then we rely on supplied search_start */
-	return 0;
-
-    path = hint->path;
-    bh = get_last_bh(path);
-    RFALSE( !bh, "green-4002: Illegal path specified to get_left_neighbor");
-    ih = get_ih(path);
-    pos_in_item = path->pos_in_item;
-    item = get_item (path);
-
-    hint->search_start = bh->b_blocknr;
-
-    if (!hint->formatted_node && is_indirect_le_ih (ih)) {
-	/* for indirect item: go to left and look for the first non-hole entry
-	   in the indirect item */
-	if (pos_in_item == I_UNFM_NUM (ih))
-	    pos_in_item--;
-//	    pos_in_item = I_UNFM_NUM (ih) - 1;
-	while (pos_in_item >= 0) {
-	    int t=get_block_num(item,pos_in_item);
-	    if (t) {
-		hint->search_start = t;
-		ret = 1;
-		break;
-	    }
-	    pos_in_item --;
+		return 0;
+
+	path = hint->path;
+	bh = get_last_bh(path);
+	RFALSE(!bh, "green-4002: Illegal path specified to get_left_neighbor");
+	ih = get_ih(path);
+	pos_in_item = path->pos_in_item;
+	item = get_item(path);
+
+	hint->search_start = bh->b_blocknr;
+
+	if (!hint->formatted_node && is_indirect_le_ih(ih)) {
+		/* for indirect item: go to left and look for the first non-hole entry
+		   in the indirect item */
+		if (pos_in_item == I_UNFM_NUM(ih))
+			pos_in_item--;
+//          pos_in_item = I_UNFM_NUM (ih) - 1;
+		while (pos_in_item >= 0) {
+			int t = get_block_num(item, pos_in_item);
+			if (t) {
+				hint->search_start = t;
+				ret = 1;
+				break;
+			}
+			pos_in_item--;
+		}
 	}
-    }
 
-    /* does result value fit into specified region? */
-    return ret;
+	/* does result value fit into specified region? */
+	return ret;
 }
 
 /* should be, if formatted node, then try to put on first part of the device
    specified as number of percent with mount option device, else try to put
    on last of device.  This is not to say it is good code to do so,
    but the effect should be measured.  */
-static inline void set_border_in_hint(struct super_block *s, reiserfs_blocknr_hint_t *hint)
+static inline void set_border_in_hint(struct super_block *s,
+				      reiserfs_blocknr_hint_t * hint)
 {
-    b_blocknr_t border = SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border;
+	b_blocknr_t border =
+	    SB_BLOCK_COUNT(s) / REISERFS_SB(s)->s_alloc_options.border;
 
-    if (hint->formatted_node)
-	hint->end = border - 1;
-    else
-	hint->beg = border;
+	if (hint->formatted_node)
+		hint->end = border - 1;
+	else
+		hint->beg = border;
 }
 
-static inline void displace_large_file(reiserfs_blocknr_hint_t *hint)
+static inline void displace_large_file(reiserfs_blocknr_hint_t * hint)
 {
-    if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
-	hint->search_start = hint->beg + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id), 4) % (hint->end - hint->beg);
-    else
-	hint->search_start = hint->beg + keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid), 4) % (hint->end - hint->beg);
+	if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
+		hint->search_start =
+		    hint->beg +
+		    keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_dir_id),
+			       4) % (hint->end - hint->beg);
+	else
+		hint->search_start =
+		    hint->beg +
+		    keyed_hash((char *)(&INODE_PKEY(hint->inode)->k_objectid),
+			       4) % (hint->end - hint->beg);
 }
 
-static inline void hash_formatted_node(reiserfs_blocknr_hint_t *hint)
+static inline void hash_formatted_node(reiserfs_blocknr_hint_t * hint)
 {
-   char * hash_in;
+	char *hash_in;
 
-   if (!hint->inode)
-	hash_in = (char*)&hint->key.k_dir_id;
-    else if ( TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
-	hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
-    else
-	hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid);
+	if (!hint->inode)
+		hash_in = (char *)&hint->key.k_dir_id;
+	else if (TEST_OPTION(displace_based_on_dirid, hint->th->t_super))
+		hash_in = (char *)(&INODE_PKEY(hint->inode)->k_dir_id);
+	else
+		hash_in = (char *)(&INODE_PKEY(hint->inode)->k_objectid);
 
-	hint->search_start = hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
+	hint->search_start =
+	    hint->beg + keyed_hash(hash_in, 4) % (hint->end - hint->beg);
 }
 
-static inline int this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t *hint)
+static inline int
+this_blocknr_allocation_would_make_it_a_large_file(reiserfs_blocknr_hint_t *
+						   hint)
 {
-    return hint->block == REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size;
+	return hint->block ==
+	    REISERFS_SB(hint->th->t_super)->s_alloc_options.large_file_size;
 }
 
 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
-static inline void displace_new_packing_locality (reiserfs_blocknr_hint_t *hint)
+static inline void displace_new_packing_locality(reiserfs_blocknr_hint_t * hint)
 {
-    struct in_core_key * key = &hint->key;
+	struct in_core_key *key = &hint->key;
 
-    hint->th->displace_new_blocks = 0;
-    hint->search_start = hint->beg + keyed_hash((char*)(&key->k_objectid),4) % (hint->end - hint->beg);
+	hint->th->displace_new_blocks = 0;
+	hint->search_start =
+	    hint->beg + keyed_hash((char *)(&key->k_objectid),
+				   4) % (hint->end - hint->beg);
 }
-  #endif
+#endif
 
-static inline int old_hashed_relocation (reiserfs_blocknr_hint_t * hint)
+static inline int old_hashed_relocation(reiserfs_blocknr_hint_t * hint)
 {
-    b_blocknr_t border;
-    u32 hash_in;
-    
-    if (hint->formatted_node || hint->inode == NULL) {
-	return 0;
-      }
+	b_blocknr_t border;
+	u32 hash_in;
 
-    hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id);
-    border = hint->beg + (u32) keyed_hash(((char *) (&hash_in)), 4) % (hint->end - hint->beg - 1);
-    if (border > hint->search_start)
-	hint->search_start = border;
+	if (hint->formatted_node || hint->inode == NULL) {
+		return 0;
+	}
 
-    return 1;
-  }
-  
-static inline int old_way (reiserfs_blocknr_hint_t * hint)
-{
-    b_blocknr_t border;
-    
-    if (hint->formatted_node || hint->inode == NULL) {
-	return 0;
-    }
-  
-      border = hint->beg + le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end  - hint->beg);
-    if (border > hint->search_start)
-	hint->search_start = border;
+	hash_in = le32_to_cpu((INODE_PKEY(hint->inode))->k_dir_id);
+	border =
+	    hint->beg + (u32) keyed_hash(((char *)(&hash_in)),
+					 4) % (hint->end - hint->beg - 1);
+	if (border > hint->search_start)
+		hint->search_start = border;
 
-    return 1;
+	return 1;
 }
 
-static inline void hundredth_slices (reiserfs_blocknr_hint_t * hint)
+static inline int old_way(reiserfs_blocknr_hint_t * hint)
 {
-    struct in_core_key * key = &hint->key;
-    b_blocknr_t slice_start;
+	b_blocknr_t border;
+
+	if (hint->formatted_node || hint->inode == NULL) {
+		return 0;
+	}
+
+	border =
+	    hint->beg +
+	    le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id) % (hint->end -
+							      hint->beg);
+	if (border > hint->search_start)
+		hint->search_start = border;
 
-    slice_start = (keyed_hash((char*)(&key->k_dir_id),4) % 100) * (hint->end / 100);
-    if ( slice_start > hint->search_start || slice_start + (hint->end / 100) <= hint->search_start) {
-	hint->search_start = slice_start;
-    }
+	return 1;
+}
+
+static inline void hundredth_slices(reiserfs_blocknr_hint_t * hint)
+{
+	struct in_core_key *key = &hint->key;
+	b_blocknr_t slice_start;
+
+	slice_start =
+	    (keyed_hash((char *)(&key->k_dir_id), 4) % 100) * (hint->end / 100);
+	if (slice_start > hint->search_start
+	    || slice_start + (hint->end / 100) <= hint->search_start) {
+		hint->search_start = slice_start;
+	}
 }
-  
-static void determine_search_start(reiserfs_blocknr_hint_t *hint,
-					  int amount_needed)
+
+static void determine_search_start(reiserfs_blocknr_hint_t * hint,
+				   int amount_needed)
 {
-    struct super_block *s = hint->th->t_super;
-    int unfm_hint;
+	struct super_block *s = hint->th->t_super;
+	int unfm_hint;
 
-    hint->beg = 0;
-    hint->end = SB_BLOCK_COUNT(s) - 1;
+	hint->beg = 0;
+	hint->end = SB_BLOCK_COUNT(s) - 1;
 
-    /* This is former border algorithm. Now with tunable border offset */
-    if (concentrating_formatted_nodes(s))
-	set_border_in_hint(s, hint);
+	/* This is former border algorithm. Now with tunable border offset */
+	if (concentrating_formatted_nodes(s))
+		set_border_in_hint(s, hint);
 
 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
-    /* whenever we create a new directory, we displace it.  At first we will
-       hash for location, later we might look for a moderately empty place for
-       it */
-    if (displacing_new_packing_localities(s)
-	&& hint->th->displace_new_blocks) {
-	displace_new_packing_locality(hint);
-
-	/* we do not continue determine_search_start,
-	 * if new packing locality is being displaced */
-	return;
-    }				      
+	/* whenever we create a new directory, we displace it.  At first we will
+	   hash for location, later we might look for a moderately empty place for
+	   it */
+	if (displacing_new_packing_localities(s)
+	    && hint->th->displace_new_blocks) {
+		displace_new_packing_locality(hint);
+
+		/* we do not continue determine_search_start,
+		 * if new packing locality is being displaced */
+		return;
+	}
 #endif
-  
-    /* all persons should feel encouraged to add more special cases here and
-     * test them */
 
-    if (displacing_large_files(s) && !hint->formatted_node
-	&& this_blocknr_allocation_would_make_it_a_large_file(hint)) {
-	displace_large_file(hint);
-	return;
-    }
-
-    /* if none of our special cases is relevant, use the left neighbor in the
-       tree order of the new node we are allocating for */
-    if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes,s)) {
-        hash_formatted_node(hint);
-	return;
-    }
+	/* all persons should feel encouraged to add more special cases here and
+	 * test them */
 
-    unfm_hint = get_left_neighbor(hint);
+	if (displacing_large_files(s) && !hint->formatted_node
+	    && this_blocknr_allocation_would_make_it_a_large_file(hint)) {
+		displace_large_file(hint);
+		return;
+	}
 
-    /* Mimic old block allocator behaviour, that is if VFS allowed for preallocation,
-       new blocks are displaced based on directory ID. Also, if suggested search_start
-       is less than last preallocated block, we start searching from it, assuming that
-       HDD dataflow is faster in forward direction */
-    if ( TEST_OPTION(old_way, s)) {
-	if (!hint->formatted_node) {
-	    if ( !reiserfs_hashed_relocation(s))
-		old_way(hint);
-	    else if (!reiserfs_no_unhashed_relocation(s))
-		old_hashed_relocation(hint);
+	/* if none of our special cases is relevant, use the left neighbor in the
+	   tree order of the new node we are allocating for */
+	if (hint->formatted_node && TEST_OPTION(hashed_formatted_nodes, s)) {
+		hash_formatted_node(hint);
+		return;
+	}
 
-	    if ( hint->inode && hint->search_start < REISERFS_I(hint->inode)->i_prealloc_block)
-		hint->search_start = REISERFS_I(hint->inode)->i_prealloc_block;
+	unfm_hint = get_left_neighbor(hint);
+
+	/* Mimic old block allocator behaviour, that is if VFS allowed for preallocation,
+	   new blocks are displaced based on directory ID. Also, if suggested search_start
+	   is less than last preallocated block, we start searching from it, assuming that
+	   HDD dataflow is faster in forward direction */
+	if (TEST_OPTION(old_way, s)) {
+		if (!hint->formatted_node) {
+			if (!reiserfs_hashed_relocation(s))
+				old_way(hint);
+			else if (!reiserfs_no_unhashed_relocation(s))
+				old_hashed_relocation(hint);
+
+			if (hint->inode
+			    && hint->search_start <
+			    REISERFS_I(hint->inode)->i_prealloc_block)
+				hint->search_start =
+				    REISERFS_I(hint->inode)->i_prealloc_block;
+		}
+		return;
 	}
-	return;
-    }
 
-    /* This is an approach proposed by Hans */
-    if ( TEST_OPTION(hundredth_slices, s) && ! (displacing_large_files(s) && !hint->formatted_node)) {
-	hundredth_slices(hint);
-	return;
-    }
-
-    /* old_hashed_relocation only works on unformatted */
-    if (!unfm_hint && !hint->formatted_node &&
-        TEST_OPTION(old_hashed_relocation, s))
-    {
-	old_hashed_relocation(hint);
-    }
-    /* new_hashed_relocation works with both formatted/unformatted nodes */
-    if ((!unfm_hint || hint->formatted_node) &&
-        TEST_OPTION(new_hashed_relocation, s))
-    {
-	new_hashed_relocation(hint);
-    }
-    /* dirid grouping works only on unformatted nodes */
-    if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups,s))
-    {
-        dirid_groups(hint);
-    }
+	/* This is an approach proposed by Hans */
+	if (TEST_OPTION(hundredth_slices, s)
+	    && !(displacing_large_files(s) && !hint->formatted_node)) {
+		hundredth_slices(hint);
+		return;
+	}
 
+	/* old_hashed_relocation only works on unformatted */
+	if (!unfm_hint && !hint->formatted_node &&
+	    TEST_OPTION(old_hashed_relocation, s)) {
+		old_hashed_relocation(hint);
+	}
+	/* new_hashed_relocation works with both formatted/unformatted nodes */
+	if ((!unfm_hint || hint->formatted_node) &&
+	    TEST_OPTION(new_hashed_relocation, s)) {
+		new_hashed_relocation(hint);
+	}
+	/* dirid grouping works only on unformatted nodes */
+	if (!unfm_hint && !hint->formatted_node && TEST_OPTION(dirid_groups, s)) {
+		dirid_groups(hint);
+	}
 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
-    if (hint->formatted_node && TEST_OPTION(dirid_groups,s))
-    {
-        dirid_groups(hint);
-    }
+	if (hint->formatted_node && TEST_OPTION(dirid_groups, s)) {
+		dirid_groups(hint);
+	}
 #endif
 
-    /* oid grouping works only on unformatted nodes */
-    if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups,s))
-    {
-        oid_groups(hint);
-    }
-    return;
+	/* oid grouping works only on unformatted nodes */
+	if (!unfm_hint && !hint->formatted_node && TEST_OPTION(oid_groups, s)) {
+		oid_groups(hint);
+	}
+	return;
 }
 
 static int determine_prealloc_size(reiserfs_blocknr_hint_t * hint)
 {
-    /* make minimum size a mount option and benchmark both ways */
-    /* we preallocate blocks only for regular files, specific size */
-    /* benchmark preallocating always and see what happens */
-
-    hint->prealloc_size = 0;
-
-    if (!hint->formatted_node && hint->preallocate) {
-	if (S_ISREG(hint->inode->i_mode)
-	    && hint->inode->i_size >= REISERFS_SB(hint->th->t_super)->s_alloc_options.preallocmin * hint->inode->i_sb->s_blocksize)
-	    hint->prealloc_size = REISERFS_SB(hint->th->t_super)->s_alloc_options.preallocsize - 1;
-    }
-    return CARRY_ON;
+	/* make minimum size a mount option and benchmark both ways */
+	/* we preallocate blocks only for regular files, specific size */
+	/* benchmark preallocating always and see what happens */
+
+	hint->prealloc_size = 0;
+
+	if (!hint->formatted_node && hint->preallocate) {
+		if (S_ISREG(hint->inode->i_mode)
+		    && hint->inode->i_size >=
+		    REISERFS_SB(hint->th->t_super)->s_alloc_options.
+		    preallocmin * hint->inode->i_sb->s_blocksize)
+			hint->prealloc_size =
+			    REISERFS_SB(hint->th->t_super)->s_alloc_options.
+			    preallocsize - 1;
+	}
+	return CARRY_ON;
 }
 
 /* XXX I know it could be merged with upper-level function;
    but may be result function would be too complex. */
-static inline int allocate_without_wrapping_disk (reiserfs_blocknr_hint_t * hint,
-					 b_blocknr_t * new_blocknrs,
-					 b_blocknr_t start, b_blocknr_t finish,
-					 int min,
-					 int amount_needed, int prealloc_size)
+static inline int allocate_without_wrapping_disk(reiserfs_blocknr_hint_t * hint,
+						 b_blocknr_t * new_blocknrs,
+						 b_blocknr_t start,
+						 b_blocknr_t finish, int min,
+						 int amount_needed,
+						 int prealloc_size)
 {
-    int rest = amount_needed;
-    int nr_allocated;
-  
-    while (rest > 0 && start <= finish) {
-	nr_allocated = scan_bitmap (hint->th, &start, finish, min,
-				    rest + prealloc_size, !hint->formatted_node,
-				    hint->block);
-
-	if (nr_allocated == 0)	/* no new blocks allocated, return */
-	    break;
-	
-	/* fill free_blocknrs array first */
-	while (rest > 0 && nr_allocated > 0) {
-	    * new_blocknrs ++ = start ++;
-	    rest --; nr_allocated --;
-	}
+	int rest = amount_needed;
+	int nr_allocated;
+
+	while (rest > 0 && start <= finish) {
+		nr_allocated = scan_bitmap(hint->th, &start, finish, min,
+					   rest + prealloc_size,
+					   !hint->formatted_node, hint->block);
+
+		if (nr_allocated == 0)	/* no new blocks allocated, return */
+			break;
+
+		/* fill free_blocknrs array first */
+		while (rest > 0 && nr_allocated > 0) {
+			*new_blocknrs++ = start++;
+			rest--;
+			nr_allocated--;
+		}
 
-	/* do we have something to fill prealloc. array also ? */
-	if (nr_allocated > 0) {
-	    /* it means prealloc_size was greater that 0 and we do preallocation */
-	    list_add(&REISERFS_I(hint->inode)->i_prealloc_list,
-		     &SB_JOURNAL(hint->th->t_super)->j_prealloc_list);
-	    REISERFS_I(hint->inode)->i_prealloc_block = start;
-	    REISERFS_I(hint->inode)->i_prealloc_count = nr_allocated;
-	    break;
+		/* do we have something to fill prealloc. array also ? */
+		if (nr_allocated > 0) {
+			/* it means prealloc_size was greater than 0 and we do preallocation */
+			list_add(&REISERFS_I(hint->inode)->i_prealloc_list,
+				 &SB_JOURNAL(hint->th->t_super)->
+				 j_prealloc_list);
+			REISERFS_I(hint->inode)->i_prealloc_block = start;
+			REISERFS_I(hint->inode)->i_prealloc_count =
+			    nr_allocated;
+			break;
+		}
 	}
-    }
 
-    return (amount_needed - rest);
+	return (amount_needed - rest);
 }
 
 static inline int blocknrs_and_prealloc_arrays_from_search_start
-    (reiserfs_blocknr_hint_t *hint, b_blocknr_t *new_blocknrs, int amount_needed)
-{
-    struct super_block *s = hint->th->t_super;
-    b_blocknr_t start = hint->search_start;
-    b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
-    int passno = 0;
-    int nr_allocated = 0;
-    int bigalloc = 0;
-
-    determine_prealloc_size(hint);
-    if (!hint->formatted_node) {
-        int quota_ret;
+    (reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs,
+     int amount_needed) {
+	struct super_block *s = hint->th->t_super;
+	b_blocknr_t start = hint->search_start;
+	b_blocknr_t finish = SB_BLOCK_COUNT(s) - 1;
+	int passno = 0;
+	int nr_allocated = 0;
+	int bigalloc = 0;
+
+	determine_prealloc_size(hint);
+	if (!hint->formatted_node) {
+		int quota_ret;
 #ifdef REISERQUOTA_DEBUG
-	reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: allocating %d blocks id=%u", amount_needed, hint->inode->i_uid);
+		reiserfs_debug(s, REISERFS_DEBUG_CODE,
+			       "reiserquota: allocating %d blocks id=%u",
+			       amount_needed, hint->inode->i_uid);
 #endif
-	quota_ret = DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed);
-	if (quota_ret)    /* Quota exceeded? */
-	    return QUOTA_EXCEEDED;
-	if (hint->preallocate && hint->prealloc_size ) {
+		quota_ret =
+		    DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed);
+		if (quota_ret)	/* Quota exceeded? */
+			return QUOTA_EXCEEDED;
+		if (hint->preallocate && hint->prealloc_size) {
 #ifdef REISERQUOTA_DEBUG
-	    reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: allocating (prealloc) %d blocks id=%u", hint->prealloc_size, hint->inode->i_uid);
+			reiserfs_debug(s, REISERFS_DEBUG_CODE,
+				       "reiserquota: allocating (prealloc) %d blocks id=%u",
+				       hint->prealloc_size, hint->inode->i_uid);
 #endif
-	    quota_ret = DQUOT_PREALLOC_BLOCK_NODIRTY(hint->inode, hint->prealloc_size);
-	    if (quota_ret)
-		hint->preallocate=hint->prealloc_size=0;
+			quota_ret =
+			    DQUOT_PREALLOC_BLOCK_NODIRTY(hint->inode,
+							 hint->prealloc_size);
+			if (quota_ret)
+				hint->preallocate = hint->prealloc_size = 0;
+		}
+		/* for unformatted nodes, force large allocations */
+		bigalloc = amount_needed;
 	}
-	/* for unformatted nodes, force large allocations */
-	bigalloc = amount_needed;
-    }
 
-    do {
-	/* in bigalloc mode, nr_allocated should stay zero until
-	 * the entire allocation is filled
-	 */
-	if (unlikely(bigalloc && nr_allocated)) {
-	    reiserfs_warning(s, "bigalloc is %d, nr_allocated %d\n",
-	    bigalloc, nr_allocated);
-	    /* reset things to a sane value */
-	    bigalloc = amount_needed - nr_allocated;
-	}
-	/*
-	 * try pass 0 and pass 1 looking for a nice big
-	 * contiguous allocation.  Then reset and look
-	 * for anything you can find.
-	 */
-	if (passno == 2 && bigalloc) {
-	    passno = 0;
-	    bigalloc = 0;
-	}
-	switch (passno++) {
-        case 0: /* Search from hint->search_start to end of disk */
-	    start = hint->search_start;
-	    finish = SB_BLOCK_COUNT(s) - 1;
-	    break;
-        case 1: /* Search from hint->beg to hint->search_start */
-	    start = hint->beg;
-	    finish = hint->search_start;
-	    break;
-	case 2: /* Last chance: Search from 0 to hint->beg */
-	    start = 0;
-	    finish = hint->beg;
-	    break;
-	default: /* We've tried searching everywhere, not enough space */
-	    /* Free the blocks */
-	    if (!hint->formatted_node) {
+	do {
+		/* in bigalloc mode, nr_allocated should stay zero until
+		 * the entire allocation is filled
+		 */
+		if (unlikely(bigalloc && nr_allocated)) {
+			reiserfs_warning(s, "bigalloc is %d, nr_allocated %d\n",
+					 bigalloc, nr_allocated);
+			/* reset things to a sane value */
+			bigalloc = amount_needed - nr_allocated;
+		}
+		/*
+		 * try pass 0 and pass 1 looking for a nice big
+		 * contiguous allocation.  Then reset and look
+		 * for anything you can find.
+		 */
+		if (passno == 2 && bigalloc) {
+			passno = 0;
+			bigalloc = 0;
+		}
+		switch (passno++) {
+		case 0:	/* Search from hint->search_start to end of disk */
+			start = hint->search_start;
+			finish = SB_BLOCK_COUNT(s) - 1;
+			break;
+		case 1:	/* Search from hint->beg to hint->search_start */
+			start = hint->beg;
+			finish = hint->search_start;
+			break;
+		case 2:	/* Last chance: Search from 0 to hint->beg */
+			start = 0;
+			finish = hint->beg;
+			break;
+		default:	/* We've tried searching everywhere, not enough space */
+			/* Free the blocks */
+			if (!hint->formatted_node) {
 #ifdef REISERQUOTA_DEBUG
-		reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: freeing (nospace) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated, hint->inode->i_uid);
+				reiserfs_debug(s, REISERFS_DEBUG_CODE,
+					       "reiserquota: freeing (nospace) %d blocks id=%u",
+					       amount_needed +
+					       hint->prealloc_size -
+					       nr_allocated,
+					       hint->inode->i_uid);
 #endif
-		DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated);     /* Free not allocated blocks */
-	    }
-  	    while (nr_allocated --)
-		reiserfs_free_block(hint->th, hint->inode, new_blocknrs[nr_allocated], !hint->formatted_node);
-
-	    return NO_DISK_SPACE;
-	}
-    } while ((nr_allocated += allocate_without_wrapping_disk (hint,
-			    new_blocknrs + nr_allocated, start, finish,
-			    bigalloc ? bigalloc : 1,
-			    amount_needed - nr_allocated,
-			    hint->prealloc_size))
-			< amount_needed);
-    if ( !hint->formatted_node &&
-         amount_needed + hint->prealloc_size >
-	 nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) {
-    /* Some of preallocation blocks were not allocated */
+				DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated);	/* Free not allocated blocks */
+			}
+			while (nr_allocated--)
+				reiserfs_free_block(hint->th, hint->inode,
+						    new_blocknrs[nr_allocated],
+						    !hint->formatted_node);
+
+			return NO_DISK_SPACE;
+		}
+	} while ((nr_allocated += allocate_without_wrapping_disk(hint,
+								 new_blocknrs +
+								 nr_allocated,
+								 start, finish,
+								 bigalloc ?
+								 bigalloc : 1,
+								 amount_needed -
+								 nr_allocated,
+								 hint->
+								 prealloc_size))
+		 < amount_needed);
+	if (!hint->formatted_node &&
+	    amount_needed + hint->prealloc_size >
+	    nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) {
+		/* Some of preallocation blocks were not allocated */
 #ifdef REISERQUOTA_DEBUG
-	reiserfs_debug (s, REISERFS_DEBUG_CODE, "reiserquota: freeing (failed prealloc) %d blocks id=%u", amount_needed + hint->prealloc_size - nr_allocated - REISERFS_I(hint->inode)->i_prealloc_count, hint->inode->i_uid);
+		reiserfs_debug(s, REISERFS_DEBUG_CODE,
+			       "reiserquota: freeing (failed prealloc) %d blocks id=%u",
+			       amount_needed + hint->prealloc_size -
+			       nr_allocated -
+			       REISERFS_I(hint->inode)->i_prealloc_count,
+			       hint->inode->i_uid);
 #endif
-	DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed +
-	                         hint->prealloc_size - nr_allocated -
-				 REISERFS_I(hint->inode)->i_prealloc_count);
-    }
+		DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed +
+					 hint->prealloc_size - nr_allocated -
+					 REISERFS_I(hint->inode)->
+					 i_prealloc_count);
+	}
 
-    return CARRY_ON;
+	return CARRY_ON;
 }
 
 /* grab new blocknrs from preallocated list */
 /* return amount still needed after using them */
-static int use_preallocated_list_if_available (reiserfs_blocknr_hint_t *hint,
-					       b_blocknr_t *new_blocknrs, int amount_needed)
+static int use_preallocated_list_if_available(reiserfs_blocknr_hint_t * hint,
+					      b_blocknr_t * new_blocknrs,
+					      int amount_needed)
 {
-    struct inode * inode = hint->inode;
+	struct inode *inode = hint->inode;
 
-    if (REISERFS_I(inode)->i_prealloc_count > 0) {
-	while (amount_needed) {
+	if (REISERFS_I(inode)->i_prealloc_count > 0) {
+		while (amount_needed) {
 
-	    *new_blocknrs ++ = REISERFS_I(inode)->i_prealloc_block ++;
-	    REISERFS_I(inode)->i_prealloc_count --;
+			*new_blocknrs++ = REISERFS_I(inode)->i_prealloc_block++;
+			REISERFS_I(inode)->i_prealloc_count--;
 
-	    amount_needed --;
+			amount_needed--;
 
-	    if (REISERFS_I(inode)->i_prealloc_count <= 0) {
-		list_del(&REISERFS_I(inode)->i_prealloc_list);  
-		break;
-	    }
+			if (REISERFS_I(inode)->i_prealloc_count <= 0) {
+				list_del(&REISERFS_I(inode)->i_prealloc_list);
+				break;
+			}
+		}
 	}
-      }
-    /* return amount still needed after using preallocated blocks */
-    return amount_needed;
+	/* return amount still needed after using preallocated blocks */
+	return amount_needed;
 }
 
-int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *hint,
-			       b_blocknr_t * new_blocknrs, int amount_needed,
-			       int reserved_by_us /* Amount of blocks we have
-						      already reserved */)
+int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t * hint, b_blocknr_t * new_blocknrs, int amount_needed, int reserved_by_us	/* Amount of blocks we have
+																	   already reserved */ )
 {
-    int initial_amount_needed = amount_needed;
-    int ret;
-    struct super_block *s = hint->th->t_super;
-
-    /* Check if there is enough space, taking into account reserved space */
-    if ( SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks <
-	 amount_needed - reserved_by_us)
-        return NO_DISK_SPACE;
-    /* should this be if !hint->inode &&  hint->preallocate? */
-    /* do you mean hint->formatted_node can be removed ? - Zam */
-    /* hint->formatted_node cannot be removed because we try to access
-       inode information here, and there is often no inode assotiated with
-       metadata allocations - green */
-
-    if (!hint->formatted_node && hint->preallocate) {
-	amount_needed = use_preallocated_list_if_available
+	int initial_amount_needed = amount_needed;
+	int ret;
+	struct super_block *s = hint->th->t_super;
+
+	/* Check if there is enough space, taking into account reserved space */
+	if (SB_FREE_BLOCKS(s) - REISERFS_SB(s)->reserved_blocks <
+	    amount_needed - reserved_by_us)
+		return NO_DISK_SPACE;
+	/* should this be if !hint->inode &&  hint->preallocate? */
+	/* do you mean hint->formatted_node can be removed ? - Zam */
+	/* hint->formatted_node cannot be removed because we try to access
+	   inode information here, and there is often no inode associated with
+	   metadata allocations - green */
+
+	if (!hint->formatted_node && hint->preallocate) {
+		amount_needed = use_preallocated_list_if_available
+		    (hint, new_blocknrs, amount_needed);
+		if (amount_needed == 0)	/* all blocknrs we need we got from
+					   prealloc. list */
+			return CARRY_ON;
+		new_blocknrs += (initial_amount_needed - amount_needed);
+	}
+
+	/* find search start and save it in hint structure */
+	determine_search_start(hint, amount_needed);
+	if (hint->search_start >= SB_BLOCK_COUNT(s))
+		hint->search_start = SB_BLOCK_COUNT(s) - 1;
+
+	/* allocation itself; fill new_blocknrs and preallocation arrays */
+	ret = blocknrs_and_prealloc_arrays_from_search_start
 	    (hint, new_blocknrs, amount_needed);
-	if (amount_needed == 0)	/* all blocknrs we need we got from
-                                   prealloc. list */
-	    return CARRY_ON;
-	new_blocknrs += (initial_amount_needed - amount_needed);
-    }
-
-    /* find search start and save it in hint structure */
-    determine_search_start(hint, amount_needed);
-    if (hint->search_start >= SB_BLOCK_COUNT(s))
-        hint->search_start = SB_BLOCK_COUNT(s) - 1;
-
-    /* allocation itself; fill new_blocknrs and preallocation arrays */
-    ret = blocknrs_and_prealloc_arrays_from_search_start
-	(hint, new_blocknrs, amount_needed);
-
-    /* we used prealloc. list to fill (partially) new_blocknrs array. If final allocation fails we
-     * need to return blocks back to prealloc. list or just free them. -- Zam (I chose second
-     * variant) */
-
-    if (ret != CARRY_ON) {
-	while (amount_needed ++ < initial_amount_needed) {
-	    reiserfs_free_block(hint->th, hint->inode, *(--new_blocknrs), 1);
+
+	/* we used prealloc. list to fill (partially) new_blocknrs array. If final allocation fails we
+	 * need to return blocks back to prealloc. list or just free them. -- Zam (I chose second
+	 * variant) */
+
+	if (ret != CARRY_ON) {
+		while (amount_needed++ < initial_amount_needed) {
+			reiserfs_free_block(hint->th, hint->inode,
+					    *(--new_blocknrs), 1);
+		}
 	}
-    }
-    return ret;
+	return ret;
 }
 
 /* These 2 functions are here to provide blocks reservation to the rest of kernel */
 /* Reserve @blocks amount of blocks in fs pointed by @sb. Caller must make sure
    there are actually this much blocks on the FS available */
-void reiserfs_claim_blocks_to_be_allocated( 
-				      struct super_block *sb, /* super block of
-							        filesystem where
-								blocks should be
-								reserved */
-				      int blocks /* How much to reserve */
-					  )
+void reiserfs_claim_blocks_to_be_allocated(struct super_block *sb,	/* super block of
+									   filesystem where
+									   blocks should be
+									   reserved */
+					   int blocks	/* How much to reserve */
+    )
 {
 
-    /* Fast case, if reservation is zero - exit immediately. */
-    if ( !blocks )
-	return;
+	/* Fast case, if reservation is zero - exit immediately. */
+	if (!blocks)
+		return;
 
-    spin_lock(&REISERFS_SB(sb)->bitmap_lock);
-    REISERFS_SB(sb)->reserved_blocks += blocks;
-    spin_unlock(&REISERFS_SB(sb)->bitmap_lock);
+	spin_lock(&REISERFS_SB(sb)->bitmap_lock);
+	REISERFS_SB(sb)->reserved_blocks += blocks;
+	spin_unlock(&REISERFS_SB(sb)->bitmap_lock);
 }
 
 /* Unreserve @blocks amount of blocks in fs pointed by @sb */
-void reiserfs_release_claimed_blocks( 
-				struct super_block *sb, /* super block of
-							  filesystem where
-							  blocks should be
-							  reserved */
-				int blocks /* How much to unreserve */
-					  )
+void reiserfs_release_claimed_blocks(struct super_block *sb,	/* super block of
+								   filesystem where
+								   blocks should be
+								   reserved */
+				     int blocks	/* How much to unreserve */
+    )
 {
 
-    /* Fast case, if unreservation is zero - exit immediately. */
-    if ( !blocks )
-	return;
+	/* Fast case, if unreservation is zero - exit immediately. */
+	if (!blocks)
+		return;
 
-    spin_lock(&REISERFS_SB(sb)->bitmap_lock);
-    REISERFS_SB(sb)->reserved_blocks -= blocks;
-    spin_unlock(&REISERFS_SB(sb)->bitmap_lock);
-    RFALSE( REISERFS_SB(sb)->reserved_blocks < 0, "amount of blocks reserved became zero?");
+	spin_lock(&REISERFS_SB(sb)->bitmap_lock);
+	REISERFS_SB(sb)->reserved_blocks -= blocks;
+	spin_unlock(&REISERFS_SB(sb)->bitmap_lock);
+	RFALSE(REISERFS_SB(sb)->reserved_blocks < 0,
+	       "amount of blocks reserved became zero?");
 }
 
 /* This function estimates how much pages we will be able to write to FS
    used for reiserfs_file_write() purposes for now. */
-int reiserfs_can_fit_pages ( struct super_block *sb /* superblock of filesystem
-						       to estimate space */ )
+int reiserfs_can_fit_pages(struct super_block *sb	/* superblock of filesystem
+							   to estimate space */ )
 {
 	int space;
 
 	spin_lock(&REISERFS_SB(sb)->bitmap_lock);
-	space = (SB_FREE_BLOCKS(sb) - REISERFS_SB(sb)->reserved_blocks) >> ( PAGE_CACHE_SHIFT - sb->s_blocksize_bits);
+	space =
+	    (SB_FREE_BLOCKS(sb) -
+	     REISERFS_SB(sb)->reserved_blocks) >> (PAGE_CACHE_SHIFT -
+						   sb->s_blocksize_bits);
 	spin_unlock(&REISERFS_SB(sb)->bitmap_lock);
 
-	return space>0?space:0;
+	return space > 0 ? space : 0;
 }
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index fbde4b01a325..9dd71e807034 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -12,264 +12,286 @@
 #include <linux/buffer_head.h>
 #include <asm/uaccess.h>
 
-extern struct reiserfs_key  MIN_KEY;
+extern struct reiserfs_key MIN_KEY;
 
-static int reiserfs_readdir (struct file *, void *, filldir_t);
-static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, int datasync) ;
+static int reiserfs_readdir(struct file *, void *, filldir_t);
+static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
+			      int datasync);
 
 struct file_operations reiserfs_dir_operations = {
-    .read	= generic_read_dir,
-    .readdir	= reiserfs_readdir,
-    .fsync	= reiserfs_dir_fsync,
-    .ioctl	= reiserfs_ioctl,
+	.read = generic_read_dir,
+	.readdir = reiserfs_readdir,
+	.fsync = reiserfs_dir_fsync,
+	.ioctl = reiserfs_ioctl,
 };
 
-static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, int datasync) {
-  struct inode *inode = dentry->d_inode;
-  int err;
-  reiserfs_write_lock(inode->i_sb);
-  err = reiserfs_commit_for_inode(inode) ;
-  reiserfs_write_unlock(inode->i_sb) ;
-  if (err < 0)
-      return err;
-  return 0;
+static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
+			      int datasync)
+{
+	struct inode *inode = dentry->d_inode;
+	int err;
+	reiserfs_write_lock(inode->i_sb);
+	err = reiserfs_commit_for_inode(inode);
+	reiserfs_write_unlock(inode->i_sb);
+	if (err < 0)
+		return err;
+	return 0;
 }
 
-
 #define store_ih(where,what) copy_item_head (where, what)
 
 //
-static int reiserfs_readdir (struct file * filp, void * dirent, filldir_t filldir)
+static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
-    struct inode *inode = filp->f_dentry->d_inode;
-    struct cpu_key pos_key;	/* key of current position in the directory (key of directory entry) */
-    INITIALIZE_PATH (path_to_entry);
-    struct buffer_head * bh;
-    int item_num, entry_num;
-    const struct reiserfs_key * rkey;
-    struct item_head * ih, tmp_ih;
-    int search_res;
-    char * local_buf;
-    loff_t next_pos;
-    char small_buf[32] ; /* avoid kmalloc if we can */
-    struct reiserfs_dir_entry de;
-    int ret = 0;
-
-    reiserfs_write_lock(inode->i_sb);
-
-    reiserfs_check_lock_depth(inode->i_sb, "readdir") ;
-
-    /* form key for search the next directory entry using f_pos field of
-       file structure */
-    make_cpu_key (&pos_key, inode, (filp->f_pos) ? (filp->f_pos) : DOT_OFFSET,
-		  TYPE_DIRENTRY, 3);
-    next_pos = cpu_key_k_offset (&pos_key);
-
-    /*  reiserfs_warning (inode->i_sb, "reiserfs_readdir 1: f_pos = %Ld", filp->f_pos);*/
-
-    path_to_entry.reada = PATH_READA;
-    while (1) {
-    research:
-	/* search the directory item, containing entry with specified key */
-	search_res = search_by_entry_key (inode->i_sb, &pos_key, &path_to_entry, &de);
-	if (search_res == IO_ERROR) {
-	    // FIXME: we could just skip part of directory which could
-	    // not be read
-	    ret = -EIO;
-	    goto out;
-	}
-	entry_num = de.de_entry_num;
-	bh = de.de_bh;
-	item_num = de.de_item_num;
-	ih = de.de_ih;
-	store_ih (&tmp_ih, ih);
-		
-	/* we must have found item, that is item of this directory, */
-	RFALSE( COMP_SHORT_KEYS (&(ih->ih_key), &pos_key),
-		"vs-9000: found item %h does not match to dir we readdir %K",
-		ih, &pos_key);
-	RFALSE( item_num > B_NR_ITEMS (bh) - 1,
-		"vs-9005 item_num == %d, item amount == %d", 
-		item_num, B_NR_ITEMS (bh));
-      
-	/* and entry must be not more than number of entries in the item */
-	RFALSE( I_ENTRY_COUNT (ih) < entry_num,
-		"vs-9010: entry number is too big %d (%d)", 
-		entry_num, I_ENTRY_COUNT (ih));
-
-	if (search_res == POSITION_FOUND || entry_num < I_ENTRY_COUNT (ih)) {
-	    /* go through all entries in the directory item beginning from the entry, that has been found */
-	    struct reiserfs_de_head * deh = B_I_DEH (bh, ih) + entry_num;
-
-	    for (; entry_num < I_ENTRY_COUNT (ih); entry_num ++, deh ++) {
-		int d_reclen;
-		char * d_name;
-		off_t d_off;
-		ino_t d_ino;
-
-		if (!de_visible (deh))
-		    /* it is hidden entry */
-		    continue;
-		d_reclen = entry_length (bh, ih, entry_num);
-		d_name = B_I_DEH_ENTRY_FILE_NAME (bh, ih, deh);
-		if (!d_name[d_reclen - 1])
-		    d_reclen = strlen (d_name);
-	
-		if (d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)){
-		    /* too big to send back to VFS */
-		    continue ;
-		}
-
-                /* Ignore the .reiserfs_priv entry */
-                if (reiserfs_xattrs (inode->i_sb) &&
-                    !old_format_only(inode->i_sb) &&
-                    filp->f_dentry == inode->i_sb->s_root &&
-                    REISERFS_SB(inode->i_sb)->priv_root &&
-                    REISERFS_SB(inode->i_sb)->priv_root->d_inode &&
-                    deh_objectid(deh) == le32_to_cpu (INODE_PKEY(REISERFS_SB(inode->i_sb)->priv_root->d_inode)->k_objectid)) {
-                  continue;
-                }
-
-		d_off = deh_offset (deh);
-		filp->f_pos = d_off ;
-		d_ino = deh_objectid (deh);
-		if (d_reclen <= 32) {
-		  local_buf = small_buf ;
-		} else {
-		    local_buf = reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb) ;
-		    if (!local_buf) {
-			pathrelse (&path_to_entry);
-			ret = -ENOMEM ;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct cpu_key pos_key;	/* key of current position in the directory (key of directory entry) */
+	INITIALIZE_PATH(path_to_entry);
+	struct buffer_head *bh;
+	int item_num, entry_num;
+	const struct reiserfs_key *rkey;
+	struct item_head *ih, tmp_ih;
+	int search_res;
+	char *local_buf;
+	loff_t next_pos;
+	char small_buf[32];	/* avoid kmalloc if we can */
+	struct reiserfs_dir_entry de;
+	int ret = 0;
+
+	reiserfs_write_lock(inode->i_sb);
+
+	reiserfs_check_lock_depth(inode->i_sb, "readdir");
+
+	/* form the key used to search for the next directory entry from the
+	   f_pos field of the file structure */
+	make_cpu_key(&pos_key, inode,
+		     (filp->f_pos) ? (filp->f_pos) : DOT_OFFSET, TYPE_DIRENTRY,
+		     3);
+	next_pos = cpu_key_k_offset(&pos_key);
+
+	/*  reiserfs_warning (inode->i_sb, "reiserfs_readdir 1: f_pos = %Ld", filp->f_pos); */
+
+	path_to_entry.reada = PATH_READA;
+	while (1) {
+	      research:
+		/* search the directory item, containing entry with specified key */
+		search_res =
+		    search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry,
+					&de);
+		if (search_res == IO_ERROR) {
+			// FIXME: we could just skip part of directory which could
+			// not be read
+			ret = -EIO;
 			goto out;
-		    }
-		    if (item_moved (&tmp_ih, &path_to_entry)) {
-			reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ;
-			goto research;
-		    }
-		}
-		// Note, that we copy name to user space via temporary
-		// buffer (local_buf) because filldir will block if
-		// user space buffer is swapped out. At that time
-		// entry can move to somewhere else
-		memcpy (local_buf, d_name, d_reclen);
-		if (filldir (dirent, local_buf, d_reclen, d_off, d_ino, 
-		             DT_UNKNOWN) < 0) {
-		    if (local_buf != small_buf) {
-			reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ;
-		    }
-		    goto end;
 		}
-		if (local_buf != small_buf) {
-		    reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ;
+		entry_num = de.de_entry_num;
+		bh = de.de_bh;
+		item_num = de.de_item_num;
+		ih = de.de_ih;
+		store_ih(&tmp_ih, ih);
+
+		/* we must have found item, that is item of this directory, */
+		RFALSE(COMP_SHORT_KEYS(&(ih->ih_key), &pos_key),
+		       "vs-9000: found item %h does not match to dir we readdir %K",
+		       ih, &pos_key);
+		RFALSE(item_num > B_NR_ITEMS(bh) - 1,
+		       "vs-9005 item_num == %d, item amount == %d",
+		       item_num, B_NR_ITEMS(bh));
+
+		/* and entry must be not more than number of entries in the item */
+		RFALSE(I_ENTRY_COUNT(ih) < entry_num,
+		       "vs-9010: entry number is too big %d (%d)",
+		       entry_num, I_ENTRY_COUNT(ih));
+
+		if (search_res == POSITION_FOUND
+		    || entry_num < I_ENTRY_COUNT(ih)) {
+			/* go through all entries in the directory item, beginning with the entry that has been found */
+			struct reiserfs_de_head *deh =
+			    B_I_DEH(bh, ih) + entry_num;
+
+			for (; entry_num < I_ENTRY_COUNT(ih);
+			     entry_num++, deh++) {
+				int d_reclen;
+				char *d_name;
+				off_t d_off;
+				ino_t d_ino;
+
+				if (!de_visible(deh))
+					/* it is hidden entry */
+					continue;
+				d_reclen = entry_length(bh, ih, entry_num);
+				d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh);
+				if (!d_name[d_reclen - 1])
+					d_reclen = strlen(d_name);
+
+				if (d_reclen >
+				    REISERFS_MAX_NAME(inode->i_sb->
+						      s_blocksize)) {
+					/* too big to send back to VFS */
+					continue;
+				}
+
+				/* Ignore the .reiserfs_priv entry */
+				if (reiserfs_xattrs(inode->i_sb) &&
+				    !old_format_only(inode->i_sb) &&
+				    filp->f_dentry == inode->i_sb->s_root &&
+				    REISERFS_SB(inode->i_sb)->priv_root &&
+				    REISERFS_SB(inode->i_sb)->priv_root->d_inode
+				    && deh_objectid(deh) ==
+				    le32_to_cpu(INODE_PKEY
+						(REISERFS_SB(inode->i_sb)->
+						 priv_root->d_inode)->
+						k_objectid)) {
+					continue;
+				}
+
+				d_off = deh_offset(deh);
+				filp->f_pos = d_off;
+				d_ino = deh_objectid(deh);
+				if (d_reclen <= 32) {
+					local_buf = small_buf;
+				} else {
+					local_buf =
+					    reiserfs_kmalloc(d_reclen, GFP_NOFS,
+							     inode->i_sb);
+					if (!local_buf) {
+						pathrelse(&path_to_entry);
+						ret = -ENOMEM;
+						goto out;
+					}
+					if (item_moved(&tmp_ih, &path_to_entry)) {
+						reiserfs_kfree(local_buf,
+							       d_reclen,
+							       inode->i_sb);
+						goto research;
+					}
+				}
+				// Note that we copy the name to user space via a temporary
+				// buffer (local_buf) because filldir will block if the
+				// user space buffer is swapped out. During that time the
+				// entry can move somewhere else
+				memcpy(local_buf, d_name, d_reclen);
+				if (filldir
+				    (dirent, local_buf, d_reclen, d_off, d_ino,
+				     DT_UNKNOWN) < 0) {
+					if (local_buf != small_buf) {
+						reiserfs_kfree(local_buf,
+							       d_reclen,
+							       inode->i_sb);
+					}
+					goto end;
+				}
+				if (local_buf != small_buf) {
+					reiserfs_kfree(local_buf, d_reclen,
+						       inode->i_sb);
+				}
+				// the next entry should be looked for at this offset
+				next_pos = deh_offset(deh) + 1;
+
+				if (item_moved(&tmp_ih, &path_to_entry)) {
+					goto research;
+				}
+			}	/* for */
 		}
 
-		// next entry should be looked for with such offset
-		next_pos = deh_offset (deh) + 1;
+		if (item_num != B_NR_ITEMS(bh) - 1)
+			// end of directory has been reached
+			goto end;
+
+		/* the item we went through is the last item of the node. Use the
+		   right delimiting key to check whether it is the directory end */
+		rkey = get_rkey(&path_to_entry, inode->i_sb);
+		if (!comp_le_keys(rkey, &MIN_KEY)) {
+			/* set pos_key to the smallest key that is greater
+			   than the key of the last entry in the item */
+			set_cpu_key_k_offset(&pos_key, next_pos);
+			continue;
+		}
 
-		if (item_moved (&tmp_ih, &path_to_entry)) {
-		    goto research;
+		if (COMP_SHORT_KEYS(rkey, &pos_key)) {
+			// end of directory has been reached
+			goto end;
 		}
-	    } /* for */
-	}
-
-	if (item_num != B_NR_ITEMS (bh) - 1)
-	    // end of directory has been reached
-	    goto end;
-
-	/* item we went through is last item of node. Using right
-	   delimiting key check is it directory end */
-	rkey = get_rkey (&path_to_entry, inode->i_sb);
-	if (! comp_le_keys (rkey, &MIN_KEY)) {
-	    /* set pos_key to key, that is the smallest and greater
-	       that key of the last entry in the item */
-	    set_cpu_key_k_offset (&pos_key, next_pos);
-	    continue;
-	}
-
-	if ( COMP_SHORT_KEYS (rkey, &pos_key)) {
-	    // end of directory has been reached
-	    goto end;
-	}
-	
-	/* directory continues in the right neighboring block */
-	set_cpu_key_k_offset (&pos_key, le_key_k_offset (KEY_FORMAT_3_5, rkey));
-
-    } /* while */
-
-
- end:
-    filp->f_pos = next_pos;
-    pathrelse (&path_to_entry);
-    reiserfs_check_path(&path_to_entry) ;
- out:
-    reiserfs_write_unlock(inode->i_sb);
-    return ret;
+
+		/* directory continues in the right neighboring block */
+		set_cpu_key_k_offset(&pos_key,
+				     le_key_k_offset(KEY_FORMAT_3_5, rkey));
+
+	}			/* while */
+
+      end:
+	filp->f_pos = next_pos;
+	pathrelse(&path_to_entry);
+	reiserfs_check_path(&path_to_entry);
+      out:
+	reiserfs_write_unlock(inode->i_sb);
+	return ret;
 }
 
 /* compose directory item containing "." and ".." entries (entries are
    not aligned to 4 byte boundary) */
 /* the last four params are LE */
-void make_empty_dir_item_v1 (char * body, __le32 dirid, __le32 objid,
-			     __le32 par_dirid, __le32 par_objid)
+void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
+			    __le32 par_dirid, __le32 par_objid)
 {
-    struct reiserfs_de_head * deh;
-
-    memset (body, 0, EMPTY_DIR_SIZE_V1);
-    deh = (struct reiserfs_de_head *)body;
-    
-    /* direntry header of "." */
-    put_deh_offset( &(deh[0]), DOT_OFFSET );
-    /* these two are from make_le_item_head, and are are LE */
-    deh[0].deh_dir_id = dirid;
-    deh[0].deh_objectid = objid;
-    deh[0].deh_state = 0; /* Endian safe if 0 */
-    put_deh_location( &(deh[0]), EMPTY_DIR_SIZE_V1 - strlen( "." ));
-    mark_de_visible(&(deh[0]));
-  
-    /* direntry header of ".." */
-    put_deh_offset( &(deh[1]), DOT_DOT_OFFSET);
-    /* key of ".." for the root directory */
-    /* these two are from the inode, and are are LE */
-    deh[1].deh_dir_id = par_dirid;
-    deh[1].deh_objectid = par_objid;
-    deh[1].deh_state = 0; /* Endian safe if 0 */
-    put_deh_location( &(deh[1]), deh_location( &(deh[0]) ) - strlen( ".." ) );
-    mark_de_visible(&(deh[1]));
-
-    /* copy ".." and "." */
-    memcpy (body + deh_location( &(deh[0]) ), ".", 1);
-    memcpy (body + deh_location( &(deh[1]) ), "..", 2);
+	struct reiserfs_de_head *deh;
+
+	memset(body, 0, EMPTY_DIR_SIZE_V1);
+	deh = (struct reiserfs_de_head *)body;
+
+	/* direntry header of "." */
+	put_deh_offset(&(deh[0]), DOT_OFFSET);
+	/* these two are from make_le_item_head, and are LE */
+	deh[0].deh_dir_id = dirid;
+	deh[0].deh_objectid = objid;
+	deh[0].deh_state = 0;	/* Endian safe if 0 */
+	put_deh_location(&(deh[0]), EMPTY_DIR_SIZE_V1 - strlen("."));
+	mark_de_visible(&(deh[0]));
+
+	/* direntry header of ".." */
+	put_deh_offset(&(deh[1]), DOT_DOT_OFFSET);
+	/* key of ".." for the root directory */
+	/* these two are from the inode, and are LE */
+	deh[1].deh_dir_id = par_dirid;
+	deh[1].deh_objectid = par_objid;
+	deh[1].deh_state = 0;	/* Endian safe if 0 */
+	put_deh_location(&(deh[1]), deh_location(&(deh[0])) - strlen(".."));
+	mark_de_visible(&(deh[1]));
+
+	/* copy ".." and "." */
+	memcpy(body + deh_location(&(deh[0])), ".", 1);
+	memcpy(body + deh_location(&(deh[1])), "..", 2);
 }
 
 /* compose directory item containing "." and ".." entries */
-void make_empty_dir_item (char * body, __le32 dirid, __le32 objid,
-			  __le32 par_dirid, __le32 par_objid)
+void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
+			 __le32 par_dirid, __le32 par_objid)
 {
-    struct reiserfs_de_head * deh;
-
-    memset (body, 0, EMPTY_DIR_SIZE);
-    deh = (struct reiserfs_de_head *)body;
-    
-    /* direntry header of "." */
-    put_deh_offset( &(deh[0]), DOT_OFFSET );
-    /* these two are from make_le_item_head, and are are LE */
-    deh[0].deh_dir_id = dirid;
-    deh[0].deh_objectid = objid;
-    deh[0].deh_state = 0; /* Endian safe if 0 */
-    put_deh_location( &(deh[0]), EMPTY_DIR_SIZE - ROUND_UP( strlen( "." ) ) );
-    mark_de_visible(&(deh[0]));
-  
-    /* direntry header of ".." */
-    put_deh_offset( &(deh[1]), DOT_DOT_OFFSET );
-    /* key of ".." for the root directory */
-    /* these two are from the inode, and are are LE */
-    deh[1].deh_dir_id = par_dirid;
-    deh[1].deh_objectid = par_objid;
-    deh[1].deh_state = 0; /* Endian safe if 0 */
-    put_deh_location( &(deh[1]), deh_location( &(deh[0])) - ROUND_UP( strlen( ".." ) ) );
-    mark_de_visible(&(deh[1]));
-
-    /* copy ".." and "." */
-    memcpy (body + deh_location( &(deh[0]) ), ".", 1);
-    memcpy (body + deh_location( &(deh[1]) ), "..", 2);
+	struct reiserfs_de_head *deh;
+
+	memset(body, 0, EMPTY_DIR_SIZE);
+	deh = (struct reiserfs_de_head *)body;
+
+	/* direntry header of "." */
+	put_deh_offset(&(deh[0]), DOT_OFFSET);
+	/* these two are from make_le_item_head, and are LE */
+	deh[0].deh_dir_id = dirid;
+	deh[0].deh_objectid = objid;
+	deh[0].deh_state = 0;	/* Endian safe if 0 */
+	put_deh_location(&(deh[0]), EMPTY_DIR_SIZE - ROUND_UP(strlen(".")));
+	mark_de_visible(&(deh[0]));
+
+	/* direntry header of ".." */
+	put_deh_offset(&(deh[1]), DOT_DOT_OFFSET);
+	/* key of ".." for the root directory */
+	/* these two are from the inode, and are LE */
+	deh[1].deh_dir_id = par_dirid;
+	deh[1].deh_objectid = par_objid;
+	deh[1].deh_state = 0;	/* Endian safe if 0 */
+	put_deh_location(&(deh[1]),
+			 deh_location(&(deh[0])) - ROUND_UP(strlen("..")));
+	mark_de_visible(&(deh[1]));
+
+	/* copy ".." and "." */
+	memcpy(body + deh_location(&(deh[0])), ".", 1);
+	memcpy(body + deh_location(&(deh[1])), "..", 2);
 }
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 2118db2896c7..b2264ba3cc56 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -8,7 +8,6 @@
 /* balance the tree according to the analysis made before,		*/
 /* and using buffers obtained after all above.				*/
 
-
 /**
  ** balance_leaf_when_delete
  ** balance_leaf
@@ -24,23 +23,22 @@
 
 #ifdef CONFIG_REISERFS_CHECK
 
-struct tree_balance * cur_tb = NULL; /* detects whether more than one
-                                        copy of tb exists as a means
-                                        of checking whether schedule
-                                        is interrupting do_balance */
+struct tree_balance *cur_tb = NULL;	/* detects whether more than one
+					   copy of tb exists as a means
+					   of checking whether schedule
+					   is interrupting do_balance */
 #endif
 
-inline void do_balance_mark_leaf_dirty (struct tree_balance * tb, 
-					struct buffer_head * bh, int flag)
+inline void do_balance_mark_leaf_dirty(struct tree_balance *tb,
+				       struct buffer_head *bh, int flag)
 {
-    journal_mark_dirty(tb->transaction_handle,
-                       tb->transaction_handle->t_super, bh) ;
+	journal_mark_dirty(tb->transaction_handle,
+			   tb->transaction_handle->t_super, bh);
 }
 
 #define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
 #define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
 
-
 /* summary: 
  if deleting something ( tb->insert_size[0] < 0 )
    return(balance_leaf_when_delete()); (flag d handled here)
@@ -64,8 +62,6 @@ be performed by do_balance.
 
 -Hans */
 
-
-
 /* Balance leaf node in case of delete or cut: insert_size[0] < 0
  *
  * lnum, rnum can have values >= -1
@@ -73,1384 +69,1933 @@ be performed by do_balance.
  *	 0 means that nothing should be done with the neighbor
  *	>0 means to shift entirely or partly the specified number of items to the neighbor
  */
-static int balance_leaf_when_delete (struct tree_balance * tb, int flag)
+static int balance_leaf_when_delete(struct tree_balance *tb, int flag)
 {
-    struct buffer_head * tbS0 = PATH_PLAST_BUFFER (tb->tb_path);
-    int item_pos = PATH_LAST_POSITION (tb->tb_path);
-    int pos_in_item = tb->tb_path->pos_in_item;
-    struct buffer_info bi;
-    int n;
-    struct item_head * ih;
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	int item_pos = PATH_LAST_POSITION(tb->tb_path);
+	int pos_in_item = tb->tb_path->pos_in_item;
+	struct buffer_info bi;
+	int n;
+	struct item_head *ih;
 
-    RFALSE( tb->FR[0] && B_LEVEL (tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1,
-	    "vs- 12000: level: wrong FR %z", tb->FR[0]);
-    RFALSE( tb->blknum[0] > 1,
-	    "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]);
-    RFALSE( ! tb->blknum[0] && ! PATH_H_PPARENT(tb->tb_path, 0),
-	    "PAP-12010: tree can not be empty");
+	RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1,
+	       "vs- 12000: level: wrong FR %z", tb->FR[0]);
+	RFALSE(tb->blknum[0] > 1,
+	       "PAP-12005: tb->blknum == %d, can not be > 1", tb->blknum[0]);
+	RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0),
+	       "PAP-12010: tree can not be empty");
 
-    ih = B_N_PITEM_HEAD (tbS0, item_pos);
+	ih = B_N_PITEM_HEAD(tbS0, item_pos);
 
-    /* Delete or truncate the item */
+	/* Delete or truncate the item */
 
-    switch (flag) {
-    case M_DELETE:   /* delete item in S[0] */
+	switch (flag) {
+	case M_DELETE:		/* delete item in S[0] */
+
+		RFALSE(ih_item_len(ih) + IH_SIZE != -tb->insert_size[0],
+		       "vs-12013: mode Delete, insert size %d, ih to be deleted %h",
+		       -tb->insert_size[0], ih);
+
+		bi.tb = tb;
+		bi.bi_bh = tbS0;
+		bi.bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
+		bi.bi_position = PATH_H_POSITION(tb->tb_path, 1);
+		leaf_delete_items(&bi, 0, item_pos, 1, -1);
+
+		if (!item_pos && tb->CFL[0]) {
+			if (B_NR_ITEMS(tbS0)) {
+				replace_key(tb, tb->CFL[0], tb->lkey[0], tbS0,
+					    0);
+			} else {
+				if (!PATH_H_POSITION(tb->tb_path, 1))
+					replace_key(tb, tb->CFL[0], tb->lkey[0],
+						    PATH_H_PPARENT(tb->tb_path,
+								   0), 0);
+			}
+		}
 
-	RFALSE( ih_item_len(ih) + IH_SIZE != -tb->insert_size[0],
-	        "vs-12013: mode Delete, insert size %d, ih to be deleted %h",
- 		 -tb->insert_size [0], ih);
+		RFALSE(!item_pos && !tb->CFL[0],
+		       "PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0],
+		       tb->L[0]);
 
-	bi.tb = tb;
-	bi.bi_bh = tbS0;
-	bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
-	bi.bi_position = PATH_H_POSITION (tb->tb_path, 1);
-	leaf_delete_items (&bi, 0, item_pos, 1, -1);
-
-	if ( ! item_pos && tb->CFL[0] ) {
-	    if ( B_NR_ITEMS(tbS0) ) {
-		replace_key(tb, tb->CFL[0],tb->lkey[0],tbS0,0);
-	    }
-	    else {
-		if ( ! PATH_H_POSITION (tb->tb_path, 1) )
-		    replace_key(tb, tb->CFL[0],tb->lkey[0],PATH_H_PPARENT(tb->tb_path, 0),0);
-	    }
-	} 
-
-	RFALSE( ! item_pos && !tb->CFL[0],
-		"PAP-12020: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0], tb->L[0]);
-    
-	break;
-
-    case M_CUT: {  /* cut item in S[0] */
-	bi.tb = tb;
-	bi.bi_bh = tbS0;
-	bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
-	bi.bi_position = PATH_H_POSITION (tb->tb_path, 1);
-	if (is_direntry_le_ih (ih)) {
-
-	    /* UFS unlink semantics are such that you can only delete one directory entry at a time. */
-	    /* when we cut a directory tb->insert_size[0] means number of entries to be cut (always 1) */
-	    tb->insert_size[0] = -1;
-	    leaf_cut_from_buffer (&bi, item_pos, pos_in_item, -tb->insert_size[0]);
-
-	    RFALSE( ! item_pos && ! pos_in_item && ! tb->CFL[0],
-		    "PAP-12030: can not change delimiting key. CFL[0]=%p", 
-		    tb->CFL[0]);
-
-	    if ( ! item_pos && ! pos_in_item && tb->CFL[0] ) {
-		replace_key(tb, tb->CFL[0],tb->lkey[0],tbS0,0);
-	    }
-	} else {
-	    leaf_cut_from_buffer (&bi, item_pos, pos_in_item, -tb->insert_size[0]);
-
-	    RFALSE( ! ih_item_len(ih),
-		"PAP-12035: cut must leave non-zero dynamic length of item");
-	}
-	break;
-    }
-
-    default:
-	print_cur_tb ("12040");
-	reiserfs_panic (tb->tb_sb, "PAP-12040: balance_leaf_when_delete: unexpectable mode: %s(%d)",
-			(flag == M_PASTE) ? "PASTE" : ((flag == M_INSERT) ? "INSERT" : "UNKNOWN"), flag);
-    }
-
-    /* the rule is that no shifting occurs unless by shifting a node can be freed */
-    n = B_NR_ITEMS(tbS0);
-    if ( tb->lnum[0] )     /* L[0] takes part in balancing */
-    {
-	if ( tb->lnum[0] == -1 )    /* L[0] must be joined with S[0] */
-	{
-	    if ( tb->rnum[0] == -1 )    /* R[0] must be also joined with S[0] */
-	    {			
-		if ( tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0) )
-		{
-		    /* all contents of all the 3 buffers will be in L[0] */
-		    if ( PATH_H_POSITION (tb->tb_path, 1) == 0 && 1 < B_NR_ITEMS(tb->FR[0]) )
-			replace_key(tb, tb->CFL[0],tb->lkey[0],tb->FR[0],1);
-
-		    leaf_move_items (LEAF_FROM_S_TO_L, tb, n, -1, NULL);
-		    leaf_move_items (LEAF_FROM_R_TO_L, tb, B_NR_ITEMS(tb->R[0]), -1, NULL);
-
-		    reiserfs_invalidate_buffer (tb, tbS0);
-		    reiserfs_invalidate_buffer (tb, tb->R[0]);
-
-		    return 0;
+		break;
+
+	case M_CUT:{		/* cut item in S[0] */
+			bi.tb = tb;
+			bi.bi_bh = tbS0;
+			bi.bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
+			bi.bi_position = PATH_H_POSITION(tb->tb_path, 1);
+			if (is_direntry_le_ih(ih)) {
+
+				/* UFS unlink semantics are such that you can only delete one directory entry at a time. */
+				/* when we cut a directory tb->insert_size[0] means number of entries to be cut (always 1) */
+				tb->insert_size[0] = -1;
+				leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
+						     -tb->insert_size[0]);
+
+				RFALSE(!item_pos && !pos_in_item && !tb->CFL[0],
+				       "PAP-12030: can not change delimiting key. CFL[0]=%p",
+				       tb->CFL[0]);
+
+				if (!item_pos && !pos_in_item && tb->CFL[0]) {
+					replace_key(tb, tb->CFL[0], tb->lkey[0],
+						    tbS0, 0);
+				}
+			} else {
+				leaf_cut_from_buffer(&bi, item_pos, pos_in_item,
+						     -tb->insert_size[0]);
+
+				RFALSE(!ih_item_len(ih),
+				       "PAP-12035: cut must leave non-zero dynamic length of item");
+			}
+			break;
 		}
-		/* all contents of all the 3 buffers will be in R[0] */
-		leaf_move_items (LEAF_FROM_S_TO_R, tb, n, -1, NULL);
-		leaf_move_items (LEAF_FROM_L_TO_R, tb, B_NR_ITEMS(tb->L[0]), -1, NULL);
 
-		/* right_delimiting_key is correct in R[0] */
-		replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0);
+	default:
+		print_cur_tb("12040");
+		reiserfs_panic(tb->tb_sb,
+			       "PAP-12040: balance_leaf_when_delete: unexpectable mode: %s(%d)",
+			       (flag ==
+				M_PASTE) ? "PASTE" : ((flag ==
+						       M_INSERT) ? "INSERT" :
+						      "UNKNOWN"), flag);
+	}
 
-		reiserfs_invalidate_buffer (tb, tbS0);
-		reiserfs_invalidate_buffer (tb, tb->L[0]);
+	/* the rule is that no shifting occurs unless by shifting a node can be freed */
+	n = B_NR_ITEMS(tbS0);
+	if (tb->lnum[0]) {	/* L[0] takes part in balancing */
+		if (tb->lnum[0] == -1) {	/* L[0] must be joined with S[0] */
+			if (tb->rnum[0] == -1) {	/* R[0] must be also joined with S[0] */
+				if (tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0)) {
+					/* all contents of all the 3 buffers will be in L[0] */
+					if (PATH_H_POSITION(tb->tb_path, 1) == 0
+					    && 1 < B_NR_ITEMS(tb->FR[0]))
+						replace_key(tb, tb->CFL[0],
+							    tb->lkey[0],
+							    tb->FR[0], 1);
+
+					leaf_move_items(LEAF_FROM_S_TO_L, tb, n,
+							-1, NULL);
+					leaf_move_items(LEAF_FROM_R_TO_L, tb,
+							B_NR_ITEMS(tb->R[0]),
+							-1, NULL);
+
+					reiserfs_invalidate_buffer(tb, tbS0);
+					reiserfs_invalidate_buffer(tb,
+								   tb->R[0]);
+
+					return 0;
+				}
+				/* all contents of all the 3 buffers will be in R[0] */
+				leaf_move_items(LEAF_FROM_S_TO_R, tb, n, -1,
+						NULL);
+				leaf_move_items(LEAF_FROM_L_TO_R, tb,
+						B_NR_ITEMS(tb->L[0]), -1, NULL);
+
+				/* right_delimiting_key is correct in R[0] */
+				replace_key(tb, tb->CFR[0], tb->rkey[0],
+					    tb->R[0], 0);
 
-		return -1;
-	    }
+				reiserfs_invalidate_buffer(tb, tbS0);
+				reiserfs_invalidate_buffer(tb, tb->L[0]);
 
-	    RFALSE( tb->rnum[0] != 0, 
-		    "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]);
-	    /* all contents of L[0] and S[0] will be in L[0] */
-	    leaf_shift_left(tb, n, -1);
+				return -1;
+			}
 
-	    reiserfs_invalidate_buffer (tb, tbS0);
+			RFALSE(tb->rnum[0] != 0,
+			       "PAP-12045: rnum must be 0 (%d)", tb->rnum[0]);
+			/* all contents of L[0] and S[0] will be in L[0] */
+			leaf_shift_left(tb, n, -1);
 
-	    return 0;
+			reiserfs_invalidate_buffer(tb, tbS0);
+
+			return 0;
+		}
+		/* part of the contents of S[0] will be in L[0] and the rest of S[0] will be in R[0] */
+
+		RFALSE((tb->lnum[0] + tb->rnum[0] < n) ||
+		       (tb->lnum[0] + tb->rnum[0] > n + 1),
+		       "PAP-12050: rnum(%d) and lnum(%d) and item number(%d) in S[0] are not consistent",
+		       tb->rnum[0], tb->lnum[0], n);
+		RFALSE((tb->lnum[0] + tb->rnum[0] == n) &&
+		       (tb->lbytes != -1 || tb->rbytes != -1),
+		       "PAP-12055: bad rbytes (%d)/lbytes (%d) parameters when items are not split",
+		       tb->rbytes, tb->lbytes);
+		RFALSE((tb->lnum[0] + tb->rnum[0] == n + 1) &&
+		       (tb->lbytes < 1 || tb->rbytes != -1),
+		       "PAP-12060: bad rbytes (%d)/lbytes (%d) parameters when items are split",
+		       tb->rbytes, tb->lbytes);
+
+		leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
+		leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
+
+		reiserfs_invalidate_buffer(tb, tbS0);
+
+		return 0;
 	}
-	/* a part of contents of S[0] will be in L[0] and the rest part of S[0] will be in R[0] */
-
-	RFALSE( ( tb->lnum[0] + tb->rnum[0] < n ) || 
-		( tb->lnum[0] + tb->rnum[0] > n+1 ),
-		"PAP-12050: rnum(%d) and lnum(%d) and item number(%d) in S[0] are not consistent",
-		tb->rnum[0], tb->lnum[0], n);
-	RFALSE( ( tb->lnum[0] + tb->rnum[0] == n ) && 
-		(tb->lbytes != -1 || tb->rbytes != -1),
-		"PAP-12055: bad rbytes (%d)/lbytes (%d) parameters when items are not split", 
-		tb->rbytes, tb->lbytes);
-	RFALSE( ( tb->lnum[0] + tb->rnum[0] == n + 1 ) && 
-		(tb->lbytes < 1 || tb->rbytes != -1),
-		"PAP-12060: bad rbytes (%d)/lbytes (%d) parameters when items are split", 
-		tb->rbytes, tb->lbytes);
-
-	leaf_shift_left (tb, tb->lnum[0], tb->lbytes);
-	leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
-
-	reiserfs_invalidate_buffer (tb, tbS0);
 
-	return 0;
-    }
+	if (tb->rnum[0] == -1) {
+		/* all contents of R[0] and S[0] will be in R[0] */
+		leaf_shift_right(tb, n, -1);
+		reiserfs_invalidate_buffer(tb, tbS0);
+		return 0;
+	}
 
-    if ( tb->rnum[0] == -1 ) {
-	/* all contents of R[0] and S[0] will be in R[0] */
-	leaf_shift_right(tb, n, -1);
-	reiserfs_invalidate_buffer (tb, tbS0);
+	RFALSE(tb->rnum[0],
+	       "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]);
 	return 0;
-    }
-
-    RFALSE( tb->rnum[0], 
-	    "PAP-12065: bad rnum parameter must be 0 (%d)", tb->rnum[0]);
-    return 0;
 }
 
-
-static int balance_leaf (struct tree_balance * tb,
-			 struct item_head * ih,		/* item header of inserted item (this is on little endian) */
-			 const char * body,		/* body  of inserted item or bytes to paste */
-			 int flag,			/* i - insert, d - delete, c - cut, p - paste
-							   (see comment to do_balance) */
-			 struct item_head * insert_key,  /* in our processing of one level we sometimes determine what
-							    must be inserted into the next higher level.  This insertion
-							    consists of a key or two keys and their corresponding
-							    pointers */
-			 struct buffer_head ** insert_ptr /* inserted node-ptrs for the next level */
+static int balance_leaf(struct tree_balance *tb, struct item_head *ih,	/* item header of inserted item (this is on little endian) */
+			const char *body,	/* body  of inserted item or bytes to paste */
+			int flag,	/* i - insert, d - delete, c - cut, p - paste
+					   (see comment to do_balance) */
+			struct item_head *insert_key,	/* in our processing of one level we sometimes determine what
+							   must be inserted into the next higher level.  This insertion
+							   consists of a key or two keys and their corresponding
+							   pointers */
+			struct buffer_head **insert_ptr	/* inserted node-ptrs for the next level */
     )
 {
-    struct buffer_head * tbS0 = PATH_PLAST_BUFFER (tb->tb_path);
-    int item_pos = PATH_LAST_POSITION (tb->tb_path);	/*  index into the array of item headers in S[0] 
-							    of the affected item */
-    struct buffer_info bi;
-    struct buffer_head *S_new[2];  /* new nodes allocated to hold what could not fit into S */
-    int snum[2];	    /* number of items that will be placed
-                               into S_new (includes partially shifted
-                               items) */
-    int sbytes[2];          /* if an item is partially shifted into S_new then 
-			       if it is a directory item 
-			       it is the number of entries from the item that are shifted into S_new
-			       else
-			       it is the number of bytes from the item that are shifted into S_new
-			    */
-    int n, i;
-    int ret_val;
-    int pos_in_item;
-    int zeros_num;
-
-    PROC_INFO_INC( tb -> tb_sb, balance_at[ 0 ] );
-
-    /* Make balance in case insert_size[0] < 0 */
-    if ( tb->insert_size[0] < 0 )
-	return balance_leaf_when_delete (tb, flag);
-  
-    zeros_num = 0;
-    if (flag == M_INSERT && body == 0)
-	zeros_num = ih_item_len( ih );
-
-    pos_in_item = tb->tb_path->pos_in_item;
-    /* for indirect item pos_in_item is measured in unformatted node
-       pointers. Recalculate to bytes */
-    if (flag != M_INSERT && is_indirect_le_ih (B_N_PITEM_HEAD (tbS0, item_pos)))
-	pos_in_item *= UNFM_P_SIZE;
-
-    if ( tb->lnum[0] > 0 ) {
-	/* Shift lnum[0] items from S[0] to the left neighbor L[0] */
-	if ( item_pos < tb->lnum[0] ) {
-	    /* new item or it part falls to L[0], shift it too */
-	    n = B_NR_ITEMS(tb->L[0]);
-
-	    switch (flag) {
-	    case M_INSERT:   /* insert item into L[0] */
-
-		if ( item_pos == tb->lnum[0] - 1 && tb->lbytes != -1 ) {
-		    /* part of new item falls into L[0] */
-		    int new_item_len;
-		    int version;
-
-		    ret_val = leaf_shift_left (tb, tb->lnum[0]-1, -1);
-
-		    /* Calculate item length to insert to S[0] */
-		    new_item_len = ih_item_len(ih) - tb->lbytes;
-		    /* Calculate and check item length to insert to L[0] */
-		    put_ih_item_len(ih, ih_item_len(ih) - new_item_len );
-
-		    RFALSE( ih_item_len(ih) <= 0,
-			    "PAP-12080: there is nothing to insert into L[0]: ih_item_len=%d",
-                            ih_item_len(ih));
-
-		    /* Insert new item into L[0] */
-		    bi.tb = tb;
-		    bi.bi_bh = tb->L[0];
-		    bi.bi_parent = tb->FL[0];
-		    bi.bi_position = get_left_neighbor_position (tb, 0);
-		    leaf_insert_into_buf (&bi, n + item_pos - ret_val, ih, body,
-					  zeros_num > ih_item_len(ih) ? ih_item_len(ih) : zeros_num);
-
-		    version = ih_version (ih);
-
-		    /* Calculate key component, item length and body to insert into S[0] */
-                    set_le_ih_k_offset( ih, le_ih_k_offset( ih ) + (tb->lbytes << (is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0)) );
-
-		    put_ih_item_len( ih, new_item_len );
-		    if ( tb->lbytes >  zeros_num ) {
-			body += (tb->lbytes - zeros_num);
-			zeros_num = 0;
-		    }
-		    else
-			zeros_num -= tb->lbytes;
-
-		    RFALSE( ih_item_len(ih) <= 0,
-			"PAP-12085: there is nothing to insert into S[0]: ih_item_len=%d",
-			ih_item_len(ih));
-		} else {
-		    /* new item in whole falls into L[0] */
-		    /* Shift lnum[0]-1 items to L[0] */
-		    ret_val = leaf_shift_left(tb, tb->lnum[0]-1, tb->lbytes);
-		    /* Insert new item into L[0] */
-		    bi.tb = tb;
-		    bi.bi_bh = tb->L[0];
-		    bi.bi_parent = tb->FL[0];
-		    bi.bi_position = get_left_neighbor_position (tb, 0);
-		    leaf_insert_into_buf (&bi, n + item_pos - ret_val, ih, body, zeros_num);
-		    tb->insert_size[0] = 0;
-		    zeros_num = 0;
-		}
-		break;
-
-	    case M_PASTE:   /* append item in L[0] */
-
-		if ( item_pos == tb->lnum[0] - 1 && tb->lbytes != -1 ) {
-		    /* we must shift the part of the appended item */
-		    if ( is_direntry_le_ih (B_N_PITEM_HEAD (tbS0, item_pos))) {
-
-			RFALSE( zeros_num,
-				"PAP-12090: invalid parameter in case of a directory");
-			/* directory item */
-			if ( tb->lbytes > pos_in_item ) {
-			    /* new directory entry falls into L[0] */
-			    struct item_head * pasted;
-			    int l_pos_in_item = pos_in_item;
-							  
-			    /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 entries from given directory item */
-			    ret_val = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1);
-			    if ( ret_val && ! item_pos ) {
-				pasted =  B_N_PITEM_HEAD(tb->L[0],B_NR_ITEMS(tb->L[0])-1);
-				l_pos_in_item += I_ENTRY_COUNT(pasted) - (tb->lbytes-1);
-			    }
-
-			    /* Append given directory entry to directory item */
-			    bi.tb = tb;
-			    bi.bi_bh = tb->L[0];
-			    bi.bi_parent = tb->FL[0];
-			    bi.bi_position = get_left_neighbor_position (tb, 0);
-			    leaf_paste_in_buffer (&bi, n + item_pos - ret_val, l_pos_in_item,
-						  tb->insert_size[0], body, zeros_num);
-
-			    /* previous string prepared space for pasting new entry, following string pastes this entry */
-
-			    /* when we have merge directory item, pos_in_item has been changed too */
-
-			    /* paste new directory entry. 1 is entry number */
-			    leaf_paste_entries (bi.bi_bh, n + item_pos - ret_val, l_pos_in_item, 1,
-						(struct reiserfs_de_head *)body, 
-						body + DEH_SIZE, tb->insert_size[0]
-				);
-			    tb->insert_size[0] = 0;
-			} else {
-			    /* new directory item doesn't fall into L[0] */
-			    /* Shift lnum[0]-1 items in whole. Shift lbytes directory entries from directory item number lnum[0] */
-			    leaf_shift_left (tb, tb->lnum[0], tb->lbytes);
-			}
-			/* Calculate new position to append in item body */
-			pos_in_item -= tb->lbytes;
-		    }
-		    else {
-			/* regular object */
-			RFALSE( tb->lbytes <= 0,
-			        "PAP-12095: there is nothing to shift to L[0]. lbytes=%d",
-				tb->lbytes);
-			RFALSE( pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0, item_pos)),
-                                "PAP-12100: incorrect position to paste: item_len=%d, pos_in_item=%d",
-				ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos)), pos_in_item);
-
-			if ( tb->lbytes >= pos_in_item ) {
-			    /* appended item will be in L[0] in whole */
-			    int l_n;
-
-			    /* this bytes number must be appended to the last item of L[h] */
-			    l_n = tb->lbytes - pos_in_item;
-
-			    /* Calculate new insert_size[0] */
-			    tb->insert_size[0] -= l_n;
-
-			    RFALSE( tb->insert_size[0] <= 0,
-				    "PAP-12105: there is nothing to paste into L[0]. insert_size=%d",
-				    tb->insert_size[0]);
-			    ret_val =  leaf_shift_left(tb,tb->lnum[0], 
-						       ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos)));
-			    /* Append to body of item in L[0] */
-			    bi.tb = tb;
-			    bi.bi_bh = tb->L[0];
-			    bi.bi_parent = tb->FL[0];
-			    bi.bi_position = get_left_neighbor_position (tb, 0);
-			    leaf_paste_in_buffer(
-				&bi,n + item_pos - ret_val,
-				ih_item_len( B_N_PITEM_HEAD(tb->L[0],n+item_pos-ret_val)),
-				l_n,body, zeros_num > l_n ? l_n : zeros_num
-				);
-			    /* 0-th item in S0 can be only of DIRECT type when l_n != 0*/
-			    {
-				int version;
-				int temp_l = l_n;
-				
-				RFALSE (ih_item_len (B_N_PITEM_HEAD (tbS0, 0)),
-					"PAP-12106: item length must be 0");
-				RFALSE (comp_short_le_keys (B_N_PKEY (tbS0, 0),
-							    B_N_PKEY (tb->L[0],
-									    n + item_pos - ret_val)),
-					"PAP-12107: items must be of the same file");
-				if (is_indirect_le_ih(B_N_PITEM_HEAD (tb->L[0],
-								      n + item_pos - ret_val)))	{
-				    temp_l = l_n << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT);
+	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
+	int item_pos = PATH_LAST_POSITION(tb->tb_path);	/*  index into the array of item headers in S[0] 
+							   of the affected item */
+	struct buffer_info bi;
+	struct buffer_head *S_new[2];	/* new nodes allocated to hold what could not fit into S */
+	int snum[2];		/* number of items that will be placed
+				   into S_new (includes partially shifted
+				   items) */
+	int sbytes[2];		/* if an item is partially shifted into S_new then 
+				   if it is a directory item 
+				   it is the number of entries from the item that are shifted into S_new
+				   else
+				   it is the number of bytes from the item that are shifted into S_new
+				 */
+	int n, i;
+	int ret_val;
+	int pos_in_item;
+	int zeros_num;
+
+	PROC_INFO_INC(tb->tb_sb, balance_at[0]);
+
+	/* Make balance in case insert_size[0] < 0 */
+	if (tb->insert_size[0] < 0)
+		return balance_leaf_when_delete(tb, flag);
+
+	zeros_num = 0;
+	if (flag == M_INSERT && body == 0)
+		zeros_num = ih_item_len(ih);
+
+	pos_in_item = tb->tb_path->pos_in_item;
+	/* for indirect item pos_in_item is measured in unformatted node
+	   pointers. Recalculate to bytes */
+	if (flag != M_INSERT
+	    && is_indirect_le_ih(B_N_PITEM_HEAD(tbS0, item_pos)))
+		pos_in_item *= UNFM_P_SIZE;
+
+	if (tb->lnum[0] > 0) {
+		/* Shift lnum[0] items from S[0] to the left neighbor L[0] */
+		if (item_pos < tb->lnum[0]) {
+			/* new item or part of it falls into L[0], shift it too */
+			n = B_NR_ITEMS(tb->L[0]);
+
+			switch (flag) {
+			case M_INSERT:	/* insert item into L[0] */
+
+				if (item_pos == tb->lnum[0] - 1
+				    && tb->lbytes != -1) {
+					/* part of new item falls into L[0] */
+					int new_item_len;
+					int version;
+
+					ret_val =
+					    leaf_shift_left(tb, tb->lnum[0] - 1,
+							    -1);
+
+					/* Calculate item length to insert to S[0] */
+					new_item_len =
+					    ih_item_len(ih) - tb->lbytes;
+					/* Calculate and check item length to insert to L[0] */
+					put_ih_item_len(ih,
+							ih_item_len(ih) -
+							new_item_len);
+
+					RFALSE(ih_item_len(ih) <= 0,
+					       "PAP-12080: there is nothing to insert into L[0]: ih_item_len=%d",
+					       ih_item_len(ih));
+
+					/* Insert new item into L[0] */
+					bi.tb = tb;
+					bi.bi_bh = tb->L[0];
+					bi.bi_parent = tb->FL[0];
+					bi.bi_position =
+					    get_left_neighbor_position(tb, 0);
+					leaf_insert_into_buf(&bi,
+							     n + item_pos -
+							     ret_val, ih, body,
+							     zeros_num >
+							     ih_item_len(ih) ?
+							     ih_item_len(ih) :
+							     zeros_num);
+
+					version = ih_version(ih);
+
+					/* Calculate key component, item length and body to insert into S[0] */
+					set_le_ih_k_offset(ih,
+							   le_ih_k_offset(ih) +
+							   (tb->
+							    lbytes <<
+							    (is_indirect_le_ih
+							     (ih) ? tb->tb_sb->
+							     s_blocksize_bits -
+							     UNFM_P_SHIFT :
+							     0)));
+
+					put_ih_item_len(ih, new_item_len);
+					if (tb->lbytes > zeros_num) {
+						body +=
+						    (tb->lbytes - zeros_num);
+						zeros_num = 0;
+					} else
+						zeros_num -= tb->lbytes;
+
+					RFALSE(ih_item_len(ih) <= 0,
+					       "PAP-12085: there is nothing to insert into S[0]: ih_item_len=%d",
+					       ih_item_len(ih));
+				} else {
+					/* new item in whole falls into L[0] */
+					/* Shift lnum[0]-1 items to L[0] */
+					ret_val =
+					    leaf_shift_left(tb, tb->lnum[0] - 1,
+							    tb->lbytes);
+					/* Insert new item into L[0] */
+					bi.tb = tb;
+					bi.bi_bh = tb->L[0];
+					bi.bi_parent = tb->FL[0];
+					bi.bi_position =
+					    get_left_neighbor_position(tb, 0);
+					leaf_insert_into_buf(&bi,
+							     n + item_pos -
+							     ret_val, ih, body,
+							     zeros_num);
+					tb->insert_size[0] = 0;
+					zeros_num = 0;
 				}
-				/* update key of first item in S0 */
-				version = ih_version (B_N_PITEM_HEAD (tbS0, 0));
-				set_le_key_k_offset (version, B_N_PKEY (tbS0, 0), 
-						     le_key_k_offset (version, B_N_PKEY (tbS0, 0)) + temp_l);
-				/* update left delimiting key */
-				set_le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0]),
-						     le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0])) + temp_l);
-			    }
-
-			    /* Calculate new body, position in item and insert_size[0] */
-			    if ( l_n > zeros_num ) {
-				body += (l_n - zeros_num);
-				zeros_num = 0;
-			    }
-			    else
-				zeros_num -= l_n;
-			    pos_in_item = 0;	
-
-			    RFALSE( comp_short_le_keys 
-				    (B_N_PKEY(tbS0,0),
-				     B_N_PKEY(tb->L[0],B_NR_ITEMS(tb->L[0])-1)) ||
-				
-				    !op_is_left_mergeable 
-				    (B_N_PKEY (tbS0, 0), tbS0->b_size) ||
-				    !op_is_left_mergeable
-				    (B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0]), 
-				     tbS0->b_size),
-				    "PAP-12120: item must be merge-able with left neighboring item");
-			}
-			else /* only part of the appended item will be in L[0] */
-			{
-			    /* Calculate position in item for append in S[0] */
-			    pos_in_item -= tb->lbytes;
-
-			    RFALSE( pos_in_item <= 0,
-				    "PAP-12125: no place for paste. pos_in_item=%d", pos_in_item);
-
-			    /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */
-			    leaf_shift_left(tb,tb->lnum[0],tb->lbytes);
-			}
-		    }
-		}
-		else /* appended item will be in L[0] in whole */
-		{
-		    struct item_head * pasted;
-
-			if ( ! item_pos  && op_is_left_mergeable (B_N_PKEY (tbS0, 0), tbS0->b_size) )
-			{ /* if we paste into first item of S[0] and it is left mergable */
-			    /* then increment pos_in_item by the size of the last item in L[0] */
-			    pasted = B_N_PITEM_HEAD(tb->L[0],n-1);
-			    if ( is_direntry_le_ih (pasted) )
-				pos_in_item += ih_entry_count(pasted);
-			    else
-				pos_in_item += ih_item_len(pasted);
+				break;
+
+			case M_PASTE:	/* append item in L[0] */
+
+				if (item_pos == tb->lnum[0] - 1
+				    && tb->lbytes != -1) {
+					/* we must shift the part of the appended item */
+					if (is_direntry_le_ih
+					    (B_N_PITEM_HEAD(tbS0, item_pos))) {
+
+						RFALSE(zeros_num,
+						       "PAP-12090: invalid parameter in case of a directory");
+						/* directory item */
+						if (tb->lbytes > pos_in_item) {
+							/* new directory entry falls into L[0] */
+							struct item_head
+							    *pasted;
+							int l_pos_in_item =
+							    pos_in_item;
+
+							/* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 entries from given directory item */
+							ret_val =
+							    leaf_shift_left(tb,
+									    tb->
+									    lnum
+									    [0],
+									    tb->
+									    lbytes
+									    -
+									    1);
+							if (ret_val
+							    && !item_pos) {
+								pasted =
+								    B_N_PITEM_HEAD
+								    (tb->L[0],
+								     B_NR_ITEMS
+								     (tb->
+								      L[0]) -
+								     1);
+								l_pos_in_item +=
+								    I_ENTRY_COUNT
+								    (pasted) -
+								    (tb->
+								     lbytes -
+								     1);
+							}
+
+							/* Append given directory entry to directory item */
+							bi.tb = tb;
+							bi.bi_bh = tb->L[0];
+							bi.bi_parent =
+							    tb->FL[0];
+							bi.bi_position =
+							    get_left_neighbor_position
+							    (tb, 0);
+							leaf_paste_in_buffer
+							    (&bi,
+							     n + item_pos -
+							     ret_val,
+							     l_pos_in_item,
+							     tb->insert_size[0],
+							     body, zeros_num);
+
+							/* previous string prepared space for pasting new entry, following string pastes this entry */
+
+							/* when we have merge directory item, pos_in_item has been changed too */
+
+							/* paste new directory entry. 1 is entry number */
+							leaf_paste_entries(bi.
+									   bi_bh,
+									   n +
+									   item_pos
+									   -
+									   ret_val,
+									   l_pos_in_item,
+									   1,
+									   (struct
+									    reiserfs_de_head
+									    *)
+									   body,
+									   body
+									   +
+									   DEH_SIZE,
+									   tb->
+									   insert_size
+									   [0]
+							    );
+							tb->insert_size[0] = 0;
+						} else {
+							/* new directory item doesn't fall into L[0] */
+							/* Shift lnum[0]-1 items in whole. Shift lbytes directory entries from directory item number lnum[0] */
+							leaf_shift_left(tb,
+									tb->
+									lnum[0],
+									tb->
+									lbytes);
+						}
+						/* Calculate new position to append in item body */
+						pos_in_item -= tb->lbytes;
+					} else {
+						/* regular object */
+						RFALSE(tb->lbytes <= 0,
+						       "PAP-12095: there is nothing to shift to L[0]. lbytes=%d",
+						       tb->lbytes);
+						RFALSE(pos_in_item !=
+						       ih_item_len
+						       (B_N_PITEM_HEAD
+							(tbS0, item_pos)),
+						       "PAP-12100: incorrect position to paste: item_len=%d, pos_in_item=%d",
+						       ih_item_len
+						       (B_N_PITEM_HEAD
+							(tbS0, item_pos)),
+						       pos_in_item);
+
+						if (tb->lbytes >= pos_in_item) {
+							/* appended item will be in L[0] in whole */
+							int l_n;
+
+							/* this number of bytes must be appended to the last item of L[h] */
+							l_n =
+							    tb->lbytes -
+							    pos_in_item;
+
+							/* Calculate new insert_size[0] */
+							tb->insert_size[0] -=
+							    l_n;
+
+							RFALSE(tb->
+							       insert_size[0] <=
+							       0,
+							       "PAP-12105: there is nothing to paste into L[0]. insert_size=%d",
+							       tb->
+							       insert_size[0]);
+							ret_val =
+							    leaf_shift_left(tb,
+									    tb->
+									    lnum
+									    [0],
+									    ih_item_len
+									    (B_N_PITEM_HEAD
+									     (tbS0,
+									      item_pos)));
+							/* Append to body of item in L[0] */
+							bi.tb = tb;
+							bi.bi_bh = tb->L[0];
+							bi.bi_parent =
+							    tb->FL[0];
+							bi.bi_position =
+							    get_left_neighbor_position
+							    (tb, 0);
+							leaf_paste_in_buffer
+							    (&bi,
+							     n + item_pos -
+							     ret_val,
+							     ih_item_len
+							     (B_N_PITEM_HEAD
+							      (tb->L[0],
+							       n + item_pos -
+							       ret_val)), l_n,
+							     body,
+							     zeros_num >
+							     l_n ? l_n :
+							     zeros_num);
+							/* 0-th item in S0 can be only of DIRECT type when l_n != 0 */
+							{
+								int version;
+								int temp_l =
+								    l_n;
+
+								RFALSE
+								    (ih_item_len
+								     (B_N_PITEM_HEAD
+								      (tbS0,
+								       0)),
+								     "PAP-12106: item length must be 0");
+								RFALSE
+								    (comp_short_le_keys
+								     (B_N_PKEY
+								      (tbS0, 0),
+								      B_N_PKEY
+								      (tb->L[0],
+								       n +
+								       item_pos
+								       -
+								       ret_val)),
+								     "PAP-12107: items must be of the same file");
+								if (is_indirect_le_ih(B_N_PITEM_HEAD(tb->L[0], n + item_pos - ret_val))) {
+									temp_l =
+									    l_n
+									    <<
+									    (tb->
+									     tb_sb->
+									     s_blocksize_bits
+									     -
+									     UNFM_P_SHIFT);
+								}
+								/* update key of first item in S0 */
+								version =
+								    ih_version
+								    (B_N_PITEM_HEAD
+								     (tbS0, 0));
+								set_le_key_k_offset
+								    (version,
+								     B_N_PKEY
+								     (tbS0, 0),
+								     le_key_k_offset
+								     (version,
+								      B_N_PKEY
+								      (tbS0,
+								       0)) +
+								     temp_l);
+								/* update left delimiting key */
+								set_le_key_k_offset
+								    (version,
+								     B_N_PDELIM_KEY
+								     (tb->
+								      CFL[0],
+								      tb->
+								      lkey[0]),
+								     le_key_k_offset
+								     (version,
+								      B_N_PDELIM_KEY
+								      (tb->
+								       CFL[0],
+								       tb->
+								       lkey[0]))
+								     + temp_l);
+							}
+
+							/* Calculate new body, position in item and insert_size[0] */
+							if (l_n > zeros_num) {
+								body +=
+								    (l_n -
+								     zeros_num);
+								zeros_num = 0;
+							} else
+								zeros_num -=
+								    l_n;
+							pos_in_item = 0;
+
+							RFALSE
+							    (comp_short_le_keys
+							     (B_N_PKEY(tbS0, 0),
+							      B_N_PKEY(tb->L[0],
+								       B_NR_ITEMS
+								       (tb->
+									L[0]) -
+								       1))
+							     ||
+							     !op_is_left_mergeable
+							     (B_N_PKEY(tbS0, 0),
+							      tbS0->b_size)
+							     ||
+							     !op_is_left_mergeable
+							     (B_N_PDELIM_KEY
+							      (tb->CFL[0],
+							       tb->lkey[0]),
+							      tbS0->b_size),
+							     "PAP-12120: item must be merge-able with left neighboring item");
+						} else {	/* only part of the appended item will be in L[0] */
+
+							/* Calculate position in item for append in S[0] */
+							pos_in_item -=
+							    tb->lbytes;
+
+							RFALSE(pos_in_item <= 0,
+							       "PAP-12125: no place for paste. pos_in_item=%d",
+							       pos_in_item);
+
+							/* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */
+							leaf_shift_left(tb,
+									tb->
+									lnum[0],
+									tb->
+									lbytes);
+						}
+					}
+				} else {	/* appended item will be in L[0] in whole */
+
+					struct item_head *pasted;
+
+					if (!item_pos && op_is_left_mergeable(B_N_PKEY(tbS0, 0), tbS0->b_size)) {	/* if we paste into first item of S[0] and it is left mergable */
+						/* then increment pos_in_item by the size of the last item in L[0] */
+						pasted =
+						    B_N_PITEM_HEAD(tb->L[0],
+								   n - 1);
+						if (is_direntry_le_ih(pasted))
+							pos_in_item +=
+							    ih_entry_count
+							    (pasted);
+						else
+							pos_in_item +=
+							    ih_item_len(pasted);
+					}
+
+					/* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */
+					ret_val =
+					    leaf_shift_left(tb, tb->lnum[0],
+							    tb->lbytes);
+					/* Append to body of item in L[0] */
+					bi.tb = tb;
+					bi.bi_bh = tb->L[0];
+					bi.bi_parent = tb->FL[0];
+					bi.bi_position =
+					    get_left_neighbor_position(tb, 0);
+					leaf_paste_in_buffer(&bi,
+							     n + item_pos -
+							     ret_val,
+							     pos_in_item,
+							     tb->insert_size[0],
+							     body, zeros_num);
+
+					/* if appended item is directory, paste entry */
+					pasted =
+					    B_N_PITEM_HEAD(tb->L[0],
+							   n + item_pos -
+							   ret_val);
+					if (is_direntry_le_ih(pasted))
+						leaf_paste_entries(bi.bi_bh,
+								   n +
+								   item_pos -
+								   ret_val,
+								   pos_in_item,
+								   1,
+								   (struct
+								    reiserfs_de_head
+								    *)body,
+								   body +
+								   DEH_SIZE,
+								   tb->
+								   insert_size
+								   [0]
+						    );
+					/* if appended item is indirect item, put unformatted node into un list */
+					if (is_indirect_le_ih(pasted))
+						set_ih_free_space(pasted, 0);
+					tb->insert_size[0] = 0;
+					zeros_num = 0;
+				}
+				break;
+			default:	/* cases d and t */
+				reiserfs_panic(tb->tb_sb,
+					       "PAP-12130: balance_leaf: lnum > 0: unexpectable mode: %s(%d)",
+					       (flag ==
+						M_DELETE) ? "DELETE" : ((flag ==
+									 M_CUT)
+									? "CUT"
+									:
+									"UNKNOWN"),
+					       flag);
 			}
-
-		    /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */
-		    ret_val = leaf_shift_left(tb,tb->lnum[0],tb->lbytes);
-		    /* Append to body of item in L[0] */
-		    bi.tb = tb;
-		    bi.bi_bh = tb->L[0];
-		    bi.bi_parent = tb->FL[0];
-		    bi.bi_position = get_left_neighbor_position (tb, 0);
-		    leaf_paste_in_buffer (&bi, n + item_pos - ret_val, pos_in_item, tb->insert_size[0],
-					  body, zeros_num);
-
-		    /* if appended item is directory, paste entry */
-		    pasted = B_N_PITEM_HEAD (tb->L[0], n + item_pos - ret_val);
-		    if (is_direntry_le_ih (pasted))
-			leaf_paste_entries (
-			    bi.bi_bh, n + item_pos - ret_val, pos_in_item, 1, 
-			    (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0]
-			    );
-		    /* if appended item is indirect item, put unformatted node into un list */
-		    if (is_indirect_le_ih (pasted))
-			set_ih_free_space (pasted, 0);
-		    tb->insert_size[0] = 0;
-		    zeros_num = 0;
+		} else {
+			/* new item doesn't fall into L[0] */
+			leaf_shift_left(tb, tb->lnum[0], tb->lbytes);
 		}
-		break;
-	    default:    /* cases d and t */
-		reiserfs_panic (tb->tb_sb, "PAP-12130: balance_leaf: lnum > 0: unexpectable mode: %s(%d)",
-				(flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
-	    }
-	} else { 
-	    /* new item doesn't fall into L[0] */
-	    leaf_shift_left(tb,tb->lnum[0],tb->lbytes);
 	}
-    }	/* tb->lnum[0] > 0 */
 
-    /* Calculate new item position */
-    item_pos -= ( tb->lnum[0] - (( tb->lbytes != -1 ) ? 1 : 0));
-
-    if ( tb->rnum[0] > 0 ) {
-	/* shift rnum[0] items from S[0] to the right neighbor R[0] */
-	n = B_NR_ITEMS(tbS0);
-	switch ( flag ) {
-
-	case M_INSERT:   /* insert item */
-	    if ( n - tb->rnum[0] < item_pos )
-	    { /* new item or its part falls to R[0] */
-		if ( item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1 )
-		{ /* part of new item falls into R[0] */
-		    loff_t old_key_comp, old_len, r_zeros_number;
-		    const char * r_body;
-		    int version;
-		    loff_t offset;
-
-		    leaf_shift_right(tb,tb->rnum[0]-1,-1);
-
-		    version = ih_version(ih);
-		    /* Remember key component and item length */
-                    old_key_comp = le_ih_k_offset( ih );
-		    old_len = ih_item_len(ih);
-
-		    /* Calculate key component and item length to insert into R[0] */
-                    offset = le_ih_k_offset( ih ) + ((old_len - tb->rbytes )<<(is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0));
-                    set_le_ih_k_offset( ih, offset );
-		    put_ih_item_len( ih, tb->rbytes);
-		    /* Insert part of the item into R[0] */
-		    bi.tb = tb;
-		    bi.bi_bh = tb->R[0];
-		    bi.bi_parent = tb->FR[0];
-		    bi.bi_position = get_right_neighbor_position (tb, 0);
-		    if ( (old_len - tb->rbytes) > zeros_num ) {
-			r_zeros_number = 0;
-			r_body = body + (old_len - tb->rbytes) - zeros_num;
-		    }
-		    else {
-			r_body = body;
-			r_zeros_number = zeros_num - (old_len - tb->rbytes);
-			zeros_num -= r_zeros_number;
-		    }
-
-		    leaf_insert_into_buf (&bi, 0, ih, r_body, r_zeros_number);
-
-		    /* Replace right delimiting key by first key in R[0] */
-		    replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0);
-
-		    /* Calculate key component and item length to insert into S[0] */
-                    set_le_ih_k_offset( ih, old_key_comp );
-		    put_ih_item_len( ih, old_len - tb->rbytes );
-
-		    tb->insert_size[0] -= tb->rbytes;
+	/* tb->lnum[0] > 0 */
+	/* Calculate new item position */
+	item_pos -= (tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0));
+
+	if (tb->rnum[0] > 0) {
+		/* shift rnum[0] items from S[0] to the right neighbor R[0] */
+		n = B_NR_ITEMS(tbS0);
+		switch (flag) {
+
+		case M_INSERT:	/* insert item */
+			if (n - tb->rnum[0] < item_pos) {	/* new item or its part falls to R[0] */
+				if (item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) {	/* part of new item falls into R[0] */
+					loff_t old_key_comp, old_len,
+					    r_zeros_number;
+					const char *r_body;
+					int version;
+					loff_t offset;
+
+					leaf_shift_right(tb, tb->rnum[0] - 1,
+							 -1);
+
+					version = ih_version(ih);
+					/* Remember key component and item length */
+					old_key_comp = le_ih_k_offset(ih);
+					old_len = ih_item_len(ih);
+
+					/* Calculate key component and item length to insert into R[0] */
+					offset =
+					    le_ih_k_offset(ih) +
+					    ((old_len -
+					      tb->
+					      rbytes) << (is_indirect_le_ih(ih)
+							  ? tb->tb_sb->
+							  s_blocksize_bits -
+							  UNFM_P_SHIFT : 0));
+					set_le_ih_k_offset(ih, offset);
+					put_ih_item_len(ih, tb->rbytes);
+					/* Insert part of the item into R[0] */
+					bi.tb = tb;
+					bi.bi_bh = tb->R[0];
+					bi.bi_parent = tb->FR[0];
+					bi.bi_position =
+					    get_right_neighbor_position(tb, 0);
+					if ((old_len - tb->rbytes) > zeros_num) {
+						r_zeros_number = 0;
+						r_body =
+						    body + (old_len -
+							    tb->rbytes) -
+						    zeros_num;
+					} else {
+						r_body = body;
+						r_zeros_number =
+						    zeros_num - (old_len -
+								 tb->rbytes);
+						zeros_num -= r_zeros_number;
+					}
+
+					leaf_insert_into_buf(&bi, 0, ih, r_body,
+							     r_zeros_number);
+
+					/* Replace right delimiting key by first key in R[0] */
+					replace_key(tb, tb->CFR[0], tb->rkey[0],
+						    tb->R[0], 0);
+
+					/* Calculate key component and item length to insert into S[0] */
+					set_le_ih_k_offset(ih, old_key_comp);
+					put_ih_item_len(ih,
+							old_len - tb->rbytes);
+
+					tb->insert_size[0] -= tb->rbytes;
+
+				} else {	/* whole new item falls into R[0] */
+
+					/* Shift rnum[0]-1 items to R[0] */
+					ret_val =
+					    leaf_shift_right(tb,
+							     tb->rnum[0] - 1,
+							     tb->rbytes);
+					/* Insert new item into R[0] */
+					bi.tb = tb;
+					bi.bi_bh = tb->R[0];
+					bi.bi_parent = tb->FR[0];
+					bi.bi_position =
+					    get_right_neighbor_position(tb, 0);
+					leaf_insert_into_buf(&bi,
+							     item_pos - n +
+							     tb->rnum[0] - 1,
+							     ih, body,
+							     zeros_num);
+
+					if (item_pos - n + tb->rnum[0] - 1 == 0) {
+						replace_key(tb, tb->CFR[0],
+							    tb->rkey[0],
+							    tb->R[0], 0);
+
+					}
+					zeros_num = tb->insert_size[0] = 0;
+				}
+			} else {	/* new item or part of it doesn't fall into R[0] */
 
-		}
-		else /* whole new item falls into R[0] */
-		{					  
-		    /* Shift rnum[0]-1 items to R[0] */
-		    ret_val = leaf_shift_right(tb,tb->rnum[0]-1,tb->rbytes);
-		    /* Insert new item into R[0] */
-		    bi.tb = tb;
-		    bi.bi_bh = tb->R[0];
-		    bi.bi_parent = tb->FR[0];
-		    bi.bi_position = get_right_neighbor_position (tb, 0);
-		    leaf_insert_into_buf (&bi, item_pos - n + tb->rnum[0] - 1, ih, body, zeros_num);
-
-		    if ( item_pos - n + tb->rnum[0] - 1 == 0 ) {
-			replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0);
-
-		    }
-		    zeros_num = tb->insert_size[0] = 0;
-		}
-	    }
-	    else /* new item or part of it doesn't fall into R[0] */
-	    {
-		leaf_shift_right(tb,tb->rnum[0],tb->rbytes);
-	    }
-	    break;
-
-	case M_PASTE:   /* append item */
-
-	    if ( n - tb->rnum[0] <= item_pos )  /* pasted item or part of it falls to R[0] */
-	    {
-		if ( item_pos == n - tb->rnum[0] && tb->rbytes != -1 )
-		{ /* we must shift the part of the appended item */
-		    if ( is_direntry_le_ih (B_N_PITEM_HEAD(tbS0, item_pos)))
-		    { /* we append to directory item */
-			int entry_count;
-
-			RFALSE( zeros_num,
-				"PAP-12145: invalid parameter in case of a directory");
-			entry_count = I_ENTRY_COUNT(B_N_PITEM_HEAD(tbS0, item_pos));
-			if ( entry_count - tb->rbytes < pos_in_item )
-			    /* new directory entry falls into R[0] */
-			{
-			    int paste_entry_position;
-
-			    RFALSE( tb->rbytes - 1 >= entry_count || 
-				    ! tb->insert_size[0],
-				    "PAP-12150: no enough of entries to shift to R[0]: rbytes=%d, entry_count=%d",
-				    tb->rbytes, entry_count);
-			    /* Shift rnum[0]-1 items in whole. Shift rbytes-1 directory entries from directory item number rnum[0] */
-			    leaf_shift_right(tb,tb->rnum[0],tb->rbytes - 1);
-			    /* Paste given directory entry to directory item */
-			    paste_entry_position = pos_in_item - entry_count + tb->rbytes - 1;
-			    bi.tb = tb;
-			    bi.bi_bh = tb->R[0];
-			    bi.bi_parent = tb->FR[0];
-			    bi.bi_position = get_right_neighbor_position (tb, 0);
-			    leaf_paste_in_buffer (&bi, 0, paste_entry_position,
-						  tb->insert_size[0],body,zeros_num);
-			    /* paste entry */
-			    leaf_paste_entries (
-				bi.bi_bh, 0, paste_entry_position, 1, (struct reiserfs_de_head *)body, 
-				body + DEH_SIZE, tb->insert_size[0]
-				);								
-						
-			    if ( paste_entry_position == 0 ) {
-				/* change delimiting keys */
-				replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0);
-			    }
-
-			    tb->insert_size[0] = 0;
-			    pos_in_item++;
-			}
-			else /* new directory entry doesn't fall into R[0] */
-			{
-			    leaf_shift_right(tb,tb->rnum[0],tb->rbytes);
-			}
-		    }
-		    else /* regular object */
-		    {
-			int n_shift, n_rem, r_zeros_number;
-			const char * r_body;
-
-			/* Calculate number of bytes which must be shifted from appended item */
-			if ( (n_shift = tb->rbytes - tb->insert_size[0]) < 0 )
-			    n_shift = 0;
-
-			RFALSE(pos_in_item != ih_item_len(B_N_PITEM_HEAD (tbS0, item_pos)),
-			       "PAP-12155: invalid position to paste. ih_item_len=%d, pos_in_item=%d",
-                               pos_in_item, ih_item_len( B_N_PITEM_HEAD(tbS0,item_pos)));
-
-			leaf_shift_right(tb,tb->rnum[0],n_shift);
-			/* Calculate number of bytes which must remain in body after appending to R[0] */
-			if ( (n_rem = tb->insert_size[0] - tb->rbytes) < 0 )
-			    n_rem = 0;
-			
-			{
-			  int version;
-			  unsigned long temp_rem = n_rem;
-			  
-			  version = ih_version (B_N_PITEM_HEAD (tb->R[0],0));
-			  if (is_indirect_le_key(version,B_N_PKEY(tb->R[0],0))){
-			      temp_rem = n_rem << (tb->tb_sb->s_blocksize_bits -
-					 UNFM_P_SHIFT);
-			  }
-			  set_le_key_k_offset (version, B_N_PKEY(tb->R[0],0), 
-					       le_key_k_offset (version, B_N_PKEY(tb->R[0],0)) + temp_rem);
-			  set_le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0]), 
-					       le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) + temp_rem);
+				leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
 			}
+			break;
+
+		case M_PASTE:	/* append item */
+
+			if (n - tb->rnum[0] <= item_pos) {	/* pasted item or part of it falls to R[0] */
+				if (item_pos == n - tb->rnum[0] && tb->rbytes != -1) {	/* we must shift the part of the appended item */
+					if (is_direntry_le_ih(B_N_PITEM_HEAD(tbS0, item_pos))) {	/* we append to directory item */
+						int entry_count;
+
+						RFALSE(zeros_num,
+						       "PAP-12145: invalid parameter in case of a directory");
+						entry_count =
+						    I_ENTRY_COUNT(B_N_PITEM_HEAD
+								  (tbS0,
+								   item_pos));
+						if (entry_count - tb->rbytes <
+						    pos_in_item)
+							/* new directory entry falls into R[0] */
+						{
+							int paste_entry_position;
+
+							RFALSE(tb->rbytes - 1 >=
+							       entry_count
+							       || !tb->
+							       insert_size[0],
+							       "PAP-12150: no enough of entries to shift to R[0]: rbytes=%d, entry_count=%d",
+							       tb->rbytes,
+							       entry_count);
+							/* Shift rnum[0]-1 items in whole. Shift rbytes-1 directory entries from directory item number rnum[0] */
+							leaf_shift_right(tb,
+									 tb->
+									 rnum
+									 [0],
+									 tb->
+									 rbytes
+									 - 1);
+							/* Paste given directory entry to directory item */
+							paste_entry_position =
+							    pos_in_item -
+							    entry_count +
+							    tb->rbytes - 1;
+							bi.tb = tb;
+							bi.bi_bh = tb->R[0];
+							bi.bi_parent =
+							    tb->FR[0];
+							bi.bi_position =
+							    get_right_neighbor_position
+							    (tb, 0);
+							leaf_paste_in_buffer
+							    (&bi, 0,
+							     paste_entry_position,
+							     tb->insert_size[0],
+							     body, zeros_num);
+							/* paste entry */
+							leaf_paste_entries(bi.
+									   bi_bh,
+									   0,
+									   paste_entry_position,
+									   1,
+									   (struct
+									    reiserfs_de_head
+									    *)
+									   body,
+									   body
+									   +
+									   DEH_SIZE,
+									   tb->
+									   insert_size
+									   [0]
+							    );
+
+							if (paste_entry_position
+							    == 0) {
+								/* change delimiting keys */
+								replace_key(tb,
+									    tb->
+									    CFR
+									    [0],
+									    tb->
+									    rkey
+									    [0],
+									    tb->
+									    R
+									    [0],
+									    0);
+							}
+
+							tb->insert_size[0] = 0;
+							pos_in_item++;
+						} else {	/* new directory entry doesn't fall into R[0] */
+
+							leaf_shift_right(tb,
+									 tb->
+									 rnum
+									 [0],
+									 tb->
+									 rbytes);
+						}
+					} else {	/* regular object */
+
+						int n_shift, n_rem,
+						    r_zeros_number;
+						const char *r_body;
+
+						/* Calculate number of bytes which must be shifted from appended item */
+						if ((n_shift =
+						     tb->rbytes -
+						     tb->insert_size[0]) < 0)
+							n_shift = 0;
+
+						RFALSE(pos_in_item !=
+						       ih_item_len
+						       (B_N_PITEM_HEAD
+							(tbS0, item_pos)),
+						       "PAP-12155: invalid position to paste. ih_item_len=%d, pos_in_item=%d",
+						       pos_in_item,
+						       ih_item_len
+						       (B_N_PITEM_HEAD
+							(tbS0, item_pos)));
+
+						leaf_shift_right(tb,
+								 tb->rnum[0],
+								 n_shift);
+						/* Calculate number of bytes which must remain in body after appending to R[0] */
+						if ((n_rem =
+						     tb->insert_size[0] -
+						     tb->rbytes) < 0)
+							n_rem = 0;
+
+						{
+							int version;
+							unsigned long temp_rem =
+							    n_rem;
+
+							version =
+							    ih_version
+							    (B_N_PITEM_HEAD
+							     (tb->R[0], 0));
+							if (is_indirect_le_key
+							    (version,
+							     B_N_PKEY(tb->R[0],
+								      0))) {
+								temp_rem =
+								    n_rem <<
+								    (tb->tb_sb->
+								     s_blocksize_bits
+								     -
+								     UNFM_P_SHIFT);
+							}
+							set_le_key_k_offset
+							    (version,
+							     B_N_PKEY(tb->R[0],
+								      0),
+							     le_key_k_offset
+							     (version,
+							      B_N_PKEY(tb->R[0],
+								       0)) +
+							     temp_rem);
+							set_le_key_k_offset
+							    (version,
+							     B_N_PDELIM_KEY(tb->
+									    CFR
+									    [0],
+									    tb->
+									    rkey
+									    [0]),
+							     le_key_k_offset
+							     (version,
+							      B_N_PDELIM_KEY
+							      (tb->CFR[0],
+							       tb->rkey[0])) +
+							     temp_rem);
+						}
 /*		  k_offset (B_N_PKEY(tb->R[0],0)) += n_rem;
 		  k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/
-			do_balance_mark_internal_dirty (tb, tb->CFR[0], 0);
-
-			/* Append part of body into R[0] */
-			bi.tb = tb;
-			bi.bi_bh = tb->R[0];
-			bi.bi_parent = tb->FR[0];
-			bi.bi_position = get_right_neighbor_position (tb, 0);
-			if ( n_rem > zeros_num ) {
-			    r_zeros_number = 0;
-			    r_body = body + n_rem - zeros_num;
-			}
-			else {
-			    r_body = body;
-			    r_zeros_number = zeros_num - n_rem;
-			    zeros_num -= r_zeros_number;
-			}
-
-			leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem, r_body, r_zeros_number);
-
-			if (is_indirect_le_ih (B_N_PITEM_HEAD(tb->R[0],0))) {
+						do_balance_mark_internal_dirty
+						    (tb, tb->CFR[0], 0);
+
+						/* Append part of body into R[0] */
+						bi.tb = tb;
+						bi.bi_bh = tb->R[0];
+						bi.bi_parent = tb->FR[0];
+						bi.bi_position =
+						    get_right_neighbor_position
+						    (tb, 0);
+						if (n_rem > zeros_num) {
+							r_zeros_number = 0;
+							r_body =
+							    body + n_rem -
+							    zeros_num;
+						} else {
+							r_body = body;
+							r_zeros_number =
+							    zeros_num - n_rem;
+							zeros_num -=
+							    r_zeros_number;
+						}
+
+						leaf_paste_in_buffer(&bi, 0,
+								     n_shift,
+								     tb->
+								     insert_size
+								     [0] -
+								     n_rem,
+								     r_body,
+								     r_zeros_number);
+
+						if (is_indirect_le_ih
+						    (B_N_PITEM_HEAD
+						     (tb->R[0], 0))) {
 #if 0
-			    RFALSE( n_rem,
-				    "PAP-12160: paste more than one unformatted node pointer");
+							RFALSE(n_rem,
+							       "PAP-12160: paste more than one unformatted node pointer");
 #endif
-			    set_ih_free_space (B_N_PITEM_HEAD(tb->R[0],0), 0);
-			}
-			tb->insert_size[0] = n_rem;
-			if ( ! n_rem )
-			    pos_in_item ++;
-		    }
-		}
-		else /* pasted item in whole falls into R[0] */
-		{
-		    struct item_head * pasted;
+							set_ih_free_space
+							    (B_N_PITEM_HEAD
+							     (tb->R[0], 0), 0);
+						}
+						tb->insert_size[0] = n_rem;
+						if (!n_rem)
+							pos_in_item++;
+					}
+				} else {	/* pasted item in whole falls into R[0] */
+
+					struct item_head *pasted;
+
+					ret_val =
+					    leaf_shift_right(tb, tb->rnum[0],
+							     tb->rbytes);
+					/* append item in R[0] */
+					if (pos_in_item >= 0) {
+						bi.tb = tb;
+						bi.bi_bh = tb->R[0];
+						bi.bi_parent = tb->FR[0];
+						bi.bi_position =
+						    get_right_neighbor_position
+						    (tb, 0);
+						leaf_paste_in_buffer(&bi,
+								     item_pos -
+								     n +
+								     tb->
+								     rnum[0],
+								     pos_in_item,
+								     tb->
+								     insert_size
+								     [0], body,
+								     zeros_num);
+					}
+
+					/* paste new entry, if item is directory item */
+					pasted =
+					    B_N_PITEM_HEAD(tb->R[0],
+							   item_pos - n +
+							   tb->rnum[0]);
+					if (is_direntry_le_ih(pasted)
+					    && pos_in_item >= 0) {
+						leaf_paste_entries(bi.bi_bh,
+								   item_pos -
+								   n +
+								   tb->rnum[0],
+								   pos_in_item,
+								   1,
+								   (struct
+								    reiserfs_de_head
+								    *)body,
+								   body +
+								   DEH_SIZE,
+								   tb->
+								   insert_size
+								   [0]
+						    );
+						if (!pos_in_item) {
+
+							RFALSE(item_pos - n +
+							       tb->rnum[0],
+							       "PAP-12165: directory item must be first item of node when pasting is in 0th position");
+
+							/* update delimiting keys */
+							replace_key(tb,
+								    tb->CFR[0],
+								    tb->rkey[0],
+								    tb->R[0],
+								    0);
+						}
+					}
+
+					if (is_indirect_le_ih(pasted))
+						set_ih_free_space(pasted, 0);
+					zeros_num = tb->insert_size[0] = 0;
+				}
+			} else {	/* new item doesn't fall into R[0] */
 
-		    ret_val = leaf_shift_right(tb,tb->rnum[0],tb->rbytes);
-		    /* append item in R[0] */
-		    if ( pos_in_item >= 0 ) {
-			bi.tb = tb;
-			bi.bi_bh = tb->R[0];
-			bi.bi_parent = tb->FR[0];
-			bi.bi_position = get_right_neighbor_position (tb, 0);
-			leaf_paste_in_buffer(&bi,item_pos - n + tb->rnum[0], pos_in_item,
-					     tb->insert_size[0],body, zeros_num);
-		    }
-
-		    /* paste new entry, if item is directory item */
-		    pasted = B_N_PITEM_HEAD(tb->R[0], item_pos - n + tb->rnum[0]);
-		    if (is_direntry_le_ih (pasted) && pos_in_item >= 0 ) {
-			leaf_paste_entries (
-			    bi.bi_bh, item_pos - n + tb->rnum[0], pos_in_item, 1, 
-			    (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0]
-			    );
-			if ( ! pos_in_item ) {
-
-			    RFALSE( item_pos - n + tb->rnum[0],
-				    "PAP-12165: directory item must be first item of node when pasting is in 0th position");
-
-			    /* update delimiting keys */
-			    replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0);
+				leaf_shift_right(tb, tb->rnum[0], tb->rbytes);
 			}
-		    }
-
-		    if (is_indirect_le_ih (pasted))
-			set_ih_free_space (pasted, 0);
-		    zeros_num = tb->insert_size[0] = 0;
+			break;
+		default:	/* cases d and t */
+			reiserfs_panic(tb->tb_sb,
+				       "PAP-12175: balance_leaf: rnum > 0: unexpectable mode: %s(%d)",
+				       (flag ==
+					M_DELETE) ? "DELETE" : ((flag ==
+								 M_CUT) ? "CUT"
+								: "UNKNOWN"),
+				       flag);
 		}
-	    }
-	    else /* new item doesn't fall into R[0] */
-	    {
-		leaf_shift_right(tb,tb->rnum[0],tb->rbytes);
-	    }
-	    break;
-	default:    /* cases d and t */
-	    reiserfs_panic (tb->tb_sb, "PAP-12175: balance_leaf: rnum > 0: unexpectable mode: %s(%d)",
-			    (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
-	}
-    
-    }	/* tb->rnum[0] > 0 */
-
-
-    RFALSE( tb->blknum[0] > 3,
-	    "PAP-12180: blknum can not be %d. It must be <= 3",  tb->blknum[0]);
-    RFALSE( tb->blknum[0] < 0,
-	    "PAP-12185: blknum can not be %d. It must be >= 0",  tb->blknum[0]);
-
-    /* if while adding to a node we discover that it is possible to split
-       it in two, and merge the left part into the left neighbor and the
-       right part into the right neighbor, eliminating the node */
-    if ( tb->blknum[0] == 0 ) { /* node S[0] is empty now */
-
-	RFALSE( ! tb->lnum[0] || ! tb->rnum[0],
-	        "PAP-12190: lnum and rnum must not be zero");
-	/* if insertion was done before 0-th position in R[0], right
-	   delimiting key of the tb->L[0]'s and left delimiting key are
-	   not set correctly */
-	if (tb->CFL[0]) {
-	    if (!tb->CFR[0])
-		reiserfs_panic (tb->tb_sb, "vs-12195: balance_leaf: CFR not initialized");
-	    copy_key (B_N_PDELIM_KEY (tb->CFL[0], tb->lkey[0]), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]));
-	    do_balance_mark_internal_dirty (tb, tb->CFL[0], 0);
-	}
-
-	reiserfs_invalidate_buffer(tb,tbS0);									
-	return 0;
-    }
-
-
-    /* Fill new nodes that appear in place of S[0] */
 
-    /* I am told that this copying is because we need an array to enable
-       the looping code. -Hans */
-    snum[0] = tb->s1num,
-	snum[1] = tb->s2num;
-    sbytes[0] = tb->s1bytes;
-    sbytes[1] = tb->s2bytes;
-    for( i = tb->blknum[0] - 2; i >= 0; i-- ) {
-
-	RFALSE( !snum[i], "PAP-12200: snum[%d] == %d. Must be > 0", i, snum[i]);
+	}
 
-	/* here we shift from S to S_new nodes */
+	/* tb->rnum[0] > 0 */
+	RFALSE(tb->blknum[0] > 3,
+	       "PAP-12180: blknum can not be %d. It must be <= 3",
+	       tb->blknum[0]);
+	RFALSE(tb->blknum[0] < 0,
+	       "PAP-12185: blknum can not be %d. It must be >= 0",
+	       tb->blknum[0]);
+
+	/* if while adding to a node we discover that it is possible to split
+	   it in two, and merge the left part into the left neighbor and the
+	   right part into the right neighbor, eliminating the node */
+	if (tb->blknum[0] == 0) {	/* node S[0] is empty now */
+
+		RFALSE(!tb->lnum[0] || !tb->rnum[0],
+		       "PAP-12190: lnum and rnum must not be zero");
+		/* if insertion was done before 0-th position in R[0], right
+		   delimiting key of the tb->L[0]'s and left delimiting key are
+		   not set correctly */
+		if (tb->CFL[0]) {
+			if (!tb->CFR[0])
+				reiserfs_panic(tb->tb_sb,
+					       "vs-12195: balance_leaf: CFR not initialized");
+			copy_key(B_N_PDELIM_KEY(tb->CFL[0], tb->lkey[0]),
+				 B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]));
+			do_balance_mark_internal_dirty(tb, tb->CFL[0], 0);
+		}
 
-	S_new[i] = get_FEB(tb);
+		reiserfs_invalidate_buffer(tb, tbS0);
+		return 0;
+	}
 
-	/* initialized block type and tree level */
-        set_blkh_level( B_BLK_HEAD(S_new[i]), DISK_LEAF_NODE_LEVEL );
+	/* Fill new nodes that appear in place of S[0] */
+
+	/* I am told that this copying is because we need an array to enable
+	   the looping code. -Hans */
+	snum[0] = tb->s1num, snum[1] = tb->s2num;
+	sbytes[0] = tb->s1bytes;
+	sbytes[1] = tb->s2bytes;
+	for (i = tb->blknum[0] - 2; i >= 0; i--) {
+
+		RFALSE(!snum[i], "PAP-12200: snum[%d] == %d. Must be > 0", i,
+		       snum[i]);
+
+		/* here we shift from S to S_new nodes */
+
+		S_new[i] = get_FEB(tb);
+
+		/* initialized block type and tree level */
+		set_blkh_level(B_BLK_HEAD(S_new[i]), DISK_LEAF_NODE_LEVEL);
+
+		n = B_NR_ITEMS(tbS0);
+
+		switch (flag) {
+		case M_INSERT:	/* insert item */
+
+			if (n - snum[i] < item_pos) {	/* new item or it's part falls to first new node S_new[i] */
+				if (item_pos == n - snum[i] + 1 && sbytes[i] != -1) {	/* part of new item falls into S_new[i] */
+					int old_key_comp, old_len,
+					    r_zeros_number;
+					const char *r_body;
+					int version;
+
+					/* Move snum[i]-1 items from S[0] to S_new[i] */
+					leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
+							snum[i] - 1, -1,
+							S_new[i]);
+					/* Remember key component and item length */
+					version = ih_version(ih);
+					old_key_comp = le_ih_k_offset(ih);
+					old_len = ih_item_len(ih);
+
+					/* Calculate key component and item length to insert into S_new[i] */
+					set_le_ih_k_offset(ih,
+							   le_ih_k_offset(ih) +
+							   ((old_len -
+							     sbytes[i]) <<
+							    (is_indirect_le_ih
+							     (ih) ? tb->tb_sb->
+							     s_blocksize_bits -
+							     UNFM_P_SHIFT :
+							     0)));
+
+					put_ih_item_len(ih, sbytes[i]);
+
+					/* Insert part of the item into S_new[i] before 0-th item */
+					bi.tb = tb;
+					bi.bi_bh = S_new[i];
+					bi.bi_parent = NULL;
+					bi.bi_position = 0;
+
+					if ((old_len - sbytes[i]) > zeros_num) {
+						r_zeros_number = 0;
+						r_body =
+						    body + (old_len -
+							    sbytes[i]) -
+						    zeros_num;
+					} else {
+						r_body = body;
+						r_zeros_number =
+						    zeros_num - (old_len -
+								 sbytes[i]);
+						zeros_num -= r_zeros_number;
+					}
+
+					leaf_insert_into_buf(&bi, 0, ih, r_body,
+							     r_zeros_number);
+
+					/* Calculate key component and item length to insert into S[i] */
+					set_le_ih_k_offset(ih, old_key_comp);
+					put_ih_item_len(ih,
+							old_len - sbytes[i]);
+					tb->insert_size[0] -= sbytes[i];
+				} else {	/* whole new item falls into S_new[i] */
+
+					/* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */
+					leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
+							snum[i] - 1, sbytes[i],
+							S_new[i]);
+
+					/* Insert new item into S_new[i] */
+					bi.tb = tb;
+					bi.bi_bh = S_new[i];
+					bi.bi_parent = NULL;
+					bi.bi_position = 0;
+					leaf_insert_into_buf(&bi,
+							     item_pos - n +
+							     snum[i] - 1, ih,
+							     body, zeros_num);
+
+					zeros_num = tb->insert_size[0] = 0;
+				}
+			}
 
+			else {	/* new item or it part don't falls into S_new[i] */
 
-	n = B_NR_ITEMS(tbS0);
-	
-	switch (flag) {
-	case M_INSERT:   /* insert item */
-
-	    if ( n - snum[i] < item_pos )
-	    { /* new item or it's part falls to first new node S_new[i]*/
-		if ( item_pos == n - snum[i] + 1 && sbytes[i] != -1 )
-		{ /* part of new item falls into S_new[i] */
-		    int old_key_comp, old_len, r_zeros_number;
-		    const char * r_body;
-		    int version;
-
-		    /* Move snum[i]-1 items from S[0] to S_new[i] */
-		    leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i] - 1, -1, S_new[i]);
-		    /* Remember key component and item length */
-		    version = ih_version (ih);
-                    old_key_comp = le_ih_k_offset( ih );
-		    old_len = ih_item_len(ih);
-
-		    /* Calculate key component and item length to insert into S_new[i] */
-                    set_le_ih_k_offset( ih,
-                                le_ih_k_offset(ih) + ((old_len - sbytes[i] )<<(is_indirect_le_ih(ih)?tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT:0)) );
-
-		    put_ih_item_len( ih, sbytes[i] );
-
-		    /* Insert part of the item into S_new[i] before 0-th item */
-		    bi.tb = tb;
-		    bi.bi_bh = S_new[i];
-		    bi.bi_parent = NULL;
-		    bi.bi_position = 0;
-
-		    if ( (old_len - sbytes[i]) > zeros_num ) {
-			r_zeros_number = 0;
-			r_body = body + (old_len - sbytes[i]) - zeros_num;
-		    }
-		    else {
-			r_body = body;
-			r_zeros_number = zeros_num - (old_len - sbytes[i]);
-			zeros_num -= r_zeros_number;
-		    }
-
-		    leaf_insert_into_buf (&bi, 0, ih, r_body, r_zeros_number);
-
-		    /* Calculate key component and item length to insert into S[i] */
-                    set_le_ih_k_offset( ih, old_key_comp );
-		    put_ih_item_len( ih, old_len - sbytes[i] );
-		    tb->insert_size[0] -= sbytes[i];
-		}
-		else /* whole new item falls into S_new[i] */
-		{
-		    /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */
-		    leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i] - 1, sbytes[i], S_new[i]);
-
-		    /* Insert new item into S_new[i] */
-		    bi.tb = tb;
-		    bi.bi_bh = S_new[i];
-		    bi.bi_parent = NULL;
-		    bi.bi_position = 0;
-		    leaf_insert_into_buf (&bi, item_pos - n + snum[i] - 1, ih, body, zeros_num);
-
-		    zeros_num = tb->insert_size[0] = 0;
-		}
-	    }
-
-	    else /* new item or it part don't falls into S_new[i] */
-	    {
-		leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]);
-	    }
-	    break;
-
-	case M_PASTE:   /* append item */
-
-	    if ( n - snum[i] <= item_pos )  /* pasted item or part if it falls to S_new[i] */
-	    {
-		if ( item_pos == n - snum[i] && sbytes[i] != -1 )
-		{ /* we must shift part of the appended item */
-		    struct item_head * aux_ih;
-
-		    RFALSE( ih, "PAP-12210: ih must be 0");
-
-		    if ( is_direntry_le_ih (aux_ih = B_N_PITEM_HEAD(tbS0,item_pos))) {
-			/* we append to directory item */
-
-			int entry_count;
-		
-			entry_count = ih_entry_count(aux_ih);
-
-			if ( entry_count - sbytes[i] < pos_in_item  && pos_in_item <= entry_count ) {
-			    /* new directory entry falls into S_new[i] */
-		  
-			    RFALSE( ! tb->insert_size[0],
-				    "PAP-12215: insert_size is already 0");
-			    RFALSE( sbytes[i] - 1 >= entry_count,
-				    "PAP-12220: there are no so much entries (%d), only %d",
-				    sbytes[i] - 1, entry_count);
-
-			    /* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */
-			    leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i]-1, S_new[i]);
-			    /* Paste given directory entry to directory item */
-			    bi.tb = tb;
-			    bi.bi_bh = S_new[i];
-			    bi.bi_parent = NULL;
-			    bi.bi_position = 0;
-			    leaf_paste_in_buffer (&bi, 0, pos_in_item - entry_count + sbytes[i] - 1,
-						  tb->insert_size[0], body,zeros_num);
-			    /* paste new directory entry */
-			    leaf_paste_entries (
-				bi.bi_bh, 0, pos_in_item - entry_count + sbytes[i] - 1,
-				1, (struct reiserfs_de_head *)body, body + DEH_SIZE,
-				tb->insert_size[0]
-				);
-			    tb->insert_size[0] = 0;
-			    pos_in_item++;
-			} else { /* new directory entry doesn't fall into S_new[i] */
-			    leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]);
+				leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
+						snum[i], sbytes[i], S_new[i]);
 			}
-		    }
-		    else /* regular object */
-		    {
-			int n_shift, n_rem, r_zeros_number;
-			const char * r_body;
-
-			RFALSE( pos_in_item != ih_item_len(B_N_PITEM_HEAD(tbS0,item_pos)) ||
-			        tb->insert_size[0] <= 0,
-			        "PAP-12225: item too short or insert_size <= 0");
-
-			/* Calculate number of bytes which must be shifted from appended item */
-			n_shift = sbytes[i] - tb->insert_size[0];
-			if ( n_shift < 0 )
-			    n_shift = 0;
-			leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], n_shift, S_new[i]);
-
-			/* Calculate number of bytes which must remain in body after append to S_new[i] */
-			n_rem = tb->insert_size[0] - sbytes[i];
-			if ( n_rem < 0 )
-			    n_rem = 0;
-			/* Append part of body into S_new[0] */
-			bi.tb = tb;
-			bi.bi_bh = S_new[i];
-			bi.bi_parent = NULL;
-			bi.bi_position = 0;
+			break;
+
+		case M_PASTE:	/* append item */
+
+			if (n - snum[i] <= item_pos) {	/* pasted item or part if it falls to S_new[i] */
+				if (item_pos == n - snum[i] && sbytes[i] != -1) {	/* we must shift part of the appended item */
+					struct item_head *aux_ih;
+
+					RFALSE(ih, "PAP-12210: ih must be 0");
+
+					if (is_direntry_le_ih
+					    (aux_ih =
+					     B_N_PITEM_HEAD(tbS0, item_pos))) {
+						/* we append to directory item */
+
+						int entry_count;
+
+						entry_count =
+						    ih_entry_count(aux_ih);
+
+						if (entry_count - sbytes[i] <
+						    pos_in_item
+						    && pos_in_item <=
+						    entry_count) {
+							/* new directory entry falls into S_new[i] */
+
+							RFALSE(!tb->
+							       insert_size[0],
+							       "PAP-12215: insert_size is already 0");
+							RFALSE(sbytes[i] - 1 >=
+							       entry_count,
+							       "PAP-12220: there are no so much entries (%d), only %d",
+							       sbytes[i] - 1,
+							       entry_count);
+
+							/* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */
+							leaf_move_items
+							    (LEAF_FROM_S_TO_SNEW,
+							     tb, snum[i],
+							     sbytes[i] - 1,
+							     S_new[i]);
+							/* Paste given directory entry to directory item */
+							bi.tb = tb;
+							bi.bi_bh = S_new[i];
+							bi.bi_parent = NULL;
+							bi.bi_position = 0;
+							leaf_paste_in_buffer
+							    (&bi, 0,
+							     pos_in_item -
+							     entry_count +
+							     sbytes[i] - 1,
+							     tb->insert_size[0],
+							     body, zeros_num);
+							/* paste new directory entry */
+							leaf_paste_entries(bi.
+									   bi_bh,
+									   0,
+									   pos_in_item
+									   -
+									   entry_count
+									   +
+									   sbytes
+									   [i] -
+									   1, 1,
+									   (struct
+									    reiserfs_de_head
+									    *)
+									   body,
+									   body
+									   +
+									   DEH_SIZE,
+									   tb->
+									   insert_size
+									   [0]
+							    );
+							tb->insert_size[0] = 0;
+							pos_in_item++;
+						} else {	/* new directory entry doesn't fall into S_new[i] */
+							leaf_move_items
+							    (LEAF_FROM_S_TO_SNEW,
+							     tb, snum[i],
+							     sbytes[i],
+							     S_new[i]);
+						}
+					} else {	/* regular object */
+
+						int n_shift, n_rem,
+						    r_zeros_number;
+						const char *r_body;
+
+						RFALSE(pos_in_item !=
+						       ih_item_len
+						       (B_N_PITEM_HEAD
+							(tbS0, item_pos))
+						       || tb->insert_size[0] <=
+						       0,
+						       "PAP-12225: item too short or insert_size <= 0");
+
+						/* Calculate number of bytes which must be shifted from appended item */
+						n_shift =
+						    sbytes[i] -
+						    tb->insert_size[0];
+						if (n_shift < 0)
+							n_shift = 0;
+						leaf_move_items
+						    (LEAF_FROM_S_TO_SNEW, tb,
+						     snum[i], n_shift,
+						     S_new[i]);
+
+						/* Calculate number of bytes which must remain in body after append to S_new[i] */
+						n_rem =
+						    tb->insert_size[0] -
+						    sbytes[i];
+						if (n_rem < 0)
+							n_rem = 0;
+						/* Append part of body into S_new[0] */
+						bi.tb = tb;
+						bi.bi_bh = S_new[i];
+						bi.bi_parent = NULL;
+						bi.bi_position = 0;
+
+						if (n_rem > zeros_num) {
+							r_zeros_number = 0;
+							r_body =
+							    body + n_rem -
+							    zeros_num;
+						} else {
+							r_body = body;
+							r_zeros_number =
+							    zeros_num - n_rem;
+							zeros_num -=
+							    r_zeros_number;
+						}
+
+						leaf_paste_in_buffer(&bi, 0,
+								     n_shift,
+								     tb->
+								     insert_size
+								     [0] -
+								     n_rem,
+								     r_body,
+								     r_zeros_number);
+						{
+							struct item_head *tmp;
+
+							tmp =
+							    B_N_PITEM_HEAD(S_new
+									   [i],
+									   0);
+							if (is_indirect_le_ih
+							    (tmp)) {
+								set_ih_free_space
+								    (tmp, 0);
+								set_le_ih_k_offset
+								    (tmp,
+								     le_ih_k_offset
+								     (tmp) +
+								     (n_rem <<
+								      (tb->
+								       tb_sb->
+								       s_blocksize_bits
+								       -
+								       UNFM_P_SHIFT)));
+							} else {
+								set_le_ih_k_offset
+								    (tmp,
+								     le_ih_k_offset
+								     (tmp) +
+								     n_rem);
+							}
+						}
+
+						tb->insert_size[0] = n_rem;
+						if (!n_rem)
+							pos_in_item++;
+					}
+				} else
+					/* item falls wholly into S_new[i] */
+				{
+					int ret_val;
+					struct item_head *pasted;
 
-			if ( n_rem > zeros_num ) {
-			    r_zeros_number = 0;
-			    r_body = body + n_rem - zeros_num;
-			}
-			else {
-			    r_body = body;
-			    r_zeros_number = zeros_num - n_rem;
-			    zeros_num -= r_zeros_number;
+#ifdef CONFIG_REISERFS_CHECK
+					struct item_head *ih =
+					    B_N_PITEM_HEAD(tbS0, item_pos);
+
+					if (!is_direntry_le_ih(ih)
+					    && (pos_in_item != ih_item_len(ih)
+						|| tb->insert_size[0] <= 0))
+						reiserfs_panic(tb->tb_sb,
+							       "PAP-12235: balance_leaf: pos_in_item must be equal to ih_item_len");
+#endif				/* CONFIG_REISERFS_CHECK */
+
+					ret_val =
+					    leaf_move_items(LEAF_FROM_S_TO_SNEW,
+							    tb, snum[i],
+							    sbytes[i],
+							    S_new[i]);
+
+					RFALSE(ret_val,
+					       "PAP-12240: unexpected value returned by leaf_move_items (%d)",
+					       ret_val);
+
+					/* paste into item */
+					bi.tb = tb;
+					bi.bi_bh = S_new[i];
+					bi.bi_parent = NULL;
+					bi.bi_position = 0;
+					leaf_paste_in_buffer(&bi,
+							     item_pos - n +
+							     snum[i],
+							     pos_in_item,
+							     tb->insert_size[0],
+							     body, zeros_num);
+
+					pasted =
+					    B_N_PITEM_HEAD(S_new[i],
+							   item_pos - n +
+							   snum[i]);
+					if (is_direntry_le_ih(pasted)) {
+						leaf_paste_entries(bi.bi_bh,
+								   item_pos -
+								   n + snum[i],
+								   pos_in_item,
+								   1,
+								   (struct
+								    reiserfs_de_head
+								    *)body,
+								   body +
+								   DEH_SIZE,
+								   tb->
+								   insert_size
+								   [0]
+						    );
+					}
+
+					/* if we paste to indirect item update ih_free_space */
+					if (is_indirect_le_ih(pasted))
+						set_ih_free_space(pasted, 0);
+					zeros_num = tb->insert_size[0] = 0;
+				}
 			}
 
-			leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0]-n_rem, r_body,r_zeros_number);
-			{
-			    struct item_head * tmp;
-
-			    tmp = B_N_PITEM_HEAD(S_new[i],0);
-			    if (is_indirect_le_ih (tmp)) {
-				set_ih_free_space (tmp, 0);
-				set_le_ih_k_offset( tmp, le_ih_k_offset(tmp) + 
-					            (n_rem << (tb->tb_sb->s_blocksize_bits - UNFM_P_SHIFT)));
-			    } else {
-				set_le_ih_k_offset( tmp, le_ih_k_offset(tmp) + 
-				                    n_rem );
-			    }
-			}
+			else {	/* pasted item doesn't fall into S_new[i] */
 
-			tb->insert_size[0] = n_rem;
-			if ( ! n_rem )
-			    pos_in_item++;
-		    }
+				leaf_move_items(LEAF_FROM_S_TO_SNEW, tb,
+						snum[i], sbytes[i], S_new[i]);
+			}
+			break;
+		default:	/* cases d and t */
+			reiserfs_panic(tb->tb_sb,
+				       "PAP-12245: balance_leaf: blknum > 2: unexpectable mode: %s(%d)",
+				       (flag ==
+					M_DELETE) ? "DELETE" : ((flag ==
+								 M_CUT) ? "CUT"
+								: "UNKNOWN"),
+				       flag);
 		}
-		else
-		    /* item falls wholly into S_new[i] */
-		{
-		    int ret_val;
-		    struct item_head * pasted;
 
-#ifdef CONFIG_REISERFS_CHECK
-		    struct item_head * ih = B_N_PITEM_HEAD(tbS0,item_pos);
-
-		    if ( ! is_direntry_le_ih(ih) && (pos_in_item != ih_item_len(ih) ||
-						     tb->insert_size[0] <= 0) )
-			reiserfs_panic (tb->tb_sb, "PAP-12235: balance_leaf: pos_in_item must be equal to ih_item_len");
-#endif /* CONFIG_REISERFS_CHECK */
-
-		    ret_val = leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]);
-
-		    RFALSE( ret_val,
-			    "PAP-12240: unexpected value returned by leaf_move_items (%d)",
-			    ret_val);
-
-		    /* paste into item */
-		    bi.tb = tb;
-		    bi.bi_bh = S_new[i];
-		    bi.bi_parent = NULL;
-		    bi.bi_position = 0;
-		    leaf_paste_in_buffer(&bi, item_pos - n + snum[i], pos_in_item, tb->insert_size[0], body, zeros_num);
-
-		    pasted = B_N_PITEM_HEAD(S_new[i], item_pos - n + snum[i]);
-		    if (is_direntry_le_ih (pasted))
-		    {
-			leaf_paste_entries (
-			    bi.bi_bh, item_pos - n + snum[i], pos_in_item, 1, 
-			    (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0]
-			    );
-		    }
-
-		    /* if we paste to indirect item update ih_free_space */
-		    if (is_indirect_le_ih (pasted))
-			set_ih_free_space (pasted, 0);
-		    zeros_num = tb->insert_size[0] = 0;
-		}
-	    }
-
-	    else /* pasted item doesn't fall into S_new[i] */
-	    {
-		leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]);
-	    }
-	    break;
-	default:    /* cases d and t */
-	    reiserfs_panic (tb->tb_sb, "PAP-12245: balance_leaf: blknum > 2: unexpectable mode: %s(%d)",
-			    (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag);
+		memcpy(insert_key + i, B_N_PKEY(S_new[i], 0), KEY_SIZE);
+		insert_ptr[i] = S_new[i];
+
+		RFALSE(!buffer_journaled(S_new[i])
+		       || buffer_journal_dirty(S_new[i])
+		       || buffer_dirty(S_new[i]), "PAP-12247: S_new[%d] : (%b)",
+		       i, S_new[i]);
 	}
 
-	memcpy (insert_key + i,B_N_PKEY(S_new[i],0),KEY_SIZE);
-	insert_ptr[i] = S_new[i];
-
-	RFALSE (!buffer_journaled (S_new [i]) || buffer_journal_dirty (S_new [i]) ||
-		buffer_dirty (S_new [i]),
-		"PAP-12247: S_new[%d] : (%b)", i, S_new[i]);
-    }
-
-    /* if the affected item was not wholly shifted then we perform all necessary operations on that part or whole of the
-       affected item which remains in S */
-    if ( 0 <= item_pos && item_pos < tb->s0num )
-    { /* if we must insert or append into buffer S[0] */
-
-	switch (flag)
-	{
-	case M_INSERT:   /* insert item into S[0] */
-	    bi.tb = tb;
-	    bi.bi_bh = tbS0;
-	    bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
-	    bi.bi_position = PATH_H_POSITION (tb->tb_path, 1);
-	    leaf_insert_into_buf (&bi, item_pos, ih, body, zeros_num);
-
-	    /* If we insert the first key change the delimiting key */
-	    if( item_pos == 0 ) {
-		if (tb->CFL[0]) /* can be 0 in reiserfsck */
-		    replace_key(tb, tb->CFL[0], tb->lkey[0],tbS0,0);
-
-	    }
-	    break;
-
-	case M_PASTE: {  /* append item in S[0] */
-	    struct item_head * pasted;
-
-	    pasted = B_N_PITEM_HEAD (tbS0, item_pos);
-	    /* when directory, may be new entry already pasted */
-	    if (is_direntry_le_ih (pasted)) {
-		if ( pos_in_item >= 0 &&
-		    pos_in_item <= ih_entry_count(pasted) ) {
-
-		    RFALSE( ! tb->insert_size[0], 
-			    "PAP-12260: insert_size is 0 already");
-
-		    /* prepare space */
-		    bi.tb = tb;
-		    bi.bi_bh = tbS0;
-		    bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
-		    bi.bi_position = PATH_H_POSITION (tb->tb_path, 1);
-		    leaf_paste_in_buffer(&bi, item_pos, pos_in_item, tb->insert_size[0], body, zeros_num);
-
-		    /* paste entry */
-		    leaf_paste_entries (
-			bi.bi_bh, item_pos, pos_in_item, 1, (struct reiserfs_de_head *)body,
-			body + DEH_SIZE, tb->insert_size[0]
-			);
-		    if ( ! item_pos && ! pos_in_item ) {
-			RFALSE( !tb->CFL[0] || !tb->L[0], 
-				"PAP-12270: CFL[0]/L[0] must be specified");
-			if (tb->CFL[0]) {
-			    replace_key(tb, tb->CFL[0], tb->lkey[0],tbS0,0);
+	/* if the affected item was not wholly shifted then we perform all necessary operations on that part or whole of the
+	   affected item which remains in S */
+	if (0 <= item_pos && item_pos < tb->s0num) {	/* if we must insert or append into buffer S[0] */
+
+		switch (flag) {
+		case M_INSERT:	/* insert item into S[0] */
+			bi.tb = tb;
+			bi.bi_bh = tbS0;
+			bi.bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
+			bi.bi_position = PATH_H_POSITION(tb->tb_path, 1);
+			leaf_insert_into_buf(&bi, item_pos, ih, body,
+					     zeros_num);
+
+			/* If we insert the first key change the delimiting key */
+			if (item_pos == 0) {
+				if (tb->CFL[0])	/* can be 0 in reiserfsck */
+					replace_key(tb, tb->CFL[0], tb->lkey[0],
+						    tbS0, 0);
 
 			}
-		    }
-		    tb->insert_size[0] = 0;
-		}
-	    } else { /* regular object */
-		if ( pos_in_item == ih_item_len(pasted) ) {
-
-		    RFALSE( tb->insert_size[0] <= 0,
-			    "PAP-12275: insert size must not be %d",
-                            tb->insert_size[0]);
-		    bi.tb = tb;
-		    bi.bi_bh = tbS0;
-		    bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
-		    bi.bi_position = PATH_H_POSITION (tb->tb_path, 1);
-		    leaf_paste_in_buffer (&bi, item_pos, pos_in_item, tb->insert_size[0], body, zeros_num);
-
-		    if (is_indirect_le_ih (pasted)) {
+			break;
+
+		case M_PASTE:{	/* append item in S[0] */
+				struct item_head *pasted;
+
+				pasted = B_N_PITEM_HEAD(tbS0, item_pos);
+				/* when directory, may be new entry already pasted */
+				if (is_direntry_le_ih(pasted)) {
+					if (pos_in_item >= 0 &&
+					    pos_in_item <=
+					    ih_entry_count(pasted)) {
+
+						RFALSE(!tb->insert_size[0],
+						       "PAP-12260: insert_size is 0 already");
+
+						/* prepare space */
+						bi.tb = tb;
+						bi.bi_bh = tbS0;
+						bi.bi_parent =
+						    PATH_H_PPARENT(tb->tb_path,
+								   0);
+						bi.bi_position =
+						    PATH_H_POSITION(tb->tb_path,
+								    1);
+						leaf_paste_in_buffer(&bi,
+								     item_pos,
+								     pos_in_item,
+								     tb->
+								     insert_size
+								     [0], body,
+								     zeros_num);
+
+						/* paste entry */
+						leaf_paste_entries(bi.bi_bh,
+								   item_pos,
+								   pos_in_item,
+								   1,
+								   (struct
+								    reiserfs_de_head
+								    *)body,
+								   body +
+								   DEH_SIZE,
+								   tb->
+								   insert_size
+								   [0]
+						    );
+						if (!item_pos && !pos_in_item) {
+							RFALSE(!tb->CFL[0]
+							       || !tb->L[0],
+							       "PAP-12270: CFL[0]/L[0] must be specified");
+							if (tb->CFL[0]) {
+								replace_key(tb,
+									    tb->
+									    CFL
+									    [0],
+									    tb->
+									    lkey
+									    [0],
+									    tbS0,
+									    0);
+
+							}
+						}
+						tb->insert_size[0] = 0;
+					}
+				} else {	/* regular object */
+					if (pos_in_item == ih_item_len(pasted)) {
+
+						RFALSE(tb->insert_size[0] <= 0,
+						       "PAP-12275: insert size must not be %d",
+						       tb->insert_size[0]);
+						bi.tb = tb;
+						bi.bi_bh = tbS0;
+						bi.bi_parent =
+						    PATH_H_PPARENT(tb->tb_path,
+								   0);
+						bi.bi_position =
+						    PATH_H_POSITION(tb->tb_path,
+								    1);
+						leaf_paste_in_buffer(&bi,
+								     item_pos,
+								     pos_in_item,
+								     tb->
+								     insert_size
+								     [0], body,
+								     zeros_num);
+
+						if (is_indirect_le_ih(pasted)) {
 #if 0
-			RFALSE( tb->insert_size[0] != UNFM_P_SIZE,
-				"PAP-12280: insert_size for indirect item must be %d, not %d",
-				UNFM_P_SIZE, tb->insert_size[0]);
+							RFALSE(tb->
+							       insert_size[0] !=
+							       UNFM_P_SIZE,
+							       "PAP-12280: insert_size for indirect item must be %d, not %d",
+							       UNFM_P_SIZE,
+							       tb->
+							       insert_size[0]);
 #endif
-			set_ih_free_space (pasted, 0);
-		    }
-		    tb->insert_size[0] = 0;
-		}
-
+							set_ih_free_space
+							    (pasted, 0);
+						}
+						tb->insert_size[0] = 0;
+					}
 #ifdef CONFIG_REISERFS_CHECK
-		else {
-		    if ( tb->insert_size[0] ) {
-			print_cur_tb ("12285");
-			reiserfs_panic (tb->tb_sb, "PAP-12285: balance_leaf: insert_size must be 0 (%d)", tb->insert_size[0]);
-		    }
+					else {
+						if (tb->insert_size[0]) {
+							print_cur_tb("12285");
+							reiserfs_panic(tb->
+								       tb_sb,
+								       "PAP-12285: balance_leaf: insert_size must be 0 (%d)",
+								       tb->
+								       insert_size
+								       [0]);
+						}
+					}
+#endif				/* CONFIG_REISERFS_CHECK */
+
+				}
+			}	/* case M_PASTE: */
 		}
-#endif /* CONFIG_REISERFS_CHECK */
-	    
-	    }
-	} /* case M_PASTE: */
 	}
-    }
-
 #ifdef CONFIG_REISERFS_CHECK
-    if ( flag == M_PASTE && tb->insert_size[0] ) {
-	print_cur_tb ("12290");
-	reiserfs_panic (tb->tb_sb, "PAP-12290: balance_leaf: insert_size is still not 0 (%d)", tb->insert_size[0]);
-    }
-#endif /* CONFIG_REISERFS_CHECK */
-
-    return 0;
-} /* Leaf level of the tree is balanced (end of balance_leaf) */
-
+	if (flag == M_PASTE && tb->insert_size[0]) {
+		print_cur_tb("12290");
+		reiserfs_panic(tb->tb_sb,
+			       "PAP-12290: balance_leaf: insert_size is still not 0 (%d)",
+			       tb->insert_size[0]);
+	}
+#endif				/* CONFIG_REISERFS_CHECK */
 
+	return 0;
+}				/* Leaf level of the tree is balanced (end of balance_leaf) */
 
 /* Make empty node */
-void make_empty_node (struct buffer_info * bi)
+void make_empty_node(struct buffer_info *bi)
 {
-    struct block_head * blkh;
+	struct block_head *blkh;
 
-    RFALSE( bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL");
+	RFALSE(bi->bi_bh == NULL, "PAP-12295: pointer to the buffer is NULL");
 
-    blkh = B_BLK_HEAD(bi->bi_bh);
-    set_blkh_nr_item( blkh, 0 );
-    set_blkh_free_space( blkh, MAX_CHILD_SIZE(bi->bi_bh) );
+	blkh = B_BLK_HEAD(bi->bi_bh);
+	set_blkh_nr_item(blkh, 0);
+	set_blkh_free_space(blkh, MAX_CHILD_SIZE(bi->bi_bh));
 
-    if (bi->bi_parent)
-	B_N_CHILD (bi->bi_parent, bi->bi_position)->dc_size = 0; /* Endian safe if 0 */
+	if (bi->bi_parent)
+		B_N_CHILD(bi->bi_parent, bi->bi_position)->dc_size = 0;	/* Endian safe if 0 */
 }
 
-
 /* Get first empty buffer */
-struct buffer_head * get_FEB (struct tree_balance * tb)
+struct buffer_head *get_FEB(struct tree_balance *tb)
 {
-    int i;
-    struct buffer_head * first_b;
-    struct buffer_info bi;
-
-    for (i = 0; i < MAX_FEB_SIZE; i ++)
-	if (tb->FEB[i] != 0)
-	    break;
-
-    if (i == MAX_FEB_SIZE)
-	reiserfs_panic(tb->tb_sb, "vs-12300: get_FEB: FEB list is empty");
-
-    bi.tb = tb;
-    bi.bi_bh = first_b = tb->FEB[i];
-    bi.bi_parent = NULL;
-    bi.bi_position = 0;
-    make_empty_node (&bi);
-    set_buffer_uptodate(first_b);
-    tb->FEB[i] = NULL;
-    tb->used[i] = first_b;
-
-    return(first_b);
-}
+	int i;
+	struct buffer_head *first_b;
+	struct buffer_info bi;
 
+	for (i = 0; i < MAX_FEB_SIZE; i++)
+		if (tb->FEB[i] != 0)
+			break;
+
+	if (i == MAX_FEB_SIZE)
+		reiserfs_panic(tb->tb_sb,
+			       "vs-12300: get_FEB: FEB list is empty");
+
+	bi.tb = tb;
+	bi.bi_bh = first_b = tb->FEB[i];
+	bi.bi_parent = NULL;
+	bi.bi_position = 0;
+	make_empty_node(&bi);
+	set_buffer_uptodate(first_b);
+	tb->FEB[i] = NULL;
+	tb->used[i] = first_b;
+
+	return (first_b);
+}
 
 /* This is now used because reiserfs_free_block has to be able to
 ** schedule.
 */
-static void store_thrown (struct tree_balance * tb, struct buffer_head * bh)
+static void store_thrown(struct tree_balance *tb, struct buffer_head *bh)
 {
-    int i;
-
-    if (buffer_dirty (bh))
-      reiserfs_warning (tb->tb_sb, "store_thrown deals with dirty buffer");
-    for (i = 0; i < sizeof (tb->thrown)/sizeof (tb->thrown[0]); i ++)
-	if (!tb->thrown[i]) {
-	    tb->thrown[i] = bh;
-	    get_bh(bh) ; /* free_thrown puts this */
-	    return;
-	}
-    reiserfs_warning (tb->tb_sb, "store_thrown: too many thrown buffers");
+	int i;
+
+	if (buffer_dirty(bh))
+		reiserfs_warning(tb->tb_sb,
+				 "store_thrown deals with dirty buffer");
+	for (i = 0; i < sizeof(tb->thrown) / sizeof(tb->thrown[0]); i++)
+		if (!tb->thrown[i]) {
+			tb->thrown[i] = bh;
+			get_bh(bh);	/* free_thrown puts this */
+			return;
+		}
+	reiserfs_warning(tb->tb_sb, "store_thrown: too many thrown buffers");
 }
 
-static void free_thrown(struct tree_balance *tb) {
-    int i ;
-    b_blocknr_t blocknr ;
-    for (i = 0; i < sizeof (tb->thrown)/sizeof (tb->thrown[0]); i++) {
-	if (tb->thrown[i]) {
-	    blocknr = tb->thrown[i]->b_blocknr ;
-	    if (buffer_dirty (tb->thrown[i]))
-	      reiserfs_warning (tb->tb_sb,
-				"free_thrown deals with dirty buffer %d",
-				blocknr);
-	    brelse(tb->thrown[i]) ; /* incremented in store_thrown */
-	    reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0);
+static void free_thrown(struct tree_balance *tb)
+{
+	int i;
+	b_blocknr_t blocknr;
+	for (i = 0; i < sizeof(tb->thrown) / sizeof(tb->thrown[0]); i++) {
+		if (tb->thrown[i]) {
+			blocknr = tb->thrown[i]->b_blocknr;
+			if (buffer_dirty(tb->thrown[i]))
+				reiserfs_warning(tb->tb_sb,
+						 "free_thrown deals with dirty buffer %d",
+						 blocknr);
+			brelse(tb->thrown[i]);	/* incremented in store_thrown */
+			reiserfs_free_block(tb->transaction_handle, NULL,
+					    blocknr, 0);
+		}
 	}
-    }
 }
 
-void reiserfs_invalidate_buffer (struct tree_balance * tb, struct buffer_head * bh)
+void reiserfs_invalidate_buffer(struct tree_balance *tb, struct buffer_head *bh)
 {
-    struct block_head *blkh;
-    blkh = B_BLK_HEAD(bh);
-    set_blkh_level( blkh, FREE_LEVEL );
-    set_blkh_nr_item( blkh, 0 );
-    
-    clear_buffer_dirty(bh);
-    store_thrown (tb, bh);
+	struct block_head *blkh;
+	blkh = B_BLK_HEAD(bh);
+	set_blkh_level(blkh, FREE_LEVEL);
+	set_blkh_nr_item(blkh, 0);
+
+	clear_buffer_dirty(bh);
+	store_thrown(tb, bh);
 }
 
 /* Replace n_dest'th key in buffer dest by n_src'th key of buffer src.*/
-void replace_key (struct tree_balance * tb, struct buffer_head * dest, int n_dest,
-		  struct buffer_head * src, int n_src)
+void replace_key(struct tree_balance *tb, struct buffer_head *dest, int n_dest,
+		 struct buffer_head *src, int n_src)
 {
 
-    RFALSE( dest == NULL || src == NULL,
-	    "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)",
-	    src, dest);
-    RFALSE( ! B_IS_KEYS_LEVEL (dest),
-	    "vs-12310: invalid level (%z) for destination buffer. dest must be leaf",
-	    dest);
-    RFALSE( n_dest < 0 || n_src < 0,
-	    "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest);
-    RFALSE( n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src),
-	    "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big",
-	    n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest));
-   
-    if (B_IS_ITEMS_LEVEL (src))
-	/* source buffer contains leaf node */
-	memcpy (B_N_PDELIM_KEY(dest,n_dest), B_N_PITEM_HEAD(src,n_src), KEY_SIZE);
-    else
-	memcpy (B_N_PDELIM_KEY(dest,n_dest), B_N_PDELIM_KEY(src,n_src), KEY_SIZE);
-
-    do_balance_mark_internal_dirty (tb, dest, 0);
+	RFALSE(dest == NULL || src == NULL,
+	       "vs-12305: source or destination buffer is 0 (src=%p, dest=%p)",
+	       src, dest);
+	RFALSE(!B_IS_KEYS_LEVEL(dest),
+	       "vs-12310: invalid level (%z) for destination buffer. dest must be leaf",
+	       dest);
+	RFALSE(n_dest < 0 || n_src < 0,
+	       "vs-12315: src(%d) or dest(%d) key number < 0", n_src, n_dest);
+	RFALSE(n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src),
+	       "vs-12320: src(%d(%d)) or dest(%d(%d)) key number is too big",
+	       n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest));
+
+	if (B_IS_ITEMS_LEVEL(src))
+		/* source buffer contains leaf node */
+		memcpy(B_N_PDELIM_KEY(dest, n_dest), B_N_PITEM_HEAD(src, n_src),
+		       KEY_SIZE);
+	else
+		memcpy(B_N_PDELIM_KEY(dest, n_dest), B_N_PDELIM_KEY(src, n_src),
+		       KEY_SIZE);
+
+	do_balance_mark_internal_dirty(tb, dest, 0);
 }
 
-
-int get_left_neighbor_position (
-				struct tree_balance * tb, 
-				int h
-				)
+int get_left_neighbor_position(struct tree_balance *tb, int h)
 {
-  int Sh_position = PATH_H_POSITION (tb->tb_path, h + 1);
+	int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1);
 
-  RFALSE( PATH_H_PPARENT (tb->tb_path, h) == 0 || tb->FL[h] == 0,
-	  "vs-12325: FL[%d](%p) or F[%d](%p) does not exist", 
-	  h, tb->FL[h], h, PATH_H_PPARENT (tb->tb_path, h));
+	RFALSE(PATH_H_PPARENT(tb->tb_path, h) == 0 || tb->FL[h] == 0,
+	       "vs-12325: FL[%d](%p) or F[%d](%p) does not exist",
+	       h, tb->FL[h], h, PATH_H_PPARENT(tb->tb_path, h));
 
-  if (Sh_position == 0)
-    return B_NR_ITEMS (tb->FL[h]);
-  else
-    return Sh_position - 1;
+	if (Sh_position == 0)
+		return B_NR_ITEMS(tb->FL[h]);
+	else
+		return Sh_position - 1;
 }
 
-
-int get_right_neighbor_position (struct tree_balance * tb, int h)
+int get_right_neighbor_position(struct tree_balance *tb, int h)
 {
-  int Sh_position = PATH_H_POSITION (tb->tb_path, h + 1);
+	int Sh_position = PATH_H_POSITION(tb->tb_path, h + 1);
 
-  RFALSE( PATH_H_PPARENT (tb->tb_path, h) == 0 || tb->FR[h] == 0,
-	  "vs-12330: F[%d](%p) or FR[%d](%p) does not exist", 
-	  h, PATH_H_PPARENT (tb->tb_path, h), h, tb->FR[h]);
+	RFALSE(PATH_H_PPARENT(tb->tb_path, h) == 0 || tb->FR[h] == 0,
+	       "vs-12330: F[%d](%p) or FR[%d](%p) does not exist",
+	       h, PATH_H_PPARENT(tb->tb_path, h), h, tb->FR[h]);
 
-  if (Sh_position == B_NR_ITEMS (PATH_H_PPARENT (tb->tb_path, h)))
-    return 0;
-  else
-    return Sh_position + 1;
+	if (Sh_position == B_NR_ITEMS(PATH_H_PPARENT(tb->tb_path, h)))
+		return 0;
+	else
+		return Sh_position + 1;
 }
 
-
 #ifdef CONFIG_REISERFS_CHECK
 
-int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value);
-static void check_internal_node (struct super_block * s, struct buffer_head * bh, char * mes)
+int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value);
+static void check_internal_node(struct super_block *s, struct buffer_head *bh,
+				char *mes)
 {
-  struct disk_child * dc;
-  int i;
-
-  RFALSE( !bh, "PAP-12336: bh == 0");
-
-  if (!bh || !B_IS_IN_TREE (bh))
-    return;
- 
-  RFALSE( !buffer_dirty (bh) && 
-	  !(buffer_journaled(bh) || buffer_journal_dirty(bh)),
-	  "PAP-12337: buffer (%b) must be dirty", bh);
-  dc = B_N_CHILD (bh, 0);
-
-  for (i = 0; i <= B_NR_ITEMS (bh); i ++, dc ++) {
-    if (!is_reusable (s, dc_block_number(dc), 1) ) {
-      print_cur_tb (mes);
-      reiserfs_panic (s, "PAP-12338: check_internal_node: invalid child pointer %y in %b", dc, bh);
-    }
-  }
-}
+	struct disk_child *dc;
+	int i;
 
+	RFALSE(!bh, "PAP-12336: bh == 0");
 
-static int locked_or_not_in_tree (struct buffer_head * bh, char * which)
-{
-  if ( (!buffer_journal_prepared (bh) && buffer_locked (bh)) ||
-        !B_IS_IN_TREE (bh) ) {
-    reiserfs_warning (NULL, "vs-12339: locked_or_not_in_tree: %s (%b)",
-                      which, bh);
-    return 1;
-  } 
-  return 0;
-}
+	if (!bh || !B_IS_IN_TREE(bh))
+		return;
 
+	RFALSE(!buffer_dirty(bh) &&
+	       !(buffer_journaled(bh) || buffer_journal_dirty(bh)),
+	       "PAP-12337: buffer (%b) must be dirty", bh);
+	dc = B_N_CHILD(bh, 0);
 
-static int check_before_balancing (struct tree_balance * tb)
-{
-  int retval = 0;	
-
-  if ( cur_tb ) {
-    reiserfs_panic (tb->tb_sb, "vs-12335: check_before_balancing: "
-		    "suspect that schedule occurred based on cur_tb not being null at this point in code. "
-		    "do_balance cannot properly handle schedule occurring while it runs.");
-  }
-  
-  /* double check that buffers that we will modify are unlocked. (fix_nodes should already have
-     prepped all of these for us). */
-  if ( tb->lnum[0] ) {
-    retval |= locked_or_not_in_tree (tb->L[0], "L[0]");
-    retval |= locked_or_not_in_tree (tb->FL[0], "FL[0]");
-    retval |= locked_or_not_in_tree (tb->CFL[0], "CFL[0]");
-    check_leaf (tb->L[0]);
-  }
-  if ( tb->rnum[0] ) {
-    retval |= locked_or_not_in_tree (tb->R[0], "R[0]");
-    retval |= locked_or_not_in_tree (tb->FR[0], "FR[0]");
-    retval |= locked_or_not_in_tree (tb->CFR[0], "CFR[0]");
-    check_leaf (tb->R[0]);
-  }
-  retval |= locked_or_not_in_tree (PATH_PLAST_BUFFER (tb->tb_path), "S[0]");
-  check_leaf (PATH_PLAST_BUFFER (tb->tb_path));
-
-  return retval;
+	for (i = 0; i <= B_NR_ITEMS(bh); i++, dc++) {
+		if (!is_reusable(s, dc_block_number(dc), 1)) {
+			print_cur_tb(mes);
+			reiserfs_panic(s,
+				       "PAP-12338: check_internal_node: invalid child pointer %y in %b",
+				       dc, bh);
+		}
+	}
 }
 
+static int locked_or_not_in_tree(struct buffer_head *bh, char *which)
+{
+	if ((!buffer_journal_prepared(bh) && buffer_locked(bh)) ||
+	    !B_IS_IN_TREE(bh)) {
+		reiserfs_warning(NULL,
+				 "vs-12339: locked_or_not_in_tree: %s (%b)",
+				 which, bh);
+		return 1;
+	}
+	return 0;
+}
 
-static void check_after_balance_leaf (struct tree_balance * tb)
+static int check_before_balancing(struct tree_balance *tb)
 {
-    if (tb->lnum[0]) {
-	if (B_FREE_SPACE (tb->L[0]) != 
-	    MAX_CHILD_SIZE (tb->L[0]) - dc_size(B_N_CHILD (tb->FL[0], get_left_neighbor_position (tb, 0)))) {
-	    print_cur_tb ("12221");
-	    reiserfs_panic (tb->tb_sb, "PAP-12355: check_after_balance_leaf: shift to left was incorrect");
+	int retval = 0;
+
+	if (cur_tb) {
+		reiserfs_panic(tb->tb_sb, "vs-12335: check_before_balancing: "
+			       "suspect that schedule occurred based on cur_tb not being null at this point in code. "
+			       "do_balance cannot properly handle schedule occurring while it runs.");
 	}
-    }
-    if (tb->rnum[0]) {
-	if (B_FREE_SPACE (tb->R[0]) != 
-	    MAX_CHILD_SIZE (tb->R[0]) - dc_size(B_N_CHILD (tb->FR[0], get_right_neighbor_position (tb, 0)))) {
-	    print_cur_tb ("12222");
-	    reiserfs_panic (tb->tb_sb, "PAP-12360: check_after_balance_leaf: shift to right was incorrect");
+
+	/* double check that buffers that we will modify are unlocked. (fix_nodes should already have
+	   prepped all of these for us). */
+	if (tb->lnum[0]) {
+		retval |= locked_or_not_in_tree(tb->L[0], "L[0]");
+		retval |= locked_or_not_in_tree(tb->FL[0], "FL[0]");
+		retval |= locked_or_not_in_tree(tb->CFL[0], "CFL[0]");
+		check_leaf(tb->L[0]);
 	}
-    }
-    if (PATH_H_PBUFFER(tb->tb_path,1) &&
-	(B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) != 
-		    (MAX_CHILD_SIZE (PATH_H_PBUFFER(tb->tb_path,0)) -
-		    dc_size(B_N_CHILD (PATH_H_PBUFFER(tb->tb_path,1),
-		    PATH_H_POSITION (tb->tb_path, 1)))) )) {
-	int left = B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0));
-	int right = (MAX_CHILD_SIZE (PATH_H_PBUFFER(tb->tb_path,0)) -
-		    dc_size(B_N_CHILD (PATH_H_PBUFFER(tb->tb_path,1),
-			PATH_H_POSITION (tb->tb_path, 1))));
-	print_cur_tb ("12223");
-	reiserfs_warning (tb->tb_sb,
-	    "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; "
-    	    "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d",
-	    left,
-	    MAX_CHILD_SIZE (PATH_H_PBUFFER(tb->tb_path,0)),
-	    PATH_H_PBUFFER(tb->tb_path,1),
-	    PATH_H_POSITION (tb->tb_path, 1),
-	    dc_size(B_N_CHILD (PATH_H_PBUFFER(tb->tb_path,1), PATH_H_POSITION (tb->tb_path, 1 )) ),
-	    right );
-	reiserfs_panic (tb->tb_sb, "PAP-12365: check_after_balance_leaf: S is incorrect");
-    }
-}
+	if (tb->rnum[0]) {
+		retval |= locked_or_not_in_tree(tb->R[0], "R[0]");
+		retval |= locked_or_not_in_tree(tb->FR[0], "FR[0]");
+		retval |= locked_or_not_in_tree(tb->CFR[0], "CFR[0]");
+		check_leaf(tb->R[0]);
+	}
+	retval |= locked_or_not_in_tree(PATH_PLAST_BUFFER(tb->tb_path), "S[0]");
+	check_leaf(PATH_PLAST_BUFFER(tb->tb_path));
 
+	return retval;
+}
 
-static void check_leaf_level (struct tree_balance * tb)
+static void check_after_balance_leaf(struct tree_balance *tb)
 {
-  check_leaf (tb->L[0]);
-  check_leaf (tb->R[0]);
-  check_leaf (PATH_PLAST_BUFFER (tb->tb_path));
+	if (tb->lnum[0]) {
+		if (B_FREE_SPACE(tb->L[0]) !=
+		    MAX_CHILD_SIZE(tb->L[0]) -
+		    dc_size(B_N_CHILD
+			    (tb->FL[0], get_left_neighbor_position(tb, 0)))) {
+			print_cur_tb("12221");
+			reiserfs_panic(tb->tb_sb,
+				       "PAP-12355: check_after_balance_leaf: shift to left was incorrect");
+		}
+	}
+	if (tb->rnum[0]) {
+		if (B_FREE_SPACE(tb->R[0]) !=
+		    MAX_CHILD_SIZE(tb->R[0]) -
+		    dc_size(B_N_CHILD
+			    (tb->FR[0], get_right_neighbor_position(tb, 0)))) {
+			print_cur_tb("12222");
+			reiserfs_panic(tb->tb_sb,
+				       "PAP-12360: check_after_balance_leaf: shift to right was incorrect");
+		}
+	}
+	if (PATH_H_PBUFFER(tb->tb_path, 1) &&
+	    (B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0)) !=
+	     (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) -
+	      dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1),
+				PATH_H_POSITION(tb->tb_path, 1)))))) {
+		int left = B_FREE_SPACE(PATH_H_PBUFFER(tb->tb_path, 0));
+		int right = (MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)) -
+			     dc_size(B_N_CHILD(PATH_H_PBUFFER(tb->tb_path, 1),
+					       PATH_H_POSITION(tb->tb_path,
+							       1))));
+		print_cur_tb("12223");
+		reiserfs_warning(tb->tb_sb,
+				 "B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) = %d; "
+				 "MAX_CHILD_SIZE (%d) - dc_size( %y, %d ) [%d] = %d",
+				 left,
+				 MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, 0)),
+				 PATH_H_PBUFFER(tb->tb_path, 1),
+				 PATH_H_POSITION(tb->tb_path, 1),
+				 dc_size(B_N_CHILD
+					 (PATH_H_PBUFFER(tb->tb_path, 1),
+					  PATH_H_POSITION(tb->tb_path, 1))),
+				 right);
+		reiserfs_panic(tb->tb_sb,
+			       "PAP-12365: check_after_balance_leaf: S is incorrect");
+	}
 }
 
-static void check_internal_levels (struct tree_balance * tb)
+static void check_leaf_level(struct tree_balance *tb)
 {
-  int h;
+	check_leaf(tb->L[0]);
+	check_leaf(tb->R[0]);
+	check_leaf(PATH_PLAST_BUFFER(tb->tb_path));
+}
 
-  /* check all internal nodes */
-  for (h = 1; tb->insert_size[h]; h ++) {
-    check_internal_node (tb->tb_sb, PATH_H_PBUFFER (tb->tb_path, h), "BAD BUFFER ON PATH");
-    if (tb->lnum[h])
-      check_internal_node (tb->tb_sb, tb->L[h], "BAD L");
-    if (tb->rnum[h])
-      check_internal_node (tb->tb_sb, tb->R[h], "BAD R");
-  }
+static void check_internal_levels(struct tree_balance *tb)
+{
+	int h;
+
+	/* check all internal nodes */
+	for (h = 1; tb->insert_size[h]; h++) {
+		check_internal_node(tb->tb_sb, PATH_H_PBUFFER(tb->tb_path, h),
+				    "BAD BUFFER ON PATH");
+		if (tb->lnum[h])
+			check_internal_node(tb->tb_sb, tb->L[h], "BAD L");
+		if (tb->rnum[h])
+			check_internal_node(tb->tb_sb, tb->R[h], "BAD R");
+	}
 
 }
 
 #endif
 
-
-
-
-
-
 /* Now we have all of the buffers that must be used in balancing of
    the tree.  We rely on the assumption that schedule() will not occur
    while do_balance works. ( Only interrupt handlers are acceptable.)
@@ -1484,114 +2029,109 @@ static void check_internal_levels (struct tree_balance * tb)
 
 */
 
-static inline void do_balance_starts (struct tree_balance *tb)
+static inline void do_balance_starts(struct tree_balance *tb)
 {
-    /* use print_cur_tb() to see initial state of struct
-       tree_balance */
+	/* use print_cur_tb() to see initial state of struct
+	   tree_balance */
 
-    /* store_print_tb (tb); */
+	/* store_print_tb (tb); */
 
-    /* do not delete, just comment it out */
+	/* do not delete, just comment it out */
 /*    print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb, 
 	     "check");*/
-    RFALSE( check_before_balancing (tb), "PAP-12340: locked buffers in TB");
+	RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
 #ifdef CONFIG_REISERFS_CHECK
-    cur_tb = tb;
+	cur_tb = tb;
 #endif
 }
 
-
-static inline void do_balance_completed (struct tree_balance * tb)
+static inline void do_balance_completed(struct tree_balance *tb)
 {
-    
+
 #ifdef CONFIG_REISERFS_CHECK
-    check_leaf_level (tb);
-    check_internal_levels (tb);
-    cur_tb = NULL;
+	check_leaf_level(tb);
+	check_internal_levels(tb);
+	cur_tb = NULL;
 #endif
 
-    /* reiserfs_free_block is no longer schedule safe.  So, we need to
-    ** put the buffers we want freed on the thrown list during do_balance,
-    ** and then free them now
-    */
-
-    REISERFS_SB(tb->tb_sb)->s_do_balance ++;
+	/* reiserfs_free_block is no longer schedule safe.  So, we need to
+	 ** put the buffers we want freed on the thrown list during do_balance,
+	 ** and then free them now
+	 */
 
+	REISERFS_SB(tb->tb_sb)->s_do_balance++;
 
-    /* release all nodes hold to perform the balancing */
-    unfix_nodes(tb);
+	/* release all nodes hold to perform the balancing */
+	unfix_nodes(tb);
 
-    free_thrown(tb) ;
+	free_thrown(tb);
 }
 
+void do_balance(struct tree_balance *tb,	/* tree_balance structure */
+		struct item_head *ih,	/* item header of inserted item */
+		const char *body,	/* body  of inserted item or bytes to paste */
+		int flag)
+{				/* i - insert, d - delete
+				   c - cut, p - paste
+
+				   Cut means delete part of an item
+				   (includes removing an entry from a
+				   directory).
+
+				   Delete means delete whole item.
+
+				   Insert means add a new item into the
+				   tree.
+
+				   Paste means to append to the end of an
+				   existing file or to insert a directory
+				   entry.  */
+	int child_pos,		/* position of a child node in its parent */
+	 h;			/* level of the tree being processed */
+	struct item_head insert_key[2];	/* in our processing of one level
+					   we sometimes determine what
+					   must be inserted into the next
+					   higher level.  This insertion
+					   consists of a key or two keys
+					   and their corresponding
+					   pointers */
+	struct buffer_head *insert_ptr[2];	/* inserted node-ptrs for the next
+						   level */
+
+	tb->tb_mode = flag;
+	tb->need_balance_dirty = 0;
+
+	if (FILESYSTEM_CHANGED_TB(tb)) {
+		reiserfs_panic(tb->tb_sb,
+			       "clm-6000: do_balance, fs generation has changed\n");
+	}
+	/* if we have no real work to do  */
+	if (!tb->insert_size[0]) {
+		reiserfs_warning(tb->tb_sb,
+				 "PAP-12350: do_balance: insert_size == 0, mode == %c",
+				 flag);
+		unfix_nodes(tb);
+		return;
+	}
 
+	atomic_inc(&(fs_generation(tb->tb_sb)));
+	do_balance_starts(tb);
 
-
-
-void do_balance (struct tree_balance * tb, /* tree_balance structure */
-		 struct item_head * ih,	   /* item header of inserted item */
-		 const char * body,  /* body  of inserted item or bytes to paste */
-		 int flag)  /* i - insert, d - delete
-			       c - cut, p - paste
-						      
-			       Cut means delete part of an item
-			       (includes removing an entry from a
-			       directory).
-						      
-			       Delete means delete whole item.
-						      
-			       Insert means add a new item into the
-			       tree.
-						      						      
-			       Paste means to append to the end of an
-			       existing file or to insert a directory
-			       entry.  */
-{
-    int child_pos, /* position of a child node in its parent */
-	h;	   /* level of the tree being processed */
-    struct item_head insert_key[2]; /* in our processing of one level
-				       we sometimes determine what
-				       must be inserted into the next
-				       higher level.  This insertion
-				       consists of a key or two keys
-				       and their corresponding
-				       pointers */
-    struct buffer_head *insert_ptr[2]; /* inserted node-ptrs for the next
-					  level */
-
-    tb->tb_mode = flag;
-    tb->need_balance_dirty = 0;
-
-    if (FILESYSTEM_CHANGED_TB(tb)) {
-        reiserfs_panic(tb->tb_sb, "clm-6000: do_balance, fs generation has changed\n") ;
-    }
-    /* if we have no real work to do  */
-    if ( ! tb->insert_size[0] ) {
-	reiserfs_warning (tb->tb_sb,
-			  "PAP-12350: do_balance: insert_size == 0, mode == %c",
-			  flag);
-	unfix_nodes(tb);
-	return;
-    }
-
-    atomic_inc (&(fs_generation (tb->tb_sb)));
-    do_balance_starts (tb);
-    
 	/* balance leaf returns 0 except if combining L R and S into
 	   one node.  see balance_internal() for explanation of this
-	   line of code.*/
-	child_pos = PATH_H_B_ITEM_ORDER (tb->tb_path, 0) +
-	  balance_leaf (tb, ih, body, flag, insert_key, insert_ptr);
+	   line of code. */
+	child_pos = PATH_H_B_ITEM_ORDER(tb->tb_path, 0) +
+	    balance_leaf(tb, ih, body, flag, insert_key, insert_ptr);
 
 #ifdef CONFIG_REISERFS_CHECK
-    check_after_balance_leaf (tb);
+	check_after_balance_leaf(tb);
 #endif
 
-    /* Balance internal level of the tree. */
-    for ( h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++ )
-	child_pos = balance_internal (tb, h, child_pos, insert_key, insert_ptr);
-
+	/* Balance internal level of the tree. */
+	for (h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++)
+		child_pos =
+		    balance_internal(tb, h, child_pos, insert_key, insert_ptr);
 
-    do_balance_completed (tb);
+	do_balance_completed(tb);
 
 }
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 12e91209544e..c9f178fb494f 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -2,7 +2,6 @@
  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
  */
 
-
 #include <linux/time.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/reiserfs_acl.h>
@@ -31,82 +30,84 @@
 ** We use reiserfs_truncate_file to pack the tail, since it already has
 ** all the conditions coded.  
 */
-static int reiserfs_file_release (struct inode * inode, struct file * filp)
+static int reiserfs_file_release(struct inode *inode, struct file *filp)
 {
 
-    struct reiserfs_transaction_handle th ;
-    int err;
-    int jbegin_failure = 0;
+	struct reiserfs_transaction_handle th;
+	int err;
+	int jbegin_failure = 0;
 
-    if (!S_ISREG (inode->i_mode))
-	BUG ();
+	if (!S_ISREG(inode->i_mode))
+		BUG();
 
-    /* fast out for when nothing needs to be done */
-    if ((atomic_read(&inode->i_count) > 1 ||
-	!(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) || 
-         !tail_has_to_be_packed(inode))       && 
-	REISERFS_I(inode)->i_prealloc_count <= 0) {
-	return 0;
-    }    
-    
-    reiserfs_write_lock(inode->i_sb);
-    down (&inode->i_sem); 
-    /* freeing preallocation only involves relogging blocks that
-     * are already in the current transaction.  preallocation gets
-     * freed at the end of each transaction, so it is impossible for
-     * us to log any additional blocks (including quota blocks)
-     */
-    err = journal_begin(&th, inode->i_sb, 1);
-    if (err) {
-	/* uh oh, we can't allow the inode to go away while there
-	 * is still preallocation blocks pending.  Try to join the
-	 * aborted transaction
-	 */
-	jbegin_failure = err;
-	err = journal_join_abort(&th, inode->i_sb, 1);
+	/* fast out for when nothing needs to be done */
+	if ((atomic_read(&inode->i_count) > 1 ||
+	     !(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) ||
+	     !tail_has_to_be_packed(inode)) &&
+	    REISERFS_I(inode)->i_prealloc_count <= 0) {
+		return 0;
+	}
 
+	reiserfs_write_lock(inode->i_sb);
+	down(&inode->i_sem);
+	/* freeing preallocation only involves relogging blocks that
+	 * are already in the current transaction.  preallocation gets
+	 * freed at the end of each transaction, so it is impossible for
+	 * us to log any additional blocks (including quota blocks)
+	 */
+	err = journal_begin(&th, inode->i_sb, 1);
 	if (err) {
-	    /* hmpf, our choices here aren't good.  We can pin the inode
-	     * which will disallow unmount from every happening, we can
-	     * do nothing, which will corrupt random memory on unmount,
-	     * or we can forcibly remove the file from the preallocation
-	     * list, which will leak blocks on disk.  Lets pin the inode
-	     * and let the admin know what is going on.
-	     */
-	    igrab(inode);
-	    reiserfs_warning(inode->i_sb, "pinning inode %lu because the "
-	                     "preallocation can't be freed");
-	    goto out;
+		/* uh oh, we can't allow the inode to go away while there
+		 * is still preallocation blocks pending.  Try to join the
+		 * aborted transaction
+		 */
+		jbegin_failure = err;
+		err = journal_join_abort(&th, inode->i_sb, 1);
+
+		if (err) {
+			/* hmpf, our choices here aren't good.  We can pin the inode
+			 * which will disallow unmount from every happening, we can
+			 * do nothing, which will corrupt random memory on unmount,
+			 * or we can forcibly remove the file from the preallocation
+			 * list, which will leak blocks on disk.  Lets pin the inode
+			 * and let the admin know what is going on.
+			 */
+			igrab(inode);
+			reiserfs_warning(inode->i_sb,
+					 "pinning inode %lu because the "
+					 "preallocation can't be freed");
+			goto out;
+		}
 	}
-    }
-    reiserfs_update_inode_transaction(inode) ;
+	reiserfs_update_inode_transaction(inode);
 
 #ifdef REISERFS_PREALLOCATE
-    reiserfs_discard_prealloc (&th, inode);
+	reiserfs_discard_prealloc(&th, inode);
 #endif
-    err = journal_end(&th, inode->i_sb, 1);
-
-    /* copy back the error code from journal_begin */
-    if (!err)
-        err = jbegin_failure;
-
-    if (!err && atomic_read(&inode->i_count) <= 1 &&
-	(REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
-        tail_has_to_be_packed (inode)) {
-	/* if regular file is released by last holder and it has been
-	   appended (we append by unformatted node only) or its direct
-	   item(s) had to be converted, then it may have to be
-	   indirect2direct converted */
-	err = reiserfs_truncate_file(inode, 0) ;
-    }
-out:
-    up (&inode->i_sem); 
-    reiserfs_write_unlock(inode->i_sb);
-    return err;
+	err = journal_end(&th, inode->i_sb, 1);
+
+	/* copy back the error code from journal_begin */
+	if (!err)
+		err = jbegin_failure;
+
+	if (!err && atomic_read(&inode->i_count) <= 1 &&
+	    (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) &&
+	    tail_has_to_be_packed(inode)) {
+		/* if regular file is released by last holder and it has been
+		   appended (we append by unformatted node only) or its direct
+		   item(s) had to be converted, then it may have to be
+		   indirect2direct converted */
+		err = reiserfs_truncate_file(inode, 0);
+	}
+      out:
+	up(&inode->i_sem);
+	reiserfs_write_unlock(inode->i_sb);
+	return err;
 }
 
-static void reiserfs_vfs_truncate_file(struct inode *inode) {
-    reiserfs_truncate_file(inode, 1) ;
+static void reiserfs_vfs_truncate_file(struct inode *inode)
+{
+	reiserfs_truncate_file(inode, 1);
 }
 
 /* Sync a reiserfs file. */
@@ -116,26 +117,24 @@ static void reiserfs_vfs_truncate_file(struct inode *inode) {
  * be removed...
  */
 
-static int reiserfs_sync_file(
-			      struct file   * p_s_filp,
-			      struct dentry * p_s_dentry,
-			      int datasync
-			      ) {
-  struct inode * p_s_inode = p_s_dentry->d_inode;
-  int n_err;
-  int barrier_done;
-
-  if (!S_ISREG(p_s_inode->i_mode))
-      BUG ();
-  n_err = sync_mapping_buffers(p_s_inode->i_mapping) ;
-  reiserfs_write_lock(p_s_inode->i_sb);
-  barrier_done = reiserfs_commit_for_inode(p_s_inode);
-  reiserfs_write_unlock(p_s_inode->i_sb);
-  if (barrier_done != 1)
-      blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
-  if (barrier_done < 0)
-    return barrier_done;
-  return ( n_err < 0 ) ? -EIO : 0;
+static int reiserfs_sync_file(struct file *p_s_filp,
+			      struct dentry *p_s_dentry, int datasync)
+{
+	struct inode *p_s_inode = p_s_dentry->d_inode;
+	int n_err;
+	int barrier_done;
+
+	if (!S_ISREG(p_s_inode->i_mode))
+		BUG();
+	n_err = sync_mapping_buffers(p_s_inode->i_mapping);
+	reiserfs_write_lock(p_s_inode->i_sb);
+	barrier_done = reiserfs_commit_for_inode(p_s_inode);
+	reiserfs_write_unlock(p_s_inode->i_sb);
+	if (barrier_done != 1)
+		blkdev_issue_flush(p_s_inode->i_sb->s_bdev, NULL);
+	if (barrier_done < 0)
+		return barrier_done;
+	return (n_err < 0) ? -EIO : 0;
 }
 
 /* I really do not want to play with memory shortage right now, so
@@ -147,700 +146,797 @@ static int reiserfs_sync_file(
 /* Allocates blocks for a file to fulfil write request.
    Maps all unmapped but prepared pages from the list.
    Updates metadata with newly allocated blocknumbers as needed */
-static int reiserfs_allocate_blocks_for_region(
-				struct reiserfs_transaction_handle *th,
-				struct inode *inode, /* Inode we work with */
-				loff_t pos, /* Writing position */
-				int num_pages, /* number of pages write going
-						  to touch */
-				int write_bytes, /* amount of bytes to write */
-				struct page **prepared_pages, /* array of
-							         prepared pages
-							       */
-				int blocks_to_allocate /* Amount of blocks we
-							  need to allocate to
-							  fit the data into file
-							 */
-				)
+static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handle *th, struct inode *inode,	/* Inode we work with */
+					       loff_t pos,	/* Writing position */
+					       int num_pages,	/* number of pages write going
+								   to touch */
+					       int write_bytes,	/* amount of bytes to write */
+					       struct page **prepared_pages,	/* array of
+										   prepared pages
+										 */
+					       int blocks_to_allocate	/* Amount of blocks we
+									   need to allocate to
+									   fit the data into file
+									 */
+    )
 {
-    struct cpu_key key; // cpu key of item that we are going to deal with
-    struct item_head *ih; // pointer to item head that we are going to deal with
-    struct buffer_head *bh; // Buffer head that contains items that we are going to deal with
-    __le32 * item; // pointer to item we are going to deal with
-    INITIALIZE_PATH(path); // path to item, that we are going to deal with.
-    b_blocknr_t *allocated_blocks; // Pointer to a place where allocated blocknumbers would be stored.
-    reiserfs_blocknr_hint_t hint; // hint structure for block allocator.
-    size_t res; // return value of various functions that we call.
-    int curr_block; // current block used to keep track of unmapped blocks.
-    int i; // loop counter
-    int itempos; // position in item
-    unsigned int from = (pos & (PAGE_CACHE_SIZE - 1)); // writing position in
-						       // first page
-    unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1; /* last modified byte offset in last page */
-    __u64 hole_size ; // amount of blocks for a file hole, if it needed to be created.
-    int modifying_this_item = 0; // Flag for items traversal code to keep track
-				 // of the fact that we already prepared
-				 // current block for journal
-    int will_prealloc = 0;
-    RFALSE(!blocks_to_allocate, "green-9004: tried to allocate zero blocks?");
-
-    /* only preallocate if this is a small write */
-    if (REISERFS_I(inode)->i_prealloc_count ||
-       (!(write_bytes & (inode->i_sb->s_blocksize -1)) &&
-        blocks_to_allocate <
-        REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
-        will_prealloc = REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;
-
-    allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
-    					sizeof(b_blocknr_t), GFP_NOFS);
-
-    /* First we compose a key to point at the writing position, we want to do
-       that outside of any locking region. */
-    make_cpu_key (&key, inode, pos+1, TYPE_ANY, 3/*key length*/);
-
-    /* If we came here, it means we absolutely need to open a transaction,
-       since we need to allocate some blocks */
-    reiserfs_write_lock(inode->i_sb); // Journaling stuff and we need that.
-    res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb)); // Wish I know if this number enough
-    if (res)
-        goto error_exit;
-    reiserfs_update_inode_transaction(inode) ;
-
-    /* Look for the in-tree position of our write, need path for block allocator */
-    res = search_for_position_by_key(inode->i_sb, &key, &path);
-    if ( res == IO_ERROR ) {
-	res = -EIO;
-	goto error_exit;
-    }
-   
-    /* Allocate blocks */
-    /* First fill in "hint" structure for block allocator */
-    hint.th = th; // transaction handle.
-    hint.path = &path; // Path, so that block allocator can determine packing locality or whatever it needs to determine.
-    hint.inode = inode; // Inode is needed by block allocator too.
-    hint.search_start = 0; // We have no hint on where to search free blocks for block allocator.
-    hint.key = key.on_disk_key; // on disk key of file.
-    hint.block = inode->i_blocks>>(inode->i_sb->s_blocksize_bits-9); // Number of disk blocks this file occupies already.
-    hint.formatted_node = 0; // We are allocating blocks for unformatted node.
-    hint.preallocate = will_prealloc;
-
-    /* Call block allocator to allocate blocks */
-    res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate);
-    if ( res != CARRY_ON ) {
-	if ( res == NO_DISK_SPACE ) {
-	    /* We flush the transaction in case of no space. This way some
-	       blocks might become free */
-	    SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
-	    res = restart_transaction(th, inode, &path);
-            if (res)
-                goto error_exit;
-
-	    /* We might have scheduled, so search again */
-	    res = search_for_position_by_key(inode->i_sb, &key, &path);
-	    if ( res == IO_ERROR ) {
-		res = -EIO;
+	struct cpu_key key;	// cpu key of item that we are going to deal with
+	struct item_head *ih;	// pointer to item head that we are going to deal with
+	struct buffer_head *bh;	// Buffer head that contains items that we are going to deal with
+	__le32 *item;		// pointer to item we are going to deal with
+	INITIALIZE_PATH(path);	// path to item, that we are going to deal with.
+	b_blocknr_t *allocated_blocks;	// Pointer to a place where allocated blocknumbers would be stored.
+	reiserfs_blocknr_hint_t hint;	// hint structure for block allocator.
+	size_t res;		// return value of various functions that we call.
+	int curr_block;		// current block used to keep track of unmapped blocks.
+	int i;			// loop counter
+	int itempos;		// position in item
+	unsigned int from = (pos & (PAGE_CACHE_SIZE - 1));	// writing position in
+	// first page
+	unsigned int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;	/* last modified byte offset in last page */
+	__u64 hole_size;	// amount of blocks for a file hole, if it needed to be created.
+	int modifying_this_item = 0;	// Flag for items traversal code to keep track
+	// of the fact that we already prepared
+	// current block for journal
+	int will_prealloc = 0;
+	RFALSE(!blocks_to_allocate,
+	       "green-9004: tried to allocate zero blocks?");
+
+	/* only preallocate if this is a small write */
+	if (REISERFS_I(inode)->i_prealloc_count ||
+	    (!(write_bytes & (inode->i_sb->s_blocksize - 1)) &&
+	     blocks_to_allocate <
+	     REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
+		will_prealloc =
+		    REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize;
+
+	allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) *
+				   sizeof(b_blocknr_t), GFP_NOFS);
+
+	/* First we compose a key to point at the writing position, we want to do
+	   that outside of any locking region. */
+	make_cpu_key(&key, inode, pos + 1, TYPE_ANY, 3 /*key length */ );
+
+	/* If we came here, it means we absolutely need to open a transaction,
+	   since we need to allocate some blocks */
+	reiserfs_write_lock(inode->i_sb);	// Journaling stuff and we need that.
+	res = journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));	// Wish I know if this number enough
+	if (res)
 		goto error_exit;
-	    }
+	reiserfs_update_inode_transaction(inode);
 
-	    /* update changed info for hint structure. */
-	    res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate);
-	    if ( res != CARRY_ON ) {
-		res = -ENOSPC; 
-		pathrelse(&path);
+	/* Look for the in-tree position of our write, need path for block allocator */
+	res = search_for_position_by_key(inode->i_sb, &key, &path);
+	if (res == IO_ERROR) {
+		res = -EIO;
 		goto error_exit;
-	    }
-	} else {
-	    res = -ENOSPC;
-	    pathrelse(&path);
-	    goto error_exit;
 	}
-    }
 
-#ifdef __BIG_ENDIAN
-        // Too bad, I have not found any way to convert a given region from
-        // cpu format to little endian format
-    {
-        int i;
-        for ( i = 0; i < blocks_to_allocate ; i++)
-            allocated_blocks[i]=cpu_to_le32(allocated_blocks[i]);
-    }
-#endif
-
-    /* Blocks allocating well might have scheduled and tree might have changed,
-       let's search the tree again */
-    /* find where in the tree our write should go */
-    res = search_for_position_by_key(inode->i_sb, &key, &path);
-    if ( res == IO_ERROR ) {
-	res = -EIO;
-	goto error_exit_free_blocks;
-    }
-
-    bh = get_last_bh( &path ); // Get a bufferhead for last element in path.
-    ih = get_ih( &path );      // Get a pointer to last item head in path.
-    item = get_item( &path );  // Get a pointer to last item in path
-
-    /* Let's see what we have found */
-    if ( res != POSITION_FOUND ) { /* position not found, this means that we
-				      might need to append file with holes
-				      first */
-	// Since we are writing past the file's end, we need to find out if
-	// there is a hole that needs to be inserted before our writing
-	// position, and how many blocks it is going to cover (we need to
-	//  populate pointers to file blocks representing the hole with zeros)
+	/* Allocate blocks */
+	/* First fill in "hint" structure for block allocator */
+	hint.th = th;		// transaction handle.
+	hint.path = &path;	// Path, so that block allocator can determine packing locality or whatever it needs to determine.
+	hint.inode = inode;	// Inode is needed by block allocator too.
+	hint.search_start = 0;	// We have no hint on where to search free blocks for block allocator.
+	hint.key = key.on_disk_key;	// on disk key of file.
+	hint.block = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);	// Number of disk blocks this file occupies already.
+	hint.formatted_node = 0;	// We are allocating blocks for unformatted node.
+	hint.preallocate = will_prealloc;
+
+	/* Call block allocator to allocate blocks */
+	res =
+	    reiserfs_allocate_blocknrs(&hint, allocated_blocks,
+				       blocks_to_allocate, blocks_to_allocate);
+	if (res != CARRY_ON) {
+		if (res == NO_DISK_SPACE) {
+			/* We flush the transaction in case of no space. This way some
+			   blocks might become free */
+			SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
+			res = restart_transaction(th, inode, &path);
+			if (res)
+				goto error_exit;
+
+			/* We might have scheduled, so search again */
+			res =
+			    search_for_position_by_key(inode->i_sb, &key,
+						       &path);
+			if (res == IO_ERROR) {
+				res = -EIO;
+				goto error_exit;
+			}
 
+			/* update changed info for hint structure. */
+			res =
+			    reiserfs_allocate_blocknrs(&hint, allocated_blocks,
+						       blocks_to_allocate,
+						       blocks_to_allocate);
+			if (res != CARRY_ON) {
+				res = -ENOSPC;
+				pathrelse(&path);
+				goto error_exit;
+			}
+		} else {
+			res = -ENOSPC;
+			pathrelse(&path);
+			goto error_exit;
+		}
+	}
+#ifdef __BIG_ENDIAN
+	// Too bad, I have not found any way to convert a given region from
+	// cpu format to little endian format
 	{
-	    int item_offset = 1;
-	    /*
-	     * if ih is stat data, its offset is 0 and we don't want to
-	     * add 1 to pos in the hole_size calculation
-	     */
-	    if (is_statdata_le_ih(ih))
-	        item_offset = 0;
-	    hole_size = (pos + item_offset -
-	            (le_key_k_offset( get_inode_item_key_version(inode),
-		    &(ih->ih_key)) +
-		    op_bytes_number(ih, inode->i_sb->s_blocksize))) >>
-		    inode->i_sb->s_blocksize_bits;
+		int i;
+		for (i = 0; i < blocks_to_allocate; i++)
+			allocated_blocks[i] = cpu_to_le32(allocated_blocks[i]);
 	}
+#endif
 
-	if ( hole_size > 0 ) {
-	    int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE ); // How much data to insert first time.
-	    /* area filled with zeroes, to supply as list of zero blocknumbers
-	       We allocate it outside of loop just in case loop would spin for
-	       several iterations. */
-	    char *zeros = kmalloc(to_paste*UNFM_P_SIZE, GFP_ATOMIC); // We cannot insert more than MAX_ITEM_LEN bytes anyway.
-	    if ( !zeros ) {
-		res = -ENOMEM;
+	/* Blocks allocating well might have scheduled and tree might have changed,
+	   let's search the tree again */
+	/* find where in the tree our write should go */
+	res = search_for_position_by_key(inode->i_sb, &key, &path);
+	if (res == IO_ERROR) {
+		res = -EIO;
 		goto error_exit_free_blocks;
-	    }
-	    memset ( zeros, 0, to_paste*UNFM_P_SIZE);
-	    do {
-		to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE );
-		if ( is_indirect_le_ih(ih) ) {
-		    /* Ok, there is existing indirect item already. Need to append it */
-		    /* Calculate position past inserted item */
-		    make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
-		    res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)zeros, UNFM_P_SIZE*to_paste);
-		    if ( res ) {
-			kfree(zeros);
-			goto error_exit_free_blocks;
-		    }
-		} else if ( is_statdata_le_ih(ih) ) {
-		    /* No existing item, create it */
-		    /* item head for new item */
-		    struct item_head ins_ih;
-
-		    /* create a key for our new item */
-		    make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3);
-
-		    /* Create new item head for our new item */
-		    make_le_item_head (&ins_ih, &key, key.version, 1,
-				       TYPE_INDIRECT, to_paste*UNFM_P_SIZE,
-				       0 /* free space */);
-
-		    /* Find where such item should live in the tree */
-		    res = search_item (inode->i_sb, &key, &path);
-		    if ( res != ITEM_NOT_FOUND ) {
-			/* item should not exist, otherwise we have error */
-			if ( res != -ENOSPC ) {
-			    reiserfs_warning (inode->i_sb,
-				"green-9008: search_by_key (%K) returned %d",
-					      &key, res);
+	}
+
+	bh = get_last_bh(&path);	// Get a bufferhead for last element in path.
+	ih = get_ih(&path);	// Get a pointer to last item head in path.
+	item = get_item(&path);	// Get a pointer to last item in path
+
+	/* Let's see what we have found */
+	if (res != POSITION_FOUND) {	/* position not found, this means that we
+					   might need to append file with holes
+					   first */
+		// Since we are writing past the file's end, we need to find out if
+		// there is a hole that needs to be inserted before our writing
+		// position, and how many blocks it is going to cover (we need to
+		//  populate pointers to file blocks representing the hole with zeros)
+
+		{
+			int item_offset = 1;
+			/*
+			 * if ih is stat data, its offset is 0 and we don't want to
+			 * add 1 to pos in the hole_size calculation
+			 */
+			if (is_statdata_le_ih(ih))
+				item_offset = 0;
+			hole_size = (pos + item_offset -
+				     (le_key_k_offset
+				      (get_inode_item_key_version(inode),
+				       &(ih->ih_key)) + op_bytes_number(ih,
+									inode->
+									i_sb->
+									s_blocksize)))
+			    >> inode->i_sb->s_blocksize_bits;
+		}
+
+		if (hole_size > 0) {
+			int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize) / UNFM_P_SIZE);	// How much data to insert first time.
+			/* area filled with zeroes, to supply as list of zero blocknumbers
+			   We allocate it outside of loop just in case loop would spin for
+			   several iterations. */
+			char *zeros = kmalloc(to_paste * UNFM_P_SIZE, GFP_ATOMIC);	// We cannot insert more than MAX_ITEM_LEN bytes anyway.
+			if (!zeros) {
+				res = -ENOMEM;
+				goto error_exit_free_blocks;
 			}
-			res = -EIO;
-		        kfree(zeros);
-			goto error_exit_free_blocks;
-		    }
-		    res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)zeros);
-		} else {
-		    reiserfs_panic(inode->i_sb, "green-9011: Unexpected key type %K\n", &key);
+			memset(zeros, 0, to_paste * UNFM_P_SIZE);
+			do {
+				to_paste =
+				    min_t(__u64, hole_size,
+					  MAX_ITEM_LEN(inode->i_sb->
+						       s_blocksize) /
+					  UNFM_P_SIZE);
+				if (is_indirect_le_ih(ih)) {
+					/* Ok, there is existing indirect item already. Need to append it */
+					/* Calculate position past inserted item */
+					make_cpu_key(&key, inode,
+						     le_key_k_offset
+						     (get_inode_item_key_version
+						      (inode),
+						      &(ih->ih_key)) +
+						     op_bytes_number(ih,
+								     inode->
+								     i_sb->
+								     s_blocksize),
+						     TYPE_INDIRECT, 3);
+					res =
+					    reiserfs_paste_into_item(th, &path,
+								     &key,
+								     inode,
+								     (char *)
+								     zeros,
+								     UNFM_P_SIZE
+								     *
+								     to_paste);
+					if (res) {
+						kfree(zeros);
+						goto error_exit_free_blocks;
+					}
+				} else if (is_statdata_le_ih(ih)) {
+					/* No existing item, create it */
+					/* item head for new item */
+					struct item_head ins_ih;
+
+					/* create a key for our new item */
+					make_cpu_key(&key, inode, 1,
+						     TYPE_INDIRECT, 3);
+
+					/* Create new item head for our new item */
+					make_le_item_head(&ins_ih, &key,
+							  key.version, 1,
+							  TYPE_INDIRECT,
+							  to_paste *
+							  UNFM_P_SIZE,
+							  0 /* free space */ );
+
+					/* Find where such item should live in the tree */
+					res =
+					    search_item(inode->i_sb, &key,
+							&path);
+					if (res != ITEM_NOT_FOUND) {
+						/* item should not exist, otherwise we have error */
+						if (res != -ENOSPC) {
+							reiserfs_warning(inode->
+									 i_sb,
+									 "green-9008: search_by_key (%K) returned %d",
+									 &key,
+									 res);
+						}
+						res = -EIO;
+						kfree(zeros);
+						goto error_exit_free_blocks;
+					}
+					res =
+					    reiserfs_insert_item(th, &path,
+								 &key, &ins_ih,
+								 inode,
+								 (char *)zeros);
+				} else {
+					reiserfs_panic(inode->i_sb,
+						       "green-9011: Unexpected key type %K\n",
+						       &key);
+				}
+				if (res) {
+					kfree(zeros);
+					goto error_exit_free_blocks;
+				}
+				/* Now we want to check if transaction is too full, and if it is
+				   we restart it. This will also free the path. */
+				if (journal_transaction_should_end
+				    (th, th->t_blocks_allocated)) {
+					res =
+					    restart_transaction(th, inode,
+								&path);
+					if (res) {
+						pathrelse(&path);
+						kfree(zeros);
+						goto error_exit;
+					}
+				}
+
+				/* Well, need to recalculate path and stuff */
+				set_cpu_key_k_offset(&key,
+						     cpu_key_k_offset(&key) +
+						     (to_paste << inode->
+						      i_blkbits));
+				res =
+				    search_for_position_by_key(inode->i_sb,
+							       &key, &path);
+				if (res == IO_ERROR) {
+					res = -EIO;
+					kfree(zeros);
+					goto error_exit_free_blocks;
+				}
+				bh = get_last_bh(&path);
+				ih = get_ih(&path);
+				item = get_item(&path);
+				hole_size -= to_paste;
+			} while (hole_size);
+			kfree(zeros);
 		}
-		if ( res ) {
-		    kfree(zeros);
-		    goto error_exit_free_blocks;
+	}
+	// Go through existing indirect items first
+	// replace all zeroes with blocknumbers from list
+	// Note that if no corresponding item was found, by previous search,
+	// it means there are no existing in-tree representation for file area
+	// we are going to overwrite, so there is nothing to scan through for holes.
+	for (curr_block = 0, itempos = path.pos_in_item;
+	     curr_block < blocks_to_allocate && res == POSITION_FOUND;) {
+	      retry:
+
+		if (itempos >= ih_item_len(ih) / UNFM_P_SIZE) {
+			/* We run out of data in this indirect item, let's look for another
+			   one. */
+			/* First if we are already modifying current item, log it */
+			if (modifying_this_item) {
+				journal_mark_dirty(th, inode->i_sb, bh);
+				modifying_this_item = 0;
+			}
+			/* Then set the key to look for a new indirect item (offset of old
+			   item is added to old item length) */
+			set_cpu_key_k_offset(&key,
+					     le_key_k_offset
+					     (get_inode_item_key_version(inode),
+					      &(ih->ih_key)) +
+					     op_bytes_number(ih,
+							     inode->i_sb->
+							     s_blocksize));
+			/* Search for position of new key in the tree. */
+			res =
+			    search_for_position_by_key(inode->i_sb, &key,
+						       &path);
+			if (res == IO_ERROR) {
+				res = -EIO;
+				goto error_exit_free_blocks;
+			}
+			bh = get_last_bh(&path);
+			ih = get_ih(&path);
+			item = get_item(&path);
+			itempos = path.pos_in_item;
+			continue;	// loop to check all kinds of conditions and so on.
 		}
-		/* Now we want to check if transaction is too full, and if it is
-		   we restart it. This will also free the path. */
-		if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
-		    res = restart_transaction(th, inode, &path);
-                    if (res) {
-                        pathrelse (&path);
-                        kfree(zeros);
-                        goto error_exit;
-                    }
-                }
-
-		/* Well, need to recalculate path and stuff */
-		set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + (to_paste << inode->i_blkbits));
-		res = search_for_position_by_key(inode->i_sb, &key, &path);
-		if ( res == IO_ERROR ) {
-		    res = -EIO;
-		    kfree(zeros);
-		    goto error_exit_free_blocks;
+		/* Ok, we have correct position in item now, so let's see if it is
+		   representing file hole (blocknumber is zero) and fill it if needed */
+		if (!item[itempos]) {
+			/* Ok, a hole. Now we need to check if we already prepared this
+			   block to be journaled */
+			while (!modifying_this_item) {	// loop until succeed
+				/* Well, this item is not journaled yet, so we must prepare
+				   it for journal first, before we can change it */
+				struct item_head tmp_ih;	// We copy item head of found item,
+				// here to detect if fs changed under
+				// us while we were preparing for
+				// journal.
+				int fs_gen;	// We store fs generation here to find if someone
+				// changes fs under our feet
+
+				copy_item_head(&tmp_ih, ih);	// Remember itemhead
+				fs_gen = get_generation(inode->i_sb);	// remember fs generation
+				reiserfs_prepare_for_journal(inode->i_sb, bh, 1);	// Prepare a buffer within which indirect item is stored for changing.
+				if (fs_changed(fs_gen, inode->i_sb)
+				    && item_moved(&tmp_ih, &path)) {
+					// Sigh, fs was changed under us, we need to look for new
+					// location of item we are working with
+
+					/* unmark prepared area as journaled and search for its
+					   new position */
+					reiserfs_restore_prepared_buffer(inode->
+									 i_sb,
+									 bh);
+					res =
+					    search_for_position_by_key(inode->
+								       i_sb,
+								       &key,
+								       &path);
+					if (res == IO_ERROR) {
+						res = -EIO;
+						goto error_exit_free_blocks;
+					}
+					bh = get_last_bh(&path);
+					ih = get_ih(&path);
+					item = get_item(&path);
+					itempos = path.pos_in_item;
+					goto retry;
+				}
+				modifying_this_item = 1;
+			}
+			item[itempos] = allocated_blocks[curr_block];	// Assign new block
+			curr_block++;
 		}
-		bh=get_last_bh(&path);
-		ih=get_ih(&path);
-		item = get_item(&path);
-		hole_size -= to_paste;
-	    } while ( hole_size );
-	    kfree(zeros);
+		itempos++;
 	}
-    }
-
-    // Go through existing indirect items first
-    // replace all zeroes with blocknumbers from list
-    // Note that if no corresponding item was found, by previous search,
-    // it means there are no existing in-tree representation for file area
-    // we are going to overwrite, so there is nothing to scan through for holes.
-    for ( curr_block = 0, itempos = path.pos_in_item ; curr_block < blocks_to_allocate && res == POSITION_FOUND ; ) {
-retry:
-
-	if ( itempos >= ih_item_len(ih)/UNFM_P_SIZE ) {
-	    /* We run out of data in this indirect item, let's look for another
-	       one. */
-	    /* First if we are already modifying current item, log it */
-	    if ( modifying_this_item ) {
-		journal_mark_dirty (th, inode->i_sb, bh);
-		modifying_this_item = 0;
-	    }
-	    /* Then set the key to look for a new indirect item (offset of old
-	       item is added to old item length */
-	    set_cpu_key_k_offset( &key, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize));
-	    /* Search ofor position of new key in the tree. */
-	    res = search_for_position_by_key(inode->i_sb, &key, &path);
-	    if ( res == IO_ERROR) {
-		res = -EIO;
-		goto error_exit_free_blocks;
-	    }
-	    bh=get_last_bh(&path);
-	    ih=get_ih(&path);
-	    item = get_item(&path);
-	    itempos = path.pos_in_item;
-	    continue; // loop to check all kinds of conditions and so on.
+
+	if (modifying_this_item) {	// We need to log last-accessed block, if it
+		// was modified, but not logged yet.
+		journal_mark_dirty(th, inode->i_sb, bh);
 	}
-	/* Ok, we have correct position in item now, so let's see if it is
-	   representing file hole (blocknumber is zero) and fill it if needed */
-	if ( !item[itempos] ) {
-	    /* Ok, a hole. Now we need to check if we already prepared this
-	       block to be journaled */
-	    while ( !modifying_this_item ) { // loop until succeed
-		/* Well, this item is not journaled yet, so we must prepare
-		   it for journal first, before we can change it */
-		struct item_head tmp_ih; // We copy item head of found item,
-					 // here to detect if fs changed under
-					 // us while we were preparing for
-					 // journal.
-		int fs_gen; // We store fs generation here to find if someone
-			    // changes fs under our feet
-
-		copy_item_head (&tmp_ih, ih); // Remember itemhead
-		fs_gen = get_generation (inode->i_sb); // remember fs generation
-		reiserfs_prepare_for_journal(inode->i_sb, bh, 1); // Prepare a buffer within which indirect item is stored for changing.
-		if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
-		    // Sigh, fs was changed under us, we need to look for new
-		    // location of item we are working with
-
-		    /* unmark prepaerd area as journaled and search for it's
-		       new position */
-		    reiserfs_restore_prepared_buffer(inode->i_sb, bh);
-		    res = search_for_position_by_key(inode->i_sb, &key, &path);
-		    if ( res == IO_ERROR) {
-			res = -EIO;
-			goto error_exit_free_blocks;
-		    }
-		    bh=get_last_bh(&path);
-		    ih=get_ih(&path);
-		    item = get_item(&path);
-		    itempos = path.pos_in_item;
-		    goto retry;
+
+	if (curr_block < blocks_to_allocate) {
+		// Oh, well need to append to indirect item, or to create indirect item
+		// if there weren't any
+		if (is_indirect_le_ih(ih)) {
+			// Existing indirect item - append. First calculate key for append
+			// position. We do not need to recalculate path as it should
+			// already point to correct place.
+			make_cpu_key(&key, inode,
+				     le_key_k_offset(get_inode_item_key_version
+						     (inode),
+						     &(ih->ih_key)) +
+				     op_bytes_number(ih,
+						     inode->i_sb->s_blocksize),
+				     TYPE_INDIRECT, 3);
+			res =
+			    reiserfs_paste_into_item(th, &path, &key, inode,
+						     (char *)(allocated_blocks +
+							      curr_block),
+						     UNFM_P_SIZE *
+						     (blocks_to_allocate -
+						      curr_block));
+			if (res) {
+				goto error_exit_free_blocks;
+			}
+		} else if (is_statdata_le_ih(ih)) {
+			// Last found item was statdata. That means we need to create indirect item.
+			struct item_head ins_ih;	/* itemhead for new item */
+
+			/* create a key for our new item */
+			make_cpu_key(&key, inode, 1, TYPE_INDIRECT, 3);	// Position one,
+			// because that's
+			// where first
+			// indirect item
+			// begins
+			/* Create new item head for our new item */
+			make_le_item_head(&ins_ih, &key, key.version, 1,
+					  TYPE_INDIRECT,
+					  (blocks_to_allocate -
+					   curr_block) * UNFM_P_SIZE,
+					  0 /* free space */ );
+			/* Find where such item should live in the tree */
+			res = search_item(inode->i_sb, &key, &path);
+			if (res != ITEM_NOT_FOUND) {
+				/* Well, if we have found such item already, or some error
+				   occurred, we need to warn user and return error */
+				if (res != -ENOSPC) {
+					reiserfs_warning(inode->i_sb,
+							 "green-9009: search_by_key (%K) "
+							 "returned %d", &key,
+							 res);
+				}
+				res = -EIO;
+				goto error_exit_free_blocks;
+			}
+			/* Insert item into the tree with the data as its body */
+			res =
+			    reiserfs_insert_item(th, &path, &key, &ins_ih,
+						 inode,
+						 (char *)(allocated_blocks +
+							  curr_block));
+		} else {
+			reiserfs_panic(inode->i_sb,
+				       "green-9010: unexpected item type for key %K\n",
+				       &key);
 		}
-		modifying_this_item = 1;
-	    }
-	    item[itempos] = allocated_blocks[curr_block]; // Assign new block
-	    curr_block++;
 	}
-	itempos++;
-    }
-
-    if ( modifying_this_item ) { // We need to log last-accessed block, if it
-				 // was modified, but not logged yet.
-	journal_mark_dirty (th, inode->i_sb, bh);
-    }
-
-    if ( curr_block < blocks_to_allocate ) {
-	// Oh, well need to append to indirect item, or to create indirect item
-	// if there weren't any
-	if ( is_indirect_le_ih(ih) ) {
-	    // Existing indirect item - append. First calculate key for append
-	    // position. We do not need to recalculate path as it should
-	    // already point to correct place.
-	    make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
-	    res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block));
-	    if ( res ) {
-		goto error_exit_free_blocks;
-	    }
-	} else if (is_statdata_le_ih(ih) ) {
-	    // Last found item was statdata. That means we need to create indirect item.
-	    struct item_head ins_ih; /* itemhead for new item */
-
-	    /* create a key for our new item */
-	    make_cpu_key( &key, inode, 1, TYPE_INDIRECT, 3); // Position one,
-							    // because that's
-							    // where first
-							    // indirect item
-							    // begins
-	    /* Create new item head for our new item */
-	    make_le_item_head (&ins_ih, &key, key.version, 1, TYPE_INDIRECT,
-			       (blocks_to_allocate-curr_block)*UNFM_P_SIZE,
-			       0 /* free space */);
-	    /* Find where such item should live in the tree */
-	    res = search_item (inode->i_sb, &key, &path);
-	    if ( res != ITEM_NOT_FOUND ) {
-		/* Well, if we have found such item already, or some error
-		   occured, we need to warn user and return error */
-		if ( res != -ENOSPC ) {
-		    reiserfs_warning (inode->i_sb,
-				      "green-9009: search_by_key (%K) "
-				      "returned %d", &key, res);
+	// the caller is responsible for closing the transaction
+	// unless we return an error, they are also responsible for logging
+	// the inode.
+	//
+	pathrelse(&path);
+	/*
+	 * cleanup preallocation from previous writes
+	 * if this is a partial block write
+	 */
+	if (write_bytes & (inode->i_sb->s_blocksize - 1))
+		reiserfs_discard_prealloc(th, inode);
+	reiserfs_write_unlock(inode->i_sb);
+
+	// go through all the pages/buffers and map the buffers to newly allocated
+	// blocks (so that system knows where to write these pages later).
+	curr_block = 0;
+	for (i = 0; i < num_pages; i++) {
+		struct page *page = prepared_pages[i];	//current page
+		struct buffer_head *head = page_buffers(page);	// first buffer for a page
+		int block_start, block_end;	// in-page offsets for buffers.
+
+		if (!page_buffers(page))
+			reiserfs_panic(inode->i_sb,
+				       "green-9005: No buffers for prepared page???");
+
+		/* For each buffer in page */
+		for (bh = head, block_start = 0; bh != head || !block_start;
+		     block_start = block_end, bh = bh->b_this_page) {
+			if (!bh)
+				reiserfs_panic(inode->i_sb,
+					       "green-9006: Allocated but absent buffer for a page?");
+			block_end = block_start + inode->i_sb->s_blocksize;
+			if (i == 0 && block_end <= from)
+				/* if this buffer is before requested data to map, skip it */
+				continue;
+			if (i == num_pages - 1 && block_start >= to)
+				/* If this buffer is after requested data to map, abort
+				   processing of current page */
+				break;
+
+			if (!buffer_mapped(bh)) {	// Ok, unmapped buffer, need to map it
+				map_bh(bh, inode->i_sb,
+				       le32_to_cpu(allocated_blocks
+						   [curr_block]));
+				curr_block++;
+				set_buffer_new(bh);
+			}
 		}
-		res = -EIO;
-		goto error_exit_free_blocks;
-	    }
-	    /* Insert item into the tree with the data as its body */
-	    res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)(allocated_blocks+curr_block));
-	} else {
-	    reiserfs_panic(inode->i_sb, "green-9010: unexpected item type for key %K\n",&key);
-	}
-    }
-
-    // the caller is responsible for closing the transaction
-    // unless we return an error, they are also responsible for logging
-    // the inode.
-    //
-    pathrelse(&path);
-    /*
-     * cleanup prellocation from previous writes
-     * if this is a partial block write
-     */
-    if (write_bytes & (inode->i_sb->s_blocksize -1))
-        reiserfs_discard_prealloc(th, inode);
-    reiserfs_write_unlock(inode->i_sb);
-
-    // go through all the pages/buffers and map the buffers to newly allocated
-    // blocks (so that system knows where to write these pages later).
-    curr_block = 0;
-    for ( i = 0; i < num_pages ; i++ ) {
-	struct page *page=prepared_pages[i]; //current page
-	struct buffer_head *head = page_buffers(page);// first buffer for a page
-	int block_start, block_end; // in-page offsets for buffers.
-
-	if (!page_buffers(page))
-	    reiserfs_panic(inode->i_sb, "green-9005: No buffers for prepared page???");
-
-	/* For each buffer in page */
-	for(bh = head, block_start = 0; bh != head || !block_start;
-	    block_start=block_end, bh = bh->b_this_page) {
-	    if (!bh)
-		reiserfs_panic(inode->i_sb, "green-9006: Allocated but absent buffer for a page?");
-	    block_end = block_start+inode->i_sb->s_blocksize;
-	    if (i == 0 && block_end <= from )
-		/* if this buffer is before requested data to map, skip it */
-		continue;
-	    if (i == num_pages - 1 && block_start >= to)
-		/* If this buffer is after requested data to map, abort
-		   processing of current page */
-		break;
-
-	    if ( !buffer_mapped(bh) ) { // Ok, unmapped buffer, need to map it
-		map_bh( bh, inode->i_sb, le32_to_cpu(allocated_blocks[curr_block]));
-		curr_block++;
-		set_buffer_new(bh);
-	    }
 	}
-    }
 
-    RFALSE( curr_block > blocks_to_allocate, "green-9007: Used too many blocks? weird");
+	RFALSE(curr_block > blocks_to_allocate,
+	       "green-9007: Used too many blocks? weird");
 
-    kfree(allocated_blocks);
-    return 0;
+	kfree(allocated_blocks);
+	return 0;
 
 // Need to deal with transaction here.
-error_exit_free_blocks:
-    pathrelse(&path);
-    // free blocks
-    for( i = 0; i < blocks_to_allocate; i++ )
-	reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]), 1);
-
-error_exit:
-    if (th->t_trans_id) {
-        int err;
-        // update any changes we made to blk count
-        reiserfs_update_sd(th, inode);
-        err = journal_end(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));
-        if (err)
-            res = err;
-    }
-    reiserfs_write_unlock(inode->i_sb);
-    kfree(allocated_blocks);
-
-    return res;
+      error_exit_free_blocks:
+	pathrelse(&path);
+	// free blocks
+	for (i = 0; i < blocks_to_allocate; i++)
+		reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]),
+				    1);
+
+      error_exit:
+	if (th->t_trans_id) {
+		int err;
+		// update any changes we made to blk count
+		reiserfs_update_sd(th, inode);
+		err =
+		    journal_end(th, inode->i_sb,
+				JOURNAL_PER_BALANCE_CNT * 3 + 1 +
+				2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb));
+		if (err)
+			res = err;
+	}
+	reiserfs_write_unlock(inode->i_sb);
+	kfree(allocated_blocks);
+
+	return res;
 }
 
 /* Unlock pages prepared by reiserfs_prepare_file_region_for_write */
-static void reiserfs_unprepare_pages(struct page **prepared_pages, /* list of locked pages */
-			      size_t num_pages /* amount of pages */) {
-    int i; // loop counter
+static void reiserfs_unprepare_pages(struct page **prepared_pages,	/* list of locked pages */
+				     size_t num_pages /* amount of pages */ )
+{
+	int i;			// loop counter
 
-    for (i=0; i < num_pages ; i++) {
-	struct page *page = prepared_pages[i];
+	for (i = 0; i < num_pages; i++) {
+		struct page *page = prepared_pages[i];
 
-	try_to_free_buffers(page);
-	unlock_page(page);
-	page_cache_release(page);
-    }
+		try_to_free_buffers(page);
+		unlock_page(page);
+		page_cache_release(page);
+	}
 }
 
 /* This function will copy data from userspace to specified pages within
    supplied byte range */
-static int reiserfs_copy_from_user_to_file_region(
-				loff_t pos, /* In-file position */
-				int num_pages, /* Number of pages affected */
-				int write_bytes, /* Amount of bytes to write */
-				struct page **prepared_pages, /* pointer to 
-								 array to
-								 prepared pages
-								*/
-				const char __user *buf /* Pointer to user-supplied
-						   data*/
-				)
+static int reiserfs_copy_from_user_to_file_region(loff_t pos,	/* In-file position */
+						  int num_pages,	/* Number of pages affected */
+						  int write_bytes,	/* Amount of bytes to write */
+						  struct page **prepared_pages,	/* pointer to 
+										   array to
+										   prepared pages
+										 */
+						  const char __user * buf	/* Pointer to user-supplied
+										   data */
+    )
 {
-    long page_fault=0; // status of copy_from_user.
-    int i; // loop counter.
-    int offset; // offset in page
-
-    for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) {
-	size_t count = min_t(size_t,PAGE_CACHE_SIZE-offset,write_bytes); // How much of bytes to write to this page
-	struct page *page=prepared_pages[i]; // Current page we process.
-
-	fault_in_pages_readable( buf, count);
-
-	/* Copy data from userspace to the current page */
-	kmap(page);
-	page_fault = __copy_from_user(page_address(page)+offset, buf, count); // Copy the data.
-	/* Flush processor's dcache for this page */
-	flush_dcache_page(page);
-	kunmap(page);
-	buf+=count;
-	write_bytes-=count;
-
-	if (page_fault)
-	    break; // Was there a fault? abort.
-    }
-
-    return page_fault?-EFAULT:0;
+	long page_fault = 0;	// status of copy_from_user.
+	int i;			// loop counter.
+	int offset;		// offset in page
+
+	for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
+	     i++, offset = 0) {
+		size_t count = min_t(size_t, PAGE_CACHE_SIZE - offset, write_bytes);	// How much of bytes to write to this page
+		struct page *page = prepared_pages[i];	// Current page we process.
+
+		fault_in_pages_readable(buf, count);
+
+		/* Copy data from userspace to the current page */
+		kmap(page);
+		page_fault = __copy_from_user(page_address(page) + offset, buf, count);	// Copy the data.
+		/* Flush processor's dcache for this page */
+		flush_dcache_page(page);
+		kunmap(page);
+		buf += count;
+		write_bytes -= count;
+
+		if (page_fault)
+			break;	// Was there a fault? abort.
+	}
+
+	return page_fault ? -EFAULT : 0;
 }
 
 /* taken fs/buffer.c:__block_commit_write */
 int reiserfs_commit_page(struct inode *inode, struct page *page,
-		unsigned from, unsigned to)
+			 unsigned from, unsigned to)
 {
-    unsigned block_start, block_end;
-    int partial = 0;
-    unsigned blocksize;
-    struct buffer_head *bh, *head;
-    unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
-    int new;
-    int logit = reiserfs_file_data_log(inode);
-    struct super_block *s = inode->i_sb;
-    int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
-    struct reiserfs_transaction_handle th;
-    int ret = 0;
-
-    th.t_trans_id = 0;
-    blocksize = 1 << inode->i_blkbits;
-
-    if (logit) {
-	reiserfs_write_lock(s);
-	ret = journal_begin(&th, s, bh_per_page + 1);
-	if (ret)
-	    goto drop_write_lock;
-	reiserfs_update_inode_transaction(inode);
-    }
-    for(bh = head = page_buffers(page), block_start = 0;
-        bh != head || !block_start;
-	block_start=block_end, bh = bh->b_this_page)
-    {
-
-	new = buffer_new(bh);
-	clear_buffer_new(bh);
-	block_end = block_start + blocksize;
-	if (block_end <= from || block_start >= to) {
-	    if (!buffer_uptodate(bh))
-		    partial = 1;
-	} else {
-	    set_buffer_uptodate(bh);
-	    if (logit) {
-		reiserfs_prepare_for_journal(s, bh, 1);
-		journal_mark_dirty(&th, s, bh);
-	    } else if (!buffer_dirty(bh)) {
-		mark_buffer_dirty(bh);
-		/* do data=ordered on any page past the end
-		 * of file and any buffer marked BH_New.
-		 */
-		if (reiserfs_data_ordered(inode->i_sb) &&
-		    (new || page->index >= i_size_index)) {
-		    reiserfs_add_ordered_list(inode, bh);
-	        }
-	    }
+	unsigned block_start, block_end;
+	int partial = 0;
+	unsigned blocksize;
+	struct buffer_head *bh, *head;
+	unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	int new;
+	int logit = reiserfs_file_data_log(inode);
+	struct super_block *s = inode->i_sb;
+	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
+	struct reiserfs_transaction_handle th;
+	int ret = 0;
+
+	th.t_trans_id = 0;
+	blocksize = 1 << inode->i_blkbits;
+
+	if (logit) {
+		reiserfs_write_lock(s);
+		ret = journal_begin(&th, s, bh_per_page + 1);
+		if (ret)
+			goto drop_write_lock;
+		reiserfs_update_inode_transaction(inode);
+	}
+	for (bh = head = page_buffers(page), block_start = 0;
+	     bh != head || !block_start;
+	     block_start = block_end, bh = bh->b_this_page) {
+
+		new = buffer_new(bh);
+		clear_buffer_new(bh);
+		block_end = block_start + blocksize;
+		if (block_end <= from || block_start >= to) {
+			if (!buffer_uptodate(bh))
+				partial = 1;
+		} else {
+			set_buffer_uptodate(bh);
+			if (logit) {
+				reiserfs_prepare_for_journal(s, bh, 1);
+				journal_mark_dirty(&th, s, bh);
+			} else if (!buffer_dirty(bh)) {
+				mark_buffer_dirty(bh);
+				/* do data=ordered on any page past the end
+				 * of file and any buffer marked BH_New.
+				 */
+				if (reiserfs_data_ordered(inode->i_sb) &&
+				    (new || page->index >= i_size_index)) {
+					reiserfs_add_ordered_list(inode, bh);
+				}
+			}
+		}
 	}
-    }
-    if (logit) {
-	ret = journal_end(&th, s, bh_per_page + 1);
-drop_write_lock:
-	reiserfs_write_unlock(s);
-    }
-    /*
-     * If this is a partial write which happened to make all buffers
-     * uptodate then we can optimize away a bogus readpage() for
-     * the next read(). Here we 'discover' whether the page went
-     * uptodate as a result of this (potentially partial) write.
-     */
-    if (!partial)
-	SetPageUptodate(page);
-    return ret;
+	if (logit) {
+		ret = journal_end(&th, s, bh_per_page + 1);
+	      drop_write_lock:
+		reiserfs_write_unlock(s);
+	}
+	/*
+	 * If this is a partial write which happened to make all buffers
+	 * uptodate then we can optimize away a bogus readpage() for
+	 * the next read(). Here we 'discover' whether the page went
+	 * uptodate as a result of this (potentially partial) write.
+	 */
+	if (!partial)
+		SetPageUptodate(page);
+	return ret;
 }
 
-
 /* Submit pages for write. This was separated from actual file copying
    because we might want to allocate block numbers in-between.
    This function assumes that caller will adjust file size to correct value. */
-static int reiserfs_submit_file_region_for_write(
-				struct reiserfs_transaction_handle *th,
-				struct inode *inode,
-				loff_t pos, /* Writing position offset */
-				size_t num_pages, /* Number of pages to write */
-				size_t write_bytes, /* number of bytes to write */
-				struct page **prepared_pages /* list of pages */
-				)
+static int reiserfs_submit_file_region_for_write(struct reiserfs_transaction_handle *th, struct inode *inode, loff_t pos,	/* Writing position offset */
+						 size_t num_pages,	/* Number of pages to write */
+						 size_t write_bytes,	/* number of bytes to write */
+						 struct page **prepared_pages	/* list of pages */
+    )
 {
-    int status; // return status of block_commit_write.
-    int retval = 0; // Return value we are going to return.
-    int i; // loop counter
-    int offset; // Writing offset in page.
-    int orig_write_bytes = write_bytes;
-    int sd_update = 0;
-
-    for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) {
-	int count = min_t(int,PAGE_CACHE_SIZE-offset,write_bytes); // How much of bytes to write to this page
-	struct page *page=prepared_pages[i]; // Current page we process.
-
-	status = reiserfs_commit_page(inode, page, offset, offset+count);
-	if ( status )
-	    retval = status; // To not overcomplicate matters We are going to
-			     // submit all the pages even if there was error.
-			     // we only remember error status to report it on
-			     // exit.
-	write_bytes-=count;
-    }
-    /* now that we've gotten all the ordered buffers marked dirty,
-     * we can safely update i_size and close any running transaction
-     */
-    if ( pos + orig_write_bytes > inode->i_size) {
-	inode->i_size = pos + orig_write_bytes; // Set new size
-	/* If the file have grown so much that tail packing is no
-	 * longer possible, reset "need to pack" flag */
-	if ( (have_large_tails (inode->i_sb) &&
-	      inode->i_size > i_block_size (inode)*4) ||
-	     (have_small_tails (inode->i_sb) &&
-	     inode->i_size > i_block_size(inode)) )
-	    REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
-        else if ( (have_large_tails (inode->i_sb) &&
-	          inode->i_size < i_block_size (inode)*4) ||
-	          (have_small_tails (inode->i_sb) &&
-		  inode->i_size < i_block_size(inode)) )
-	    REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
-
+	int status;		// return status of reiserfs_commit_page.
+	int retval = 0;		// Return value we are going to return.
+	int i;			// loop counter
+	int offset;		// Writing offset in page.
+	int orig_write_bytes = write_bytes;
+	int sd_update = 0;
+
+	for (i = 0, offset = (pos & (PAGE_CACHE_SIZE - 1)); i < num_pages;
+	     i++, offset = 0) {
+		int count = min_t(int, PAGE_CACHE_SIZE - offset, write_bytes);	// How much of bytes to write to this page
+		struct page *page = prepared_pages[i];	// Current page we process.
+
+		status =
+		    reiserfs_commit_page(inode, page, offset, offset + count);
+		if (status)
+			retval = status;	// To not overcomplicate matters We are going to
+		// submit all the pages even if there was error.
+		// we only remember error status to report it on
+		// exit.
+		write_bytes -= count;
+	}
+	/* now that we've gotten all the ordered buffers marked dirty,
+	 * we can safely update i_size and close any running transaction
+	 */
+	if (pos + orig_write_bytes > inode->i_size) {
+		inode->i_size = pos + orig_write_bytes;	// Set new size
+		/* If the file has grown so much that tail packing is no
+		 * longer possible, reset "need to pack" flag */
+		if ((have_large_tails(inode->i_sb) &&
+		     inode->i_size > i_block_size(inode) * 4) ||
+		    (have_small_tails(inode->i_sb) &&
+		     inode->i_size > i_block_size(inode)))
+			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
+		else if ((have_large_tails(inode->i_sb) &&
+			  inode->i_size < i_block_size(inode) * 4) ||
+			 (have_small_tails(inode->i_sb) &&
+			  inode->i_size < i_block_size(inode)))
+			REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
+
+		if (th->t_trans_id) {
+			reiserfs_write_lock(inode->i_sb);
+			reiserfs_update_sd(th, inode);	// And update on-disk metadata
+			reiserfs_write_unlock(inode->i_sb);
+		} else
+			inode->i_sb->s_op->dirty_inode(inode);
+
+		sd_update = 1;
+	}
 	if (th->t_trans_id) {
-	    reiserfs_write_lock(inode->i_sb);
-	    reiserfs_update_sd(th, inode); // And update on-disk metadata
-	    reiserfs_write_unlock(inode->i_sb);
-	} else
-	    inode->i_sb->s_op->dirty_inode(inode);
+		reiserfs_write_lock(inode->i_sb);
+		if (!sd_update)
+			reiserfs_update_sd(th, inode);
+		status = journal_end(th, th->t_super, th->t_blocks_allocated);
+		if (status)
+			retval = status;
+		reiserfs_write_unlock(inode->i_sb);
+	}
+	th->t_trans_id = 0;
 
-        sd_update = 1;
-    }
-    if (th->t_trans_id) {
-	reiserfs_write_lock(inode->i_sb);
-	if (!sd_update)
-	    reiserfs_update_sd(th, inode);
-	status = journal_end(th, th->t_super, th->t_blocks_allocated);
-        if (status)
-            retval = status;
-	reiserfs_write_unlock(inode->i_sb);
-    }
-    th->t_trans_id = 0;
-
-    /* 
-     * we have to unlock the pages after updating i_size, otherwise
-     * we race with writepage
-     */
-    for ( i = 0; i < num_pages ; i++) {
-	struct page *page=prepared_pages[i];
-	unlock_page(page); 
-	mark_page_accessed(page);
-	page_cache_release(page);
-    }
-    return retval;
+	/* 
+	 * we have to unlock the pages after updating i_size, otherwise
+	 * we race with writepage
+	 */
+	for (i = 0; i < num_pages; i++) {
+		struct page *page = prepared_pages[i];
+		unlock_page(page);
+		mark_page_accessed(page);
+		page_cache_release(page);
+	}
+	return retval;
 }
 
 /* Look if passed writing region is going to touch file's tail
    (if it is present). And if it is, convert the tail to unformatted node */
-static int reiserfs_check_for_tail_and_convert( struct inode *inode, /* inode to deal with */
-					 loff_t pos, /* Writing position */
-					 int write_bytes /* amount of bytes to write */
-				        )
+static int reiserfs_check_for_tail_and_convert(struct inode *inode,	/* inode to deal with */
+					       loff_t pos,	/* Writing position */
+					       int write_bytes	/* amount of bytes to write */
+    )
 {
-    INITIALIZE_PATH(path); // needed for search_for_position
-    struct cpu_key key; // Key that would represent last touched writing byte.
-    struct item_head *ih; // item header of found block;
-    int res; // Return value of various functions we call.
-    int cont_expand_offset; // We will put offset for generic_cont_expand here
-			    // This can be int just because tails are created
-			    // only for small files.
- 
+	INITIALIZE_PATH(path);	// needed for search_for_position
+	struct cpu_key key;	// Key that would represent last touched writing byte.
+	struct item_head *ih;	// item header of found block;
+	int res;		// Return value of various functions we call.
+	int cont_expand_offset;	// We will put offset for generic_cont_expand here
+	// This can be int just because tails are created
+	// only for small files.
+
 /* this embodies a dependency on a particular tail policy */
-    if ( inode->i_size >= inode->i_sb->s_blocksize*4 ) {
-	/* such a big files do not have tails, so we won't bother ourselves
-	   to look for tails, simply return */
-	return 0;
-    }
-
-    reiserfs_write_lock(inode->i_sb);
-    /* find the item containing the last byte to be written, or if
-     * writing past the end of the file then the last item of the
-     * file (and then we check its type). */
-    make_cpu_key (&key, inode, pos+write_bytes+1, TYPE_ANY, 3/*key length*/);
-    res = search_for_position_by_key(inode->i_sb, &key, &path);
-    if ( res == IO_ERROR ) {
-        reiserfs_write_unlock(inode->i_sb);
-	return -EIO;
-    }
-    ih = get_ih(&path);
-    res = 0;
-    if ( is_direct_le_ih(ih) ) {
-	/* Ok, closest item is file tail (tails are stored in "direct"
-	 * items), so we need to unpack it. */
-	/* To not overcomplicate matters, we just call generic_cont_expand
-	   which will in turn call other stuff and finally will boil down to
-	    reiserfs_get_block() that would do necessary conversion. */
-	cont_expand_offset = le_key_k_offset(get_inode_item_key_version(inode), &(ih->ih_key));
-	pathrelse(&path);
-	res = generic_cont_expand( inode, cont_expand_offset);
-    } else
-	pathrelse(&path);
+	if (inode->i_size >= inode->i_sb->s_blocksize * 4) {
+		/* such big files do not have tails, so we won't bother ourselves
+		   to look for tails, simply return */
+		return 0;
+	}
 
-    reiserfs_write_unlock(inode->i_sb);
-    return res;
+	reiserfs_write_lock(inode->i_sb);
+	/* find the item containing the last byte to be written, or if
+	 * writing past the end of the file then the last item of the
+	 * file (and then we check its type). */
+	make_cpu_key(&key, inode, pos + write_bytes + 1, TYPE_ANY,
+		     3 /*key length */ );
+	res = search_for_position_by_key(inode->i_sb, &key, &path);
+	if (res == IO_ERROR) {
+		reiserfs_write_unlock(inode->i_sb);
+		return -EIO;
+	}
+	ih = get_ih(&path);
+	res = 0;
+	if (is_direct_le_ih(ih)) {
+		/* Ok, closest item is file tail (tails are stored in "direct"
+		 * items), so we need to unpack it. */
+		/* To not overcomplicate matters, we just call generic_cont_expand
+		   which will in turn call other stuff and finally will boil down to
+		   reiserfs_get_block() that would do necessary conversion. */
+		cont_expand_offset =
+		    le_key_k_offset(get_inode_item_key_version(inode),
+				    &(ih->ih_key));
+		pathrelse(&path);
+		res = generic_cont_expand(inode, cont_expand_offset);
+	} else
+		pathrelse(&path);
+
+	reiserfs_write_unlock(inode->i_sb);
+	return res;
 }
 
 /* This function locks pages starting from @pos for @inode.
@@ -851,275 +947,296 @@ static int reiserfs_check_for_tail_and_convert( struct inode *inode, /* inode to
    append), it is zeroed, then. 
    Returns number of unallocated blocks that should be allocated to cover
    new file data.*/
-static int reiserfs_prepare_file_region_for_write(
-				struct inode *inode /* Inode of the file */,
-				loff_t pos, /* position in the file */
-				size_t num_pages, /* number of pages to
-					          prepare */
-				size_t write_bytes, /* Amount of bytes to be
-						    overwritten from
-						    @pos */
-				struct page **prepared_pages /* pointer to array
-							       where to store
-							       prepared pages */
-					   )
+static int reiserfs_prepare_file_region_for_write(struct inode *inode
+						  /* Inode of the file */ ,
+						  loff_t pos,	/* position in the file */
+						  size_t num_pages,	/* number of pages to
+									   prepare */
+						  size_t write_bytes,	/* Amount of bytes to be
+									   overwritten from
+									   @pos */
+						  struct page **prepared_pages	/* pointer to array
+										   where to store
+										   prepared pages */
+    )
 {
-    int res=0; // Return values of different functions we call.
-    unsigned long index = pos >> PAGE_CACHE_SHIFT; // Offset in file in pages.
-    int from = (pos & (PAGE_CACHE_SIZE - 1)); // Writing offset in first page
-    int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
-					 /* offset of last modified byte in last
-				            page */
-    struct address_space *mapping = inode->i_mapping; // Pages are mapped here.
-    int i; // Simple counter
-    int blocks = 0; /* Return value (blocks that should be allocated) */
-    struct buffer_head *bh, *head; // Current bufferhead and first bufferhead
-				   // of a page.
-    unsigned block_start, block_end; // Starting and ending offsets of current
-				     // buffer in the page.
-    struct buffer_head *wait[2], **wait_bh=wait; // Buffers for page, if
-						 // Page appeared to be not up
-						 // to date. Note how we have
-						 // at most 2 buffers, this is
-						 // because we at most may
-						 // partially overwrite two
-						 // buffers for one page. One at                                                 // the beginning of write area
-						 // and one at the end.
-						 // Everything inthe middle gets                                                 // overwritten totally.
-
-    struct cpu_key key; // cpu key of item that we are going to deal with
-    struct item_head *ih = NULL; // pointer to item head that we are going to deal with
-    struct buffer_head *itembuf=NULL; // Buffer head that contains items that we are going to deal with
-    INITIALIZE_PATH(path); // path to item, that we are going to deal with.
-    __le32 * item=NULL; // pointer to item we are going to deal with
-    int item_pos=-1; /* Position in indirect item */
-
-
-    if ( num_pages < 1 ) {
-	reiserfs_warning (inode->i_sb,
-			  "green-9001: reiserfs_prepare_file_region_for_write "
-			  "called with zero number of pages to process");
-	return -EFAULT;
-    }
-
-    /* We have 2 loops for pages. In first loop we grab and lock the pages, so
-       that nobody would touch these until we release the pages. Then
-       we'd start to deal with mapping buffers to blocks. */
-    for ( i = 0; i < num_pages; i++) {
-	prepared_pages[i] = grab_cache_page(mapping, index + i); // locks the page
-	if ( !prepared_pages[i]) {
-	    res = -ENOMEM;
-	    goto failed_page_grabbing;
-	}
-	if (!page_has_buffers(prepared_pages[i]))
-	    create_empty_buffers(prepared_pages[i], inode->i_sb->s_blocksize, 0);
-    }
-
-    /* Let's count amount of blocks for a case where all the blocks
-       overwritten are new (we will substract already allocated blocks later)*/
-    if ( num_pages > 2 )
-	/* These are full-overwritten pages so we count all the blocks in
-	   these pages are counted as needed to be allocated */
-	blocks = (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-
-    /* count blocks needed for first page (possibly partially written) */
-    blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) +
-	   !!(from & (inode->i_sb->s_blocksize-1)); /* roundup */
-
-    /* Now we account for last page. If last page == first page (we
-       overwrite only one page), we substract all the blocks past the
-       last writing position in a page out of already calculated number
-       of blocks */
-    blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT-inode->i_blkbits)) -
-	   ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits);
-	   /* Note how we do not roundup here since partial blocks still
-		   should be allocated */
-
-    /* Now if all the write area lies past the file end, no point in
-       maping blocks, since there is none, so we just zero out remaining
-       parts of first and last pages in write area (if needed) */
-    if ( (pos & ~((loff_t)PAGE_CACHE_SIZE - 1)) > inode->i_size ) {
-	if ( from != 0 ) {/* First page needs to be partially zeroed */
-	    char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
-	    memset(kaddr, 0, from);
-	    kunmap_atomic( kaddr, KM_USER0);
-	}
-	if ( to != PAGE_CACHE_SIZE ) { /* Last page needs to be partially zeroed */
-	    char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0);
-	    memset(kaddr+to, 0, PAGE_CACHE_SIZE - to);
-	    kunmap_atomic( kaddr, KM_USER0);
+	int res = 0;		// Return values of different functions we call.
+	unsigned long index = pos >> PAGE_CACHE_SHIFT;	// Offset in file in pages.
+	int from = (pos & (PAGE_CACHE_SIZE - 1));	// Writing offset in first page
+	int to = ((pos + write_bytes - 1) & (PAGE_CACHE_SIZE - 1)) + 1;
+	/* offset of last modified byte in last
+	   page */
+	struct address_space *mapping = inode->i_mapping;	// Pages are mapped here.
+	int i;			// Simple counter
+	int blocks = 0;		/* Return value (blocks that should be allocated) */
+	struct buffer_head *bh, *head;	// Current bufferhead and first bufferhead
+	// of a page.
+	unsigned block_start, block_end;	// Starting and ending offsets of current
+	// buffer in the page.
+	struct buffer_head *wait[2], **wait_bh = wait;	// Buffers for page, if
+	// Page appeared to be not up
+	// to date. Note how we have
+	// at most 2 buffers, this is
+	// because we at most may
+	// partially overwrite two
+	// buffers for one page. One at the beginning of write area
+	// and one at the end.
+	// Everything in the middle gets overwritten totally.
+
+	struct cpu_key key;	// cpu key of item that we are going to deal with
+	struct item_head *ih = NULL;	// pointer to item head that we are going to deal with
+	struct buffer_head *itembuf = NULL;	// Buffer head that contains items that we are going to deal with
+	INITIALIZE_PATH(path);	// path to item, that we are going to deal with.
+	__le32 *item = NULL;	// pointer to item we are going to deal with
+	int item_pos = -1;	/* Position in indirect item */
+
+	if (num_pages < 1) {
+		reiserfs_warning(inode->i_sb,
+				 "green-9001: reiserfs_prepare_file_region_for_write "
+				 "called with zero number of pages to process");
+		return -EFAULT;
 	}
 
-	/* Since all blocks are new - use already calculated value */
-	return blocks;
-    }
-
-    /* Well, since we write somewhere into the middle of a file, there is
-       possibility we are writing over some already allocated blocks, so
-       let's map these blocks and substract number of such blocks out of blocks
-       we need to allocate (calculated above) */
-    /* Mask write position to start on blocksize, we do it out of the
-       loop for performance reasons */
-    pos &= ~((loff_t) inode->i_sb->s_blocksize - 1);
-    /* Set cpu key to the starting position in a file (on left block boundary)*/
-    make_cpu_key (&key, inode, 1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)), TYPE_ANY, 3/*key length*/);
-
-    reiserfs_write_lock(inode->i_sb); // We need that for at least search_by_key()
-    for ( i = 0; i < num_pages ; i++ ) { 
-
-	head = page_buffers(prepared_pages[i]);
-	/* For each buffer in the page */
-	for(bh = head, block_start = 0; bh != head || !block_start;
-	    block_start=block_end, bh = bh->b_this_page) {
-		if (!bh)
-		    reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
-		/* Find where this buffer ends */
-		block_end = block_start+inode->i_sb->s_blocksize;
-		if (i == 0 && block_end <= from )
-		    /* if this buffer is before requested data to map, skip it*/
-		    continue;
-
-		if (i == num_pages - 1 && block_start >= to) {
-		    /* If this buffer is after requested data to map, abort
-		       processing of current page */
-		    break;
+	/* We have 2 loops for pages. In first loop we grab and lock the pages, so
+	   that nobody would touch these until we release the pages. Then
+	   we'd start to deal with mapping buffers to blocks. */
+	for (i = 0; i < num_pages; i++) {
+		prepared_pages[i] = grab_cache_page(mapping, index + i);	// locks the page
+		if (!prepared_pages[i]) {
+			res = -ENOMEM;
+			goto failed_page_grabbing;
 		}
+		if (!page_has_buffers(prepared_pages[i]))
+			create_empty_buffers(prepared_pages[i],
+					     inode->i_sb->s_blocksize, 0);
+	}
 
-		if ( buffer_mapped(bh) && bh->b_blocknr !=0 ) {
-		    /* This is optimisation for a case where buffer is mapped
-		       and have blocknumber assigned. In case significant amount
-		       of such buffers are present, we may avoid some amount
-		       of search_by_key calls.
-		       Probably it would be possible to move parts of this code
-		       out of BKL, but I afraid that would overcomplicate code
-		       without any noticeable benefit.
-		    */
-		    item_pos++;
-		    /* Update the key */
-		    set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize);
-		    blocks--; // Decrease the amount of blocks that need to be
-			      // allocated
-		    continue; // Go to the next buffer
+	/* Let's count the number of blocks for the case where all the blocks
+	   overwritten are new (we will subtract already allocated blocks later) */
+	if (num_pages > 2)
+		/* These are fully overwritten pages, so all the blocks in
+		   these pages are counted as needing to be allocated */
+		blocks =
+		    (num_pages - 2) << (PAGE_CACHE_SHIFT - inode->i_blkbits);
+
+	/* count blocks needed for first page (possibly partially written) */
+	blocks += ((PAGE_CACHE_SIZE - from) >> inode->i_blkbits) + !!(from & (inode->i_sb->s_blocksize - 1));	/* roundup */
+
+	/* Now we account for the last page. If last page == first page (we
+	   overwrite only one page), we subtract all the blocks past the
+	   last writing position in the page from the already calculated
+	   number of blocks */
+	blocks += ((num_pages > 1) << (PAGE_CACHE_SHIFT - inode->i_blkbits)) -
+	    ((PAGE_CACHE_SIZE - to) >> inode->i_blkbits);
+	/* Note how we do not roundup here since partial blocks still
+	   should be allocated */
+
+	/* Now if all the write area lies past the file end, no point in
+	   mapping blocks, since there are none, so we just zero out remaining
+	   parts of first and last pages in write area (if needed) */
+	if ((pos & ~((loff_t) PAGE_CACHE_SIZE - 1)) > inode->i_size) {
+		if (from != 0) {	/* First page needs to be partially zeroed */
+			char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
+			memset(kaddr, 0, from);
+			kunmap_atomic(kaddr, KM_USER0);
+		}
+		if (to != PAGE_CACHE_SIZE) {	/* Last page needs to be partially zeroed */
+			char *kaddr =
+			    kmap_atomic(prepared_pages[num_pages - 1],
+					KM_USER0);
+			memset(kaddr + to, 0, PAGE_CACHE_SIZE - to);
+			kunmap_atomic(kaddr, KM_USER0);
 		}
 
-		if ( !itembuf || /* if first iteration */
-		     item_pos >= ih_item_len(ih)/UNFM_P_SIZE)
-					     { /* or if we progressed past the
-						  current unformatted_item */
-			/* Try to find next item */
-			res = search_for_position_by_key(inode->i_sb, &key, &path);
-			/* Abort if no more items */
-			if ( res != POSITION_FOUND ) {
-			    /* make sure later loops don't use this item */
-			    itembuf = NULL;
-			    item = NULL;
-			    break;
+		/* Since all blocks are new - use already calculated value */
+		return blocks;
+	}
+
+	/* Well, since we write somewhere into the middle of a file, there is a
+	   possibility we are writing over some already allocated blocks, so
+	   let's map these blocks and subtract the number of such blocks from the
+	   blocks we need to allocate (calculated above) */
+	/* Mask write position to start on blocksize, we do it out of the
+	   loop for performance reasons */
+	pos &= ~((loff_t) inode->i_sb->s_blocksize - 1);
+	/* Set cpu key to the starting position in a file (on left block boundary) */
+	make_cpu_key(&key, inode,
+		     1 + ((pos) & ~((loff_t) inode->i_sb->s_blocksize - 1)),
+		     TYPE_ANY, 3 /*key length */ );
+
+	reiserfs_write_lock(inode->i_sb);	// We need that for at least search_by_key()
+	for (i = 0; i < num_pages; i++) {
+
+		head = page_buffers(prepared_pages[i]);
+		/* For each buffer in the page */
+		for (bh = head, block_start = 0; bh != head || !block_start;
+		     block_start = block_end, bh = bh->b_this_page) {
+			if (!bh)
+				reiserfs_panic(inode->i_sb,
+					       "green-9002: Allocated but absent buffer for a page?");
+			/* Find where this buffer ends */
+			block_end = block_start + inode->i_sb->s_blocksize;
+			if (i == 0 && block_end <= from)
+				/* if this buffer is before requested data to map, skip it */
+				continue;
+
+			if (i == num_pages - 1 && block_start >= to) {
+				/* If this buffer is after requested data to map, abort
+				   processing of current page */
+				break;
 			}
 
-			/* Update information about current indirect item */
-			itembuf = get_last_bh( &path );
-			ih = get_ih( &path );
-			item = get_item( &path );
-			item_pos = path.pos_in_item;
+			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
+				/* This is optimisation for a case where buffer is mapped
+				   and have blocknumber assigned. In case significant amount
+				   of such buffers are present, we may avoid some amount
+				   of search_by_key calls.
+				   Probably it would be possible to move parts of this code
+				   out of BKL, but I am afraid that would overcomplicate code
+				   without any noticeable benefit.
+				 */
+				item_pos++;
+				/* Update the key */
+				set_cpu_key_k_offset(&key,
+						     cpu_key_k_offset(&key) +
+						     inode->i_sb->s_blocksize);
+				blocks--;	// Decrease the amount of blocks that need to be
+				// allocated
+				continue;	// Go to the next buffer
+			}
 
-			RFALSE( !is_indirect_le_ih (ih), "green-9003: indirect item expected");
-		}
+			if (!itembuf ||	/* if first iteration */
+			    item_pos >= ih_item_len(ih) / UNFM_P_SIZE) {	/* or if we progressed past the
+										   current unformatted_item */
+				/* Try to find next item */
+				res =
+				    search_for_position_by_key(inode->i_sb,
+							       &key, &path);
+				/* Abort if no more items */
+				if (res != POSITION_FOUND) {
+					/* make sure later loops don't use this item */
+					itembuf = NULL;
+					item = NULL;
+					break;
+				}
+
+				/* Update information about current indirect item */
+				itembuf = get_last_bh(&path);
+				ih = get_ih(&path);
+				item = get_item(&path);
+				item_pos = path.pos_in_item;
+
+				RFALSE(!is_indirect_le_ih(ih),
+				       "green-9003: indirect item expected");
+			}
 
-		/* See if there is some block associated with the file
-		   at that position, map the buffer to this block */
-		if ( get_block_num(item,item_pos) ) {
-		    map_bh(bh, inode->i_sb, get_block_num(item,item_pos));
-		    blocks--; // Decrease the amount of blocks that need to be
-			      // allocated
+			/* See if there is some block associated with the file
+			   at that position, map the buffer to this block */
+			if (get_block_num(item, item_pos)) {
+				map_bh(bh, inode->i_sb,
+				       get_block_num(item, item_pos));
+				blocks--;	// Decrease the amount of blocks that need to be
+				// allocated
+			}
+			item_pos++;
+			/* Update the key */
+			set_cpu_key_k_offset(&key,
+					     cpu_key_k_offset(&key) +
+					     inode->i_sb->s_blocksize);
 		}
-		item_pos++;
-		/* Update the key */
-		set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + inode->i_sb->s_blocksize);
 	}
-    }
-    pathrelse(&path); // Free the path
-    reiserfs_write_unlock(inode->i_sb);
+	pathrelse(&path);	// Free the path
+	reiserfs_write_unlock(inode->i_sb);
 
 	/* Now zero out unmappend buffers for the first and last pages of
 	   write area or issue read requests if page is mapped. */
 	/* First page, see if it is not uptodate */
-	if ( !PageUptodate(prepared_pages[0]) ) {
-	    head = page_buffers(prepared_pages[0]);
-
-	    /* For each buffer in page */
-	    for(bh = head, block_start = 0; bh != head || !block_start;
-		block_start=block_end, bh = bh->b_this_page) {
-
-		if (!bh)
-		    reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
-		/* Find where this buffer ends */
-		block_end = block_start+inode->i_sb->s_blocksize;
-		if ( block_end <= from )
-		    /* if this buffer is before requested data to map, skip it*/
-		    continue;
-		if ( block_start < from ) { /* Aha, our partial buffer */
-		    if ( buffer_mapped(bh) ) { /* If it is mapped, we need to
-						  issue READ request for it to
-						  not loose data */
-			ll_rw_block(READ, 1, &bh);
-			*wait_bh++=bh;
-		    } else { /* Not mapped, zero it */
-			char *kaddr = kmap_atomic(prepared_pages[0], KM_USER0);
-			memset(kaddr+block_start, 0, from-block_start);
-			kunmap_atomic( kaddr, KM_USER0);
-			set_buffer_uptodate(bh);
-		    }
+	if (!PageUptodate(prepared_pages[0])) {
+		head = page_buffers(prepared_pages[0]);
+
+		/* For each buffer in page */
+		for (bh = head, block_start = 0; bh != head || !block_start;
+		     block_start = block_end, bh = bh->b_this_page) {
+
+			if (!bh)
+				reiserfs_panic(inode->i_sb,
+					       "green-9002: Allocated but absent buffer for a page?");
+			/* Find where this buffer ends */
+			block_end = block_start + inode->i_sb->s_blocksize;
+			if (block_end <= from)
+				/* if this buffer is before requested data to map, skip it */
+				continue;
+			if (block_start < from) {	/* Aha, our partial buffer */
+				if (buffer_mapped(bh)) {	/* If it is mapped, we need to
+								   issue READ request for it to
+								   not lose data */
+					ll_rw_block(READ, 1, &bh);
+					*wait_bh++ = bh;
+				} else {	/* Not mapped, zero it */
+					char *kaddr =
+					    kmap_atomic(prepared_pages[0],
+							KM_USER0);
+					memset(kaddr + block_start, 0,
+					       from - block_start);
+					kunmap_atomic(kaddr, KM_USER0);
+					set_buffer_uptodate(bh);
+				}
+			}
 		}
-	    }
 	}
 
 	/* Last page, see if it is not uptodate, or if the last page is past the end of the file. */
-	if ( !PageUptodate(prepared_pages[num_pages-1]) || 
-	    ((pos+write_bytes)>>PAGE_CACHE_SHIFT) > (inode->i_size>>PAGE_CACHE_SHIFT) ) {
-	    head = page_buffers(prepared_pages[num_pages-1]);
-
-	    /* for each buffer in page */
-	    for(bh = head, block_start = 0; bh != head || !block_start;
-		block_start=block_end, bh = bh->b_this_page) {
-
-		if (!bh)
-		    reiserfs_panic(inode->i_sb, "green-9002: Allocated but absent buffer for a page?");
-		/* Find where this buffer ends */
-		block_end = block_start+inode->i_sb->s_blocksize;
-		if ( block_start >= to )
-		    /* if this buffer is after requested data to map, skip it*/
-		    break;
-		if ( block_end > to ) { /* Aha, our partial buffer */
-		    if ( buffer_mapped(bh) ) { /* If it is mapped, we need to
-						  issue READ request for it to
-						  not loose data */
-			ll_rw_block(READ, 1, &bh);
-			*wait_bh++=bh;
-		    } else { /* Not mapped, zero it */
-			char *kaddr = kmap_atomic(prepared_pages[num_pages-1], KM_USER0);
-			memset(kaddr+to, 0, block_end-to);
-			kunmap_atomic( kaddr, KM_USER0);
-			set_buffer_uptodate(bh);
-		    }
+	if (!PageUptodate(prepared_pages[num_pages - 1]) ||
+	    ((pos + write_bytes) >> PAGE_CACHE_SHIFT) >
+	    (inode->i_size >> PAGE_CACHE_SHIFT)) {
+		head = page_buffers(prepared_pages[num_pages - 1]);
+
+		/* for each buffer in page */
+		for (bh = head, block_start = 0; bh != head || !block_start;
+		     block_start = block_end, bh = bh->b_this_page) {
+
+			if (!bh)
+				reiserfs_panic(inode->i_sb,
+					       "green-9002: Allocated but absent buffer for a page?");
+			/* Find where this buffer ends */
+			block_end = block_start + inode->i_sb->s_blocksize;
+			if (block_start >= to)
+				/* if this buffer is after requested data to map, skip it */
+				break;
+			if (block_end > to) {	/* Aha, our partial buffer */
+				if (buffer_mapped(bh)) {	/* If it is mapped, we need to
+								   issue READ request for it to
+								   not lose data */
+					ll_rw_block(READ, 1, &bh);
+					*wait_bh++ = bh;
+				} else {	/* Not mapped, zero it */
+					char *kaddr =
+					    kmap_atomic(prepared_pages
+							[num_pages - 1],
+							KM_USER0);
+					memset(kaddr + to, 0, block_end - to);
+					kunmap_atomic(kaddr, KM_USER0);
+					set_buffer_uptodate(bh);
+				}
+			}
 		}
-	    }
 	}
 
-    /* Wait for read requests we made to happen, if necessary */
-    while(wait_bh > wait) {
-	wait_on_buffer(*--wait_bh);
-	if (!buffer_uptodate(*wait_bh)) {
-	    res = -EIO;
-	    goto failed_read;
+	/* Wait for read requests we made to happen, if necessary */
+	while (wait_bh > wait) {
+		wait_on_buffer(*--wait_bh);
+		if (!buffer_uptodate(*wait_bh)) {
+			res = -EIO;
+			goto failed_read;
+		}
 	}
-    }
-
-    return blocks;
-failed_page_grabbing:
-    num_pages = i;
-failed_read:
-    reiserfs_unprepare_pages(prepared_pages, num_pages);
-    return res;
+
+	return blocks;
+      failed_page_grabbing:
+	num_pages = i;
+      failed_read:
+	reiserfs_unprepare_pages(prepared_pages, num_pages);
+	return res;
 }
 
 /* Write @count bytes at position @ppos in a file indicated by @file
@@ -1148,262 +1265,305 @@ failed_read:
    Future Features: providing search_by_key with hints.
 
 */
-static ssize_t reiserfs_file_write( struct file *file, /* the file we are going to write into */
-                             const char __user *buf, /*  pointer to user supplied data
-(in userspace) */
-                             size_t count, /* amount of bytes to write */
-                             loff_t *ppos /* pointer to position in file that we start writing at. Should be updated to
-                                           * new current position before returning. */ )
+static ssize_t reiserfs_file_write(struct file *file,	/* the file we are going to write into */
+				   const char __user * buf,	/*  pointer to user supplied data
+								   (in userspace) */
+				   size_t count,	/* amount of bytes to write */
+				   loff_t * ppos	/* pointer to position in file that we start writing at. Should be updated to
+							 * new current position before returning. */
+				   )
 {
-    size_t already_written = 0; // Number of bytes already written to the file.
-    loff_t pos; // Current position in the file.
-    ssize_t res; // return value of various functions that we call.
-    int err = 0;
-    struct inode *inode = file->f_dentry->d_inode; // Inode of the file that we are writing to.
-				/* To simplify coding at this time, we store
-				   locked pages in array for now */
-    struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
-    struct reiserfs_transaction_handle th;
-    th.t_trans_id = 0;
-
-    if ( file->f_flags & O_DIRECT) { // Direct IO needs treatment
-	ssize_t result, after_file_end = 0;
-	if ( (*ppos + count >= inode->i_size) || (file->f_flags & O_APPEND) ) {
-	    /* If we are appending a file, we need to put this savelink in here.
-	       If we will crash while doing direct io, finish_unfinished will
-	       cut the garbage from the file end. */
-	    reiserfs_write_lock(inode->i_sb);
-	    err = journal_begin(&th, inode->i_sb,  JOURNAL_PER_BALANCE_CNT );
-            if (err) {
-		reiserfs_write_unlock (inode->i_sb);
-		return err;
-	    }
-	    reiserfs_update_inode_transaction(inode);
-	    add_save_link (&th, inode, 1 /* Truncate */);
-	    after_file_end = 1;
-	    err = journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT );
-            reiserfs_write_unlock(inode->i_sb);
-	    if (err)
-		return err;
-	}
-	result = generic_file_write(file, buf, count, ppos);
-
-	if ( after_file_end ) { /* Now update i_size and remove the savelink */
-	    struct reiserfs_transaction_handle th;
-	    reiserfs_write_lock(inode->i_sb);
-	    err = journal_begin(&th, inode->i_sb, 1);
-            if (err) {
-                reiserfs_write_unlock (inode->i_sb);
-                return err;
-            }
-	    reiserfs_update_inode_transaction(inode);
-	    reiserfs_update_sd(&th, inode);
-	    err = journal_end(&th, inode->i_sb, 1);
-            if (err) {
-                reiserfs_write_unlock (inode->i_sb);
-                return err;
-            }
-	    err = remove_save_link (inode, 1/* truncate */);
-	    reiserfs_write_unlock(inode->i_sb);
-            if (err)
-                return err;
-	}
-
-	return result;
-    }
-
-    if ( unlikely((ssize_t) count < 0 ))
-        return -EINVAL;
-
-    if (unlikely(!access_ok(VERIFY_READ, buf, count)))
-        return -EFAULT;
-
-    down(&inode->i_sem); // locks the entire file for just us
-
-    pos = *ppos;
-
-    /* Check if we can write to specified region of file, file
-       is not overly big and this kind of stuff. Adjust pos and
-       count, if needed */
-    res = generic_write_checks(file, &pos, &count, 0);
-    if (res)
-	goto out;
-
-    if ( count == 0 )
-	goto out;
-
-    res = remove_suid(file->f_dentry);
-    if (res)
-	goto out;
-
-    inode_update_time(inode, 1); /* Both mtime and ctime */
-
-    // Ok, we are done with all the checks.
+	size_t already_written = 0;	// Number of bytes already written to the file.
+	loff_t pos;		// Current position in the file.
+	ssize_t res;		// return value of various functions that we call.
+	int err = 0;
+	struct inode *inode = file->f_dentry->d_inode;	// Inode of the file that we are writing to.
+	/* To simplify coding at this time, we store
+	   locked pages in array for now */
+	struct page *prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
+	struct reiserfs_transaction_handle th;
+	th.t_trans_id = 0;
+
+	if (file->f_flags & O_DIRECT) {	// Direct IO needs treatment
+		ssize_t result, after_file_end = 0;
+		if ((*ppos + count >= inode->i_size)
+		    || (file->f_flags & O_APPEND)) {
+			/* If we are appending a file, we need to put this savelink in here.
+			   If we will crash while doing direct io, finish_unfinished will
+			   cut the garbage from the file end. */
+			reiserfs_write_lock(inode->i_sb);
+			err =
+			    journal_begin(&th, inode->i_sb,
+					  JOURNAL_PER_BALANCE_CNT);
+			if (err) {
+				reiserfs_write_unlock(inode->i_sb);
+				return err;
+			}
+			reiserfs_update_inode_transaction(inode);
+			add_save_link(&th, inode, 1 /* Truncate */ );
+			after_file_end = 1;
+			err =
+			    journal_end(&th, inode->i_sb,
+					JOURNAL_PER_BALANCE_CNT);
+			reiserfs_write_unlock(inode->i_sb);
+			if (err)
+				return err;
+		}
+		result = generic_file_write(file, buf, count, ppos);
+
+		if (after_file_end) {	/* Now update i_size and remove the savelink */
+			struct reiserfs_transaction_handle th;
+			reiserfs_write_lock(inode->i_sb);
+			err = journal_begin(&th, inode->i_sb, 1);
+			if (err) {
+				reiserfs_write_unlock(inode->i_sb);
+				return err;
+			}
+			reiserfs_update_inode_transaction(inode);
+			reiserfs_update_sd(&th, inode);
+			err = journal_end(&th, inode->i_sb, 1);
+			if (err) {
+				reiserfs_write_unlock(inode->i_sb);
+				return err;
+			}
+			err = remove_save_link(inode, 1 /* truncate */ );
+			reiserfs_write_unlock(inode->i_sb);
+			if (err)
+				return err;
+		}
 
-    // Now we should start real work
+		return result;
+	}
 
-    /* If we are going to write past the file's packed tail or if we are going
-       to overwrite part of the tail, we need that tail to be converted into
-       unformatted node */
-    res = reiserfs_check_for_tail_and_convert( inode, pos, count);
-    if (res)
-	goto out;
+	if (unlikely((ssize_t) count < 0))
+		return -EINVAL;
+
+	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
+		return -EFAULT;
+
+	down(&inode->i_sem);	// locks the entire file for just us
+
+	pos = *ppos;
+
+	/* Check if we can write to specified region of file, file
+	   is not overly big and this kind of stuff. Adjust pos and
+	   count, if needed */
+	res = generic_write_checks(file, &pos, &count, 0);
+	if (res)
+		goto out;
+
+	if (count == 0)
+		goto out;
+
+	res = remove_suid(file->f_dentry);
+	if (res)
+		goto out;
+
+	inode_update_time(inode, 1);	/* Both mtime and ctime */
+
+	// Ok, we are done with all the checks.
+
+	// Now we should start real work
+
+	/* If we are going to write past the file's packed tail or if we are going
+	   to overwrite part of the tail, we need that tail to be converted into
+	   unformatted node */
+	res = reiserfs_check_for_tail_and_convert(inode, pos, count);
+	if (res)
+		goto out;
+
+	while (count > 0) {
+		/* This is the main loop, in which we keep running until some error
+		   occurs or until we have written all of the data. */
+		size_t num_pages;	/* amount of pages we are going to write this iteration */
+		size_t write_bytes;	/* amount of bytes to write during this iteration */
+		size_t blocks_to_allocate;	/* how much blocks we need to allocate for this iteration */
+
+		/*  (pos & (PAGE_CACHE_SIZE-1)) is an idiom for offset into a page of pos */
+		num_pages = !!((pos + count) & (PAGE_CACHE_SIZE - 1)) +	/* round up partial
+									   pages */
+		    ((count +
+		      (pos & (PAGE_CACHE_SIZE - 1))) >> PAGE_CACHE_SHIFT);
+		/* convert size to amount of
+		   pages */
+		reiserfs_write_lock(inode->i_sb);
+		if (num_pages > REISERFS_WRITE_PAGES_AT_A_TIME
+		    || num_pages > reiserfs_can_fit_pages(inode->i_sb)) {
+			/* If we were asked to write more data than we want to or if there
+			   is not that much space, then we shorten amount of data to write
+			   for this iteration. */
+			num_pages =
+			    min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME,
+				  reiserfs_can_fit_pages(inode->i_sb));
+			/* Also we should not forget to set size in bytes accordingly */
+			write_bytes = (num_pages << PAGE_CACHE_SHIFT) -
+			    (pos & (PAGE_CACHE_SIZE - 1));
+			/* If position is not on the
+			   start of the page, we need
+			   to subtract the offset
+			   within page */
+		} else
+			write_bytes = count;
+
+		/* reserve the blocks to be allocated later, so that later on
+		   we still have the space to write the blocks to */
+		reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
+						      num_pages <<
+						      (PAGE_CACHE_SHIFT -
+						       inode->i_blkbits));
+		reiserfs_write_unlock(inode->i_sb);
+
+		if (!num_pages) {	/* If we do not have enough space even for a single page... */
+			if (pos >
+			    inode->i_size + inode->i_sb->s_blocksize -
+			    (pos & (inode->i_sb->s_blocksize - 1))) {
+				res = -ENOSPC;
+				break;	// In case we are writing past the end of the last file block, break.
+			}
+			// Otherwise we are possibly overwriting the file, so
+			// let's set write size to be equal or less than blocksize.
+			// This way we get it correctly for file holes.
+			// But overwriting files on absolutely full volumes would not
+			// be very efficient. Well, people are not supposed to fill
+			// 100% of disk space anyway.
+			write_bytes =
+			    min_t(size_t, count,
+				  inode->i_sb->s_blocksize -
+				  (pos & (inode->i_sb->s_blocksize - 1)));
+			num_pages = 1;
+			// No blocks were claimed before, so do it now.
+			reiserfs_claim_blocks_to_be_allocated(inode->i_sb,
+							      1 <<
+							      (PAGE_CACHE_SHIFT
+							       -
+							       inode->
+							       i_blkbits));
+		}
 
-    while ( count > 0) {
-	/* This is the main loop in which we running until some error occures
-	   or until we write all of the data. */
-	size_t num_pages;/* amount of pages we are going to write this iteration */
-	size_t write_bytes; /* amount of bytes to write during this iteration */
-	size_t blocks_to_allocate; /* how much blocks we need to allocate for this iteration */
-        
-        /*  (pos & (PAGE_CACHE_SIZE-1)) is an idiom for offset into a page of pos*/
-	num_pages = !!((pos+count) & (PAGE_CACHE_SIZE - 1)) + /* round up partial
-							  pages */
-		    ((count + (pos & (PAGE_CACHE_SIZE-1))) >> PAGE_CACHE_SHIFT); 
-						/* convert size to amount of
-						   pages */
-	reiserfs_write_lock(inode->i_sb);
-	if ( num_pages > REISERFS_WRITE_PAGES_AT_A_TIME 
-		|| num_pages > reiserfs_can_fit_pages(inode->i_sb) ) {
-	    /* If we were asked to write more data than we want to or if there
-	       is not that much space, then we shorten amount of data to write
-	       for this iteration. */
-	    num_pages = min_t(size_t, REISERFS_WRITE_PAGES_AT_A_TIME, reiserfs_can_fit_pages(inode->i_sb));
-	    /* Also we should not forget to set size in bytes accordingly */
-	    write_bytes = (num_pages << PAGE_CACHE_SHIFT) - 
-			    (pos & (PAGE_CACHE_SIZE-1));
-					 /* If position is not on the
-					    start of the page, we need
-					    to substract the offset
-					    within page */
-	} else
-	    write_bytes = count;
+		/* Prepare for writing into the region, read in all the
+		   partially overwritten pages, if needed. And lock the pages,
+		   so that nobody else can access these until we are done.
+		   We get number of actual blocks needed as a result. */
+		blocks_to_allocate =
+		    reiserfs_prepare_file_region_for_write(inode, pos,
+							   num_pages,
+							   write_bytes,
+							   prepared_pages);
+		if (blocks_to_allocate < 0) {
+			res = blocks_to_allocate;
+			reiserfs_release_claimed_blocks(inode->i_sb,
+							num_pages <<
+							(PAGE_CACHE_SHIFT -
+							 inode->i_blkbits));
+			break;
+		}
 
-	/* reserve the blocks to be allocated later, so that later on
-	   we still have the space to write the blocks to */
-	reiserfs_claim_blocks_to_be_allocated(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits));
-	reiserfs_write_unlock(inode->i_sb);
+		/* First we correct our estimate of how many blocks we need */
+		reiserfs_release_claimed_blocks(inode->i_sb,
+						(num_pages <<
+						 (PAGE_CACHE_SHIFT -
+						  inode->i_sb->
+						  s_blocksize_bits)) -
+						blocks_to_allocate);
+
+		if (blocks_to_allocate > 0) {	/*We only allocate blocks if we need to */
+			/* Fill in all the possible holes and append the file if needed */
+			res =
+			    reiserfs_allocate_blocks_for_region(&th, inode, pos,
+								num_pages,
+								write_bytes,
+								prepared_pages,
+								blocks_to_allocate);
+		}
 
-	if ( !num_pages ) { /* If we do not have enough space even for a single page... */
-	    if ( pos > inode->i_size+inode->i_sb->s_blocksize-(pos & (inode->i_sb->s_blocksize-1))) {
-		res = -ENOSPC;
-		break; // In case we are writing past the end of the last file block, break.
-	    }
-	    // Otherwise we are possibly overwriting the file, so
-	    // let's set write size to be equal or less than blocksize.
-	    // This way we get it correctly for file holes.
-	    // But overwriting files on absolutelly full volumes would not
-	    // be very efficient. Well, people are not supposed to fill
-	    // 100% of disk space anyway.
-	    write_bytes = min_t(size_t, count, inode->i_sb->s_blocksize - (pos & (inode->i_sb->s_blocksize - 1)));
-	    num_pages = 1;
-	    // No blocks were claimed before, so do it now.
-	    reiserfs_claim_blocks_to_be_allocated(inode->i_sb, 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits));
-	}
+		/* well, we have allocated the blocks, so it is time to free
+		   the reservation we made earlier. */
+		reiserfs_release_claimed_blocks(inode->i_sb,
+						blocks_to_allocate);
+		if (res) {
+			reiserfs_unprepare_pages(prepared_pages, num_pages);
+			break;
+		}
 
-	/* Prepare for writing into the region, read in all the
-	   partially overwritten pages, if needed. And lock the pages,
-	   so that nobody else can access these until we are done.
-	   We get number of actual blocks needed as a result.*/
-	blocks_to_allocate = reiserfs_prepare_file_region_for_write(inode, pos, num_pages, write_bytes, prepared_pages);
-	if ( blocks_to_allocate < 0 ) {
-	    res = blocks_to_allocate;
-	    reiserfs_release_claimed_blocks(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - inode->i_blkbits));
-	    break;
-	}
+/* NOTE that allocating blocks and filling blocks can be done in reverse order
+   and probably we would do that just to get rid of garbage in files after a
+   crash */
 
-	/* First we correct our estimate of how many blocks we need */
-	reiserfs_release_claimed_blocks(inode->i_sb, (num_pages << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits)) - blocks_to_allocate );
+		/* Copy data from user-supplied buffer to file's pages */
+		res =
+		    reiserfs_copy_from_user_to_file_region(pos, num_pages,
+							   write_bytes,
+							   prepared_pages, buf);
+		if (res) {
+			reiserfs_unprepare_pages(prepared_pages, num_pages);
+			break;
+		}
 
-	if ( blocks_to_allocate > 0) {/*We only allocate blocks if we need to*/
-	    /* Fill in all the possible holes and append the file if needed */
-	    res = reiserfs_allocate_blocks_for_region(&th, inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate);
+		/* Send the pages to disk and unlock them. */
+		res =
+		    reiserfs_submit_file_region_for_write(&th, inode, pos,
+							  num_pages,
+							  write_bytes,
+							  prepared_pages);
+		if (res)
+			break;
+
+		already_written += write_bytes;
+		buf += write_bytes;
+		*ppos = pos += write_bytes;
+		count -= write_bytes;
+		balance_dirty_pages_ratelimited(inode->i_mapping);
 	}
 
-	/* well, we have allocated the blocks, so it is time to free
-	   the reservation we made earlier. */
-	reiserfs_release_claimed_blocks(inode->i_sb, blocks_to_allocate);
-	if ( res ) {
-	    reiserfs_unprepare_pages(prepared_pages, num_pages);
-	    break;
+	/* this is only true on error */
+	if (th.t_trans_id) {
+		reiserfs_write_lock(inode->i_sb);
+		err = journal_end(&th, th.t_super, th.t_blocks_allocated);
+		reiserfs_write_unlock(inode->i_sb);
+		if (err) {
+			res = err;
+			goto out;
+		}
 	}
 
-/* NOTE that allocating blocks and filling blocks can be done in reverse order
-   and probably we would do that just to get rid of garbage in files after a
-   crash */
+	if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
+		res =
+		    generic_osync_inode(inode, file->f_mapping,
+					OSYNC_METADATA | OSYNC_DATA);
 
-	/* Copy data from user-supplied buffer to file's pages */
-	res = reiserfs_copy_from_user_to_file_region(pos, num_pages, write_bytes, prepared_pages, buf);
-	if ( res ) {
-	    reiserfs_unprepare_pages(prepared_pages, num_pages);
-	    break;
-	}
+	up(&inode->i_sem);
+	reiserfs_async_progress_wait(inode->i_sb);
+	return (already_written != 0) ? already_written : res;
 
-	/* Send the pages to disk and unlock them. */
-	res = reiserfs_submit_file_region_for_write(&th, inode, pos, num_pages,
-	                                            write_bytes,prepared_pages);
-	if ( res )
-	    break;
-
-	already_written += write_bytes;
-	buf += write_bytes;
-	*ppos = pos += write_bytes;
-	count -= write_bytes;
-	balance_dirty_pages_ratelimited(inode->i_mapping);
-    }
-
-    /* this is only true on error */
-    if (th.t_trans_id) {
-        reiserfs_write_lock(inode->i_sb);
-        err = journal_end(&th, th.t_super, th.t_blocks_allocated);
-        reiserfs_write_unlock(inode->i_sb);
-        if (err) {
-            res = err;
-            goto out;
-        }
-    }
-
-    if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
-	res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA);
-
-    up(&inode->i_sem);
-    reiserfs_async_progress_wait(inode->i_sb);
-    return (already_written != 0)?already_written:res;
-
-out:
-    up(&inode->i_sem); // unlock the file on exit.
-    return res;
+      out:
+	up(&inode->i_sem);	// unlock the file on exit.
+	return res;
 }
 
-static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user *buf,
-			       size_t count, loff_t pos)
+static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf,
+				  size_t count, loff_t pos)
 {
-    return generic_file_aio_write(iocb, buf, count, pos);
+	return generic_file_aio_write(iocb, buf, count, pos);
 }
 
-
-
 struct file_operations reiserfs_file_operations = {
-    .read	= generic_file_read,
-    .write	= reiserfs_file_write,
-    .ioctl	= reiserfs_ioctl,
-    .mmap	= generic_file_mmap,
-    .release	= reiserfs_file_release,
-    .fsync	= reiserfs_sync_file,
-    .sendfile	= generic_file_sendfile,
-    .aio_read   = generic_file_aio_read,
-    .aio_write  = reiserfs_aio_write,
+	.read = generic_file_read,
+	.write = reiserfs_file_write,
+	.ioctl = reiserfs_ioctl,
+	.mmap = generic_file_mmap,
+	.release = reiserfs_file_release,
+	.fsync = reiserfs_sync_file,
+	.sendfile = generic_file_sendfile,
+	.aio_read = generic_file_aio_read,
+	.aio_write = reiserfs_aio_write,
 };
 
-
-struct  inode_operations reiserfs_file_inode_operations = {
-    .truncate	= reiserfs_vfs_truncate_file,
-    .setattr    = reiserfs_setattr,
-    .setxattr   = reiserfs_setxattr,
-    .getxattr   = reiserfs_getxattr,
-    .listxattr  = reiserfs_listxattr,
-    .removexattr = reiserfs_removexattr,
-    .permission = reiserfs_permission,
+struct inode_operations reiserfs_file_inode_operations = {
+	.truncate = reiserfs_vfs_truncate_file,
+	.setattr = reiserfs_setattr,
+	.setxattr = reiserfs_setxattr,
+	.getxattr = reiserfs_getxattr,
+	.listxattr = reiserfs_listxattr,
+	.removexattr = reiserfs_removexattr,
+	.permission = reiserfs_permission,
 };
-
-
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index e4f64be9e15b..2706e2adffab 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -34,14 +34,12 @@
  ** 
  **/
 
-
 #include <linux/config.h>
 #include <linux/time.h>
 #include <linux/string.h>
 #include <linux/reiserfs_fs.h>
 #include <linux/buffer_head.h>
 
-
 /* To make any changes in the tree we find a node, that contains item
    to be changed/deleted or position in the node we insert a new item
    to. We call this node S. To do balancing we need to decide what we
@@ -56,490 +54,522 @@
    have to have if we do not any shiftings, if we shift to left/right
    neighbor or to both. */
 
-
 /* taking item number in virtual node, returns number of item, that it has in source buffer */
-static inline int old_item_num (int new_num, int affected_item_num, int mode)
+static inline int old_item_num(int new_num, int affected_item_num, int mode)
 {
-  if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
-    return new_num;
+	if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
+		return new_num;
 
-  if (mode == M_INSERT) {
+	if (mode == M_INSERT) {
 
-    RFALSE( new_num == 0, 
-	    "vs-8005: for INSERT mode and item number of inserted item");
+		RFALSE(new_num == 0,
+		       "vs-8005: for INSERT mode and item number of inserted item");
 
-    return new_num - 1;
-  }
+		return new_num - 1;
+	}
 
-  RFALSE( mode != M_DELETE,
-	  "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", mode);
-  /* delete mode */
-  return new_num + 1;
+	RFALSE(mode != M_DELETE,
+	       "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'",
+	       mode);
+	/* delete mode */
+	return new_num + 1;
 }
 
-static void create_virtual_node (struct tree_balance * tb, int h)
+static void create_virtual_node(struct tree_balance *tb, int h)
 {
-    struct item_head * ih;
-    struct virtual_node * vn = tb->tb_vn;
-    int new_num;
-    struct buffer_head * Sh;	/* this comes from tb->S[h] */
+	struct item_head *ih;
+	struct virtual_node *vn = tb->tb_vn;
+	int new_num;
+	struct buffer_head *Sh;	/* this comes from tb->S[h] */
 
-    Sh = PATH_H_PBUFFER (tb->tb_path, h);
+	Sh = PATH_H_PBUFFER(tb->tb_path, h);
 
-    /* size of changed node */
-    vn->vn_size = MAX_CHILD_SIZE (Sh) - B_FREE_SPACE (Sh) + tb->insert_size[h];
+	/* size of changed node */
+	vn->vn_size =
+	    MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h];
 
-    /* for internal nodes array if virtual items is not created */
-    if (h) {
-	vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
-	return;
-    }
-
-    /* number of items in virtual node  */
-    vn->vn_nr_item = B_NR_ITEMS (Sh) + ((vn->vn_mode == M_INSERT)? 1 : 0) - ((vn->vn_mode == M_DELETE)? 1 : 0);
-
-    /* first virtual item */
-    vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
-    memset (vn->vn_vi, 0, vn->vn_nr_item * sizeof (struct virtual_item));
-    vn->vn_free_ptr += vn->vn_nr_item * sizeof (struct virtual_item);
-
-
-    /* first item in the node */
-    ih = B_N_PITEM_HEAD (Sh, 0);
-
-    /* define the mergeability for 0-th item (if it is not being deleted) */
-    if (op_is_left_mergeable (&(ih->ih_key), Sh->b_size) && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
-	    vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
-
-    /* go through all items those remain in the virtual node (except for the new (inserted) one) */
-    for (new_num = 0; new_num < vn->vn_nr_item; new_num ++) {
-	int j;
-	struct virtual_item * vi = vn->vn_vi + new_num;
-	int is_affected = ((new_num != vn->vn_affected_item_num) ? 0 : 1);
-    
-
-	if (is_affected && vn->vn_mode == M_INSERT)
-	    continue;
-    
-	/* get item number in source node */
-	j = old_item_num (new_num, vn->vn_affected_item_num, vn->vn_mode);
-    
-	vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
-	vi->vi_ih = ih + j;
-	vi->vi_item = B_I_PITEM (Sh, ih + j);
-	vi->vi_uarea = vn->vn_free_ptr;
-
-	// FIXME: there is no check, that item operation did not
-	// consume too much memory
-	vn->vn_free_ptr += op_create_vi (vn, vi, is_affected, tb->insert_size [0]);
-	if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
-	    reiserfs_panic (tb->tb_sb, "vs-8030: create_virtual_node: "
-			    "virtual node space consumed");
-
-	if (!is_affected)
-	    /* this is not being changed */
-	    continue;
-    
-	if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
-	    vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
-	    vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted
+	/* for internal nodes the array of virtual items is not created */
+	if (h) {
+		vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
+		return;
 	}
-    }
-
-  
-    /* virtual inserted item is not defined yet */
-    if (vn->vn_mode == M_INSERT) {
-	struct virtual_item * vi = vn->vn_vi + vn->vn_affected_item_num;
-      
-	RFALSE( vn->vn_ins_ih == 0,
-		"vs-8040: item header of inserted item is not specified");
-	vi->vi_item_len = tb->insert_size[0];
-	vi->vi_ih = vn->vn_ins_ih;
-	vi->vi_item = vn->vn_data;
-	vi->vi_uarea = vn->vn_free_ptr;
-	
-	op_create_vi (vn, vi, 0/*not pasted or cut*/, tb->insert_size [0]);
-    }
-  
-    /* set right merge flag we take right delimiting key and check whether it is a mergeable item */
-    if (tb->CFR[0]) {
-	struct reiserfs_key * key;
-
-	key = B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]);
-	if (op_is_left_mergeable (key, Sh->b_size) && (vn->vn_mode != M_DELETE ||
-						       vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1))
-		vn->vn_vi[vn->vn_nr_item-1].vi_type |= VI_TYPE_RIGHT_MERGEABLE;
 
-#ifdef CONFIG_REISERFS_CHECK
-	if (op_is_left_mergeable (key, Sh->b_size) &&
-	    !(vn->vn_mode != M_DELETE || vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1) ) {
-	    /* we delete last item and it could be merged with right neighbor's first item */
-	    if (!(B_NR_ITEMS (Sh) == 1 && is_direntry_le_ih (B_N_PITEM_HEAD (Sh, 0)) &&
-		  I_ENTRY_COUNT (B_N_PITEM_HEAD (Sh, 0)) == 1)) {
-		/* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
-		print_block (Sh, 0, -1, -1);
-		reiserfs_panic (tb->tb_sb, "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c", 
-				key, vn->vn_affected_item_num, vn->vn_mode, M_DELETE);
-	    } else
-		/* we can delete directory item, that has only one directory entry in it */
-		;
+	/* number of items in virtual node  */
+	vn->vn_nr_item =
+	    B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) -
+	    ((vn->vn_mode == M_DELETE) ? 1 : 0);
+
+	/* first virtual item */
+	vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
+	memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item));
+	vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);
+
+	/* first item in the node */
+	ih = B_N_PITEM_HEAD(Sh, 0);
+
+	/* define the mergeability for 0-th item (if it is not being deleted) */
+	if (op_is_left_mergeable(&(ih->ih_key), Sh->b_size)
+	    && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
+		vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
+
+	/* go through all items that remain in the virtual node (except for the new (inserted) one) */
+	for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
+		int j;
+		struct virtual_item *vi = vn->vn_vi + new_num;
+		int is_affected =
+		    ((new_num != vn->vn_affected_item_num) ? 0 : 1);
+
+		if (is_affected && vn->vn_mode == M_INSERT)
+			continue;
+
+		/* get item number in source node */
+		j = old_item_num(new_num, vn->vn_affected_item_num,
+				 vn->vn_mode);
+
+		vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
+		vi->vi_ih = ih + j;
+		vi->vi_item = B_I_PITEM(Sh, ih + j);
+		vi->vi_uarea = vn->vn_free_ptr;
+
+		// FIXME: there is no check, that item operation did not
+		// consume too much memory
+		vn->vn_free_ptr +=
+		    op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
+		if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
+			reiserfs_panic(tb->tb_sb,
+				       "vs-8030: create_virtual_node: "
+				       "virtual node space consumed");
+
+		if (!is_affected)
+			/* this is not being changed */
+			continue;
+
+		if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
+			vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
+			vi->vi_new_data = vn->vn_data;	// pointer to data which is going to be pasted
+		}
 	}
+
+	/* virtual inserted item is not defined yet */
+	if (vn->vn_mode == M_INSERT) {
+		struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;
+
+		RFALSE(vn->vn_ins_ih == 0,
+		       "vs-8040: item header of inserted item is not specified");
+		vi->vi_item_len = tb->insert_size[0];
+		vi->vi_ih = vn->vn_ins_ih;
+		vi->vi_item = vn->vn_data;
+		vi->vi_uarea = vn->vn_free_ptr;
+
+		op_create_vi(vn, vi, 0 /*not pasted or cut */ ,
+			     tb->insert_size[0]);
+	}
+
+	/* set right merge flag: we take the right delimiting key and check whether it is a mergeable item */
+	if (tb->CFR[0]) {
+		struct reiserfs_key *key;
+
+		key = B_N_PDELIM_KEY(tb->CFR[0], tb->rkey[0]);
+		if (op_is_left_mergeable(key, Sh->b_size)
+		    && (vn->vn_mode != M_DELETE
+			|| vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
+			vn->vn_vi[vn->vn_nr_item - 1].vi_type |=
+			    VI_TYPE_RIGHT_MERGEABLE;
+
+#ifdef CONFIG_REISERFS_CHECK
+		if (op_is_left_mergeable(key, Sh->b_size) &&
+		    !(vn->vn_mode != M_DELETE
+		      || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
+			/* we delete last item and it could be merged with right neighbor's first item */
+			if (!
+			    (B_NR_ITEMS(Sh) == 1
+			     && is_direntry_le_ih(B_N_PITEM_HEAD(Sh, 0))
+			     && I_ENTRY_COUNT(B_N_PITEM_HEAD(Sh, 0)) == 1)) {
+				/* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */
+				print_block(Sh, 0, -1, -1);
+				reiserfs_panic(tb->tb_sb,
+					       "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c",
+					       key, vn->vn_affected_item_num,
+					       vn->vn_mode, M_DELETE);
+			} else
+				/* we can delete directory item, that has only one directory entry in it */
+				;
+		}
 #endif
-    
-    }
-}
 
+	}
+}
 
 /* using virtual node check, how many items can be shifted to left
    neighbor */
-static void check_left (struct tree_balance * tb, int h, int cur_free)
+static void check_left(struct tree_balance *tb, int h, int cur_free)
 {
-    int i;
-    struct virtual_node * vn = tb->tb_vn;
-    struct virtual_item * vi;
-    int d_size, ih_size;
+	int i;
+	struct virtual_node *vn = tb->tb_vn;
+	struct virtual_item *vi;
+	int d_size, ih_size;
 
-    RFALSE( cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
+	RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
 
-    /* internal level */
-    if (h > 0) {	
-	tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
-	return;
-    }
+	/* internal level */
+	if (h > 0) {
+		tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
+		return;
+	}
 
-    /* leaf level */
+	/* leaf level */
 
-    if (!cur_free || !vn->vn_nr_item) {
-	/* no free space or nothing to move */
-	tb->lnum[h] = 0;
-	tb->lbytes = -1;
-	return;
-    }
+	if (!cur_free || !vn->vn_nr_item) {
+		/* no free space or nothing to move */
+		tb->lnum[h] = 0;
+		tb->lbytes = -1;
+		return;
+	}
 
-    RFALSE( !PATH_H_PPARENT (tb->tb_path, 0),
-	    "vs-8055: parent does not exist or invalid");
+	RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
+	       "vs-8055: parent does not exist or invalid");
 
-    vi = vn->vn_vi;
-    if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
-	/* all contents of S[0] fits into L[0] */
+	vi = vn->vn_vi;
+	if ((unsigned int)cur_free >=
+	    (vn->vn_size -
+	     ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
+		/* all contents of S[0] fits into L[0] */
 
-	RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
-		"vs-8055: invalid mode or balance condition failed");
+		RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
+		       "vs-8055: invalid mode or balance condition failed");
 
-	tb->lnum[0] = vn->vn_nr_item;
-	tb->lbytes = -1;
-	return;
-    }
-  
-
-    d_size = 0, ih_size = IH_SIZE;
-
-    /* first item may be merge with last item in left neighbor */
-    if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
-	d_size = -((int)IH_SIZE), ih_size = 0;
-
-    tb->lnum[0] = 0;
-    for (i = 0; i < vn->vn_nr_item; i ++, ih_size = IH_SIZE, d_size = 0, vi ++) {
-	d_size += vi->vi_item_len;
-	if (cur_free >= d_size) {	
-	    /* the item can be shifted entirely */
-	    cur_free -= d_size;
-	    tb->lnum[0] ++;
-	    continue;
+		tb->lnum[0] = vn->vn_nr_item;
+		tb->lbytes = -1;
+		return;
 	}
-      
-	/* the item cannot be shifted entirely, try to split it */
-	/* check whether L[0] can hold ih and at least one byte of the item body */
-	if (cur_free <= ih_size) {
-	    /* cannot shift even a part of the current item */
-	    tb->lbytes = -1;
-	    return;
+
+	d_size = 0, ih_size = IH_SIZE;
+
+	/* first item may be merged with last item in left neighbor */
+	if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
+		d_size = -((int)IH_SIZE), ih_size = 0;
+
+	tb->lnum[0] = 0;
+	for (i = 0; i < vn->vn_nr_item;
+	     i++, ih_size = IH_SIZE, d_size = 0, vi++) {
+		d_size += vi->vi_item_len;
+		if (cur_free >= d_size) {
+			/* the item can be shifted entirely */
+			cur_free -= d_size;
+			tb->lnum[0]++;
+			continue;
+		}
+
+		/* the item cannot be shifted entirely, try to split it */
+		/* check whether L[0] can hold ih and at least one byte of the item body */
+		if (cur_free <= ih_size) {
+			/* cannot shift even a part of the current item */
+			tb->lbytes = -1;
+			return;
+		}
+		cur_free -= ih_size;
+
+		tb->lbytes = op_check_left(vi, cur_free, 0, 0);
+		if (tb->lbytes != -1)
+			/* count partially shifted item */
+			tb->lnum[0]++;
+
+		break;
 	}
-	cur_free -= ih_size;
-    
-	tb->lbytes = op_check_left (vi, cur_free, 0, 0);
-	if (tb->lbytes != -1)
-	    /* count partially shifted item */
-	    tb->lnum[0] ++;
-    
-	break;
-    }
-  
-    return;
-}
 
+	return;
+}
 
 /* using virtual node check, how many items can be shifted to right
    neighbor */
-static void check_right (struct tree_balance * tb, int h, int cur_free)
+static void check_right(struct tree_balance *tb, int h, int cur_free)
 {
-    int i;
-    struct virtual_node * vn = tb->tb_vn;
-    struct virtual_item * vi;
-    int d_size, ih_size;
-
-    RFALSE( cur_free < 0, "vs-8070: cur_free < 0");
-    
-    /* internal level */
-    if (h > 0) {
-	tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
-	return;
-    }
-    
-    /* leaf level */
-    
-    if (!cur_free || !vn->vn_nr_item) {
-	/* no free space  */
-	tb->rnum[h] = 0;
-	tb->rbytes = -1;
-	return;
-    }
-  
-    RFALSE( !PATH_H_PPARENT (tb->tb_path, 0),
-	    "vs-8075: parent does not exist or invalid");
-  
-    vi = vn->vn_vi + vn->vn_nr_item - 1;
-    if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
-	/* all contents of S[0] fits into R[0] */
-	
-	RFALSE( vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
-		"vs-8080: invalid mode or balance condition failed");
-
-	tb->rnum[h] = vn->vn_nr_item;
-	tb->rbytes = -1;
-	return;
-    }
-    
-    d_size = 0, ih_size = IH_SIZE;
-    
-    /* last item may be merge with first item in right neighbor */
-    if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
-	d_size = -(int)IH_SIZE, ih_size = 0;
-
-    tb->rnum[0] = 0;
-    for (i = vn->vn_nr_item - 1; i >= 0; i --, d_size = 0, ih_size = IH_SIZE, vi --) {
-	d_size += vi->vi_item_len;
-	if (cur_free >= d_size) {	
-	    /* the item can be shifted entirely */
-	    cur_free -= d_size;
-	    tb->rnum[0] ++;
-	    continue;
+	int i;
+	struct virtual_node *vn = tb->tb_vn;
+	struct virtual_item *vi;
+	int d_size, ih_size;
+
+	RFALSE(cur_free < 0, "vs-8070: cur_free < 0");
+
+	/* internal level */
+	if (h > 0) {
+		tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
+		return;
 	}
-	
-	/* check whether R[0] can hold ih and at least one byte of the item body */
-	if ( cur_free <= ih_size ) {    /* cannot shift even a part of the current item */
-	    tb->rbytes = -1;
-	    return;
+
+	/* leaf level */
+
+	if (!cur_free || !vn->vn_nr_item) {
+		/* no free space  */
+		tb->rnum[h] = 0;
+		tb->rbytes = -1;
+		return;
 	}
-	
-	/* R[0] can hold the header of the item and at least one byte of its body */
-	cur_free -= ih_size;	/* cur_free is still > 0 */
-
-	tb->rbytes = op_check_right (vi, cur_free);
-	if (tb->rbytes != -1)
-	    /* count partially shifted item */
-	    tb->rnum[0] ++;
-    
-	break;
-    }
-	
-  return;
-}
 
+	RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
+	       "vs-8075: parent does not exist or invalid");
+
+	vi = vn->vn_vi + vn->vn_nr_item - 1;
+	if ((unsigned int)cur_free >=
+	    (vn->vn_size -
+	     ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
+		/* all contents of S[0] fits into R[0] */
+
+		RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
+		       "vs-8080: invalid mode or balance condition failed");
+
+		tb->rnum[h] = vn->vn_nr_item;
+		tb->rbytes = -1;
+		return;
+	}
+
+	d_size = 0, ih_size = IH_SIZE;
+
+	/* last item may be merged with first item in right neighbor */
+	if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
+		d_size = -(int)IH_SIZE, ih_size = 0;
+
+	tb->rnum[0] = 0;
+	for (i = vn->vn_nr_item - 1; i >= 0;
+	     i--, d_size = 0, ih_size = IH_SIZE, vi--) {
+		d_size += vi->vi_item_len;
+		if (cur_free >= d_size) {
+			/* the item can be shifted entirely */
+			cur_free -= d_size;
+			tb->rnum[0]++;
+			continue;
+		}
+
+		/* check whether R[0] can hold ih and at least one byte of the item body */
+		if (cur_free <= ih_size) {	/* cannot shift even a part of the current item */
+			tb->rbytes = -1;
+			return;
+		}
+
+		/* R[0] can hold the header of the item and at least one byte of its body */
+		cur_free -= ih_size;	/* cur_free is still > 0 */
+
+		tb->rbytes = op_check_right(vi, cur_free);
+		if (tb->rbytes != -1)
+			/* count partially shifted item */
+			tb->rnum[0]++;
+
+		break;
+	}
+
+	return;
+}
 
 /*
  * from - number of items, which are shifted to left neighbor entirely
  * to - number of item, which are shifted to right neighbor entirely
  * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor
  * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */
-static int get_num_ver (int mode, struct tree_balance * tb, int h,
-			int from, int from_bytes,
-			int to,   int to_bytes,
-			short * snum012, int flow
-    )
+static int get_num_ver(int mode, struct tree_balance *tb, int h,
+		       int from, int from_bytes,
+		       int to, int to_bytes, short *snum012, int flow)
 {
-    int i;
-    int cur_free;
-    //    int bytes;
-    int units;
-    struct virtual_node * vn = tb->tb_vn;
-    //    struct virtual_item * vi;
-
-    int total_node_size, max_node_size, current_item_size;
-    int needed_nodes;
-    int start_item, 	/* position of item we start filling node from */
-	end_item,	/* position of item we finish filling node by */
-	start_bytes,/* number of first bytes (entries for directory) of start_item-th item 
-		       we do not include into node that is being filled */
-	end_bytes;	/* number of last bytes (entries for directory) of end_item-th item 
-			   we do node include into node that is being filled */
-    int split_item_positions[2]; /* these are positions in virtual item of
-				    items, that are split between S[0] and
-				    S1new and S1new and S2new */
-
-    split_item_positions[0] = -1;
-    split_item_positions[1] = -1;
-
-    /* We only create additional nodes if we are in insert or paste mode
-       or we are in replace mode at the internal level. If h is 0 and
-       the mode is M_REPLACE then in fix_nodes we change the mode to
-       paste or insert before we get here in the code.  */
-    RFALSE( tb->insert_size[h] < 0  || (mode != M_INSERT && mode != M_PASTE),
-	    "vs-8100: insert_size < 0 in overflow");
-
-    max_node_size = MAX_CHILD_SIZE (PATH_H_PBUFFER (tb->tb_path, h));
-
-    /* snum012 [0-2] - number of items, that lay
-       to S[0], first new node and second new node */
-    snum012[3] = -1;	/* s1bytes */
-    snum012[4] = -1;	/* s2bytes */
-
-    /* internal level */
-    if (h > 0) {
-	i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
-	if (i == max_node_size)
-	    return 1;
-	return (i / max_node_size + 1);
-    }
-
-    /* leaf level */
-    needed_nodes = 1;
-    total_node_size = 0;
-    cur_free = max_node_size;
-
-    // start from 'from'-th item
-    start_item = from;
-    // skip its first 'start_bytes' units
-    start_bytes = ((from_bytes != -1) ? from_bytes : 0);
-
-    // last included item is the 'end_item'-th one
-    end_item = vn->vn_nr_item - to - 1;
-    // do not count last 'end_bytes' units of 'end_item'-th item
-    end_bytes = (to_bytes != -1) ? to_bytes : 0;
-
-    /* go through all item beginning from the start_item-th item and ending by
-       the end_item-th item. Do not count first 'start_bytes' units of
-       'start_item'-th item and last 'end_bytes' of 'end_item'-th item */
-    
-    for (i = start_item; i <= end_item; i ++) {
-	struct virtual_item * vi = vn->vn_vi + i;
-	int skip_from_end = ((i == end_item) ? end_bytes : 0);
-
-	RFALSE( needed_nodes > 3, "vs-8105: too many nodes are needed");
-
-	/* get size of current item */
-	current_item_size = vi->vi_item_len;
-
-	/* do not take in calculation head part (from_bytes) of from-th item */
-	current_item_size -= op_part_size (vi, 0/*from start*/, start_bytes);
-
-	/* do not take in calculation tail part of last item */
-	current_item_size -= op_part_size (vi, 1/*from end*/, skip_from_end);
-
-	/* if item fits into current node entierly */
-	if (total_node_size + current_item_size <= max_node_size) {
-	    snum012[needed_nodes - 1] ++;
-	    total_node_size += current_item_size;
-	    start_bytes = 0;
-	    continue;
+	int i;
+	int cur_free;
+	//    int bytes;
+	int units;
+	struct virtual_node *vn = tb->tb_vn;
+	//    struct virtual_item * vi;
+
+	int total_node_size, max_node_size, current_item_size;
+	int needed_nodes;
+	int start_item,		/* position of item we start filling node from */
+	 end_item,		/* position of item we finish filling node by */
+	 start_bytes,		/* number of first bytes (entries for directory) of start_item-th item 
+				   we do not include into node that is being filled */
+	 end_bytes;		/* number of last bytes (entries for directory) of end_item-th item 
+				   we do not include into node that is being filled */
+	int split_item_positions[2];	/* these are positions in virtual item of
+					   items, that are split between S[0] and
+					   S1new and S1new and S2new */
+
+	split_item_positions[0] = -1;
+	split_item_positions[1] = -1;
+
+	/* We only create additional nodes if we are in insert or paste mode
+	   or we are in replace mode at the internal level. If h is 0 and
+	   the mode is M_REPLACE then in fix_nodes we change the mode to
+	   paste or insert before we get here in the code.  */
+	RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
+	       "vs-8100: insert_size < 0 in overflow");
+
+	max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));
+
+	/* snum012 [0-2] - number of items, that lay
+	   to S[0], first new node and second new node */
+	snum012[3] = -1;	/* s1bytes */
+	snum012[4] = -1;	/* s2bytes */
+
+	/* internal level */
+	if (h > 0) {
+		i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
+		if (i == max_node_size)
+			return 1;
+		return (i / max_node_size + 1);
 	}
 
-	if (current_item_size > max_node_size) {
-	    /* virtual item length is longer, than max size of item in
-               a node. It is impossible for direct item */
-	    RFALSE( is_direct_le_ih (vi->vi_ih),
-		    "vs-8110: "
-		    "direct item length is %d. It can not be longer than %d",
-		    current_item_size, max_node_size);
-	    /* we will try to split it */
-	    flow = 1;
+	/* leaf level */
+	needed_nodes = 1;
+	total_node_size = 0;
+	cur_free = max_node_size;
+
+	// start from 'from'-th item
+	start_item = from;
+	// skip its first 'start_bytes' units
+	start_bytes = ((from_bytes != -1) ? from_bytes : 0);
+
+	// last included item is the 'end_item'-th one
+	end_item = vn->vn_nr_item - to - 1;
+	// do not count last 'end_bytes' units of 'end_item'-th item
+	end_bytes = (to_bytes != -1) ? to_bytes : 0;
+
+	/* go through all item beginning from the start_item-th item and ending by
+	   the end_item-th item. Do not count first 'start_bytes' units of
+	   'start_item'-th item and last 'end_bytes' of 'end_item'-th item */
+
+	for (i = start_item; i <= end_item; i++) {
+		struct virtual_item *vi = vn->vn_vi + i;
+		int skip_from_end = ((i == end_item) ? end_bytes : 0);
+
+		RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed");
+
+		/* get size of current item */
+		current_item_size = vi->vi_item_len;
+
+		/* do not take in calculation head part (from_bytes) of from-th item */
+		current_item_size -=
+		    op_part_size(vi, 0 /*from start */ , start_bytes);
+
+		/* do not take in calculation tail part of last item */
+		current_item_size -=
+		    op_part_size(vi, 1 /*from end */ , skip_from_end);
+
+		/* if item fits into current node entirely */
+		if (total_node_size + current_item_size <= max_node_size) {
+			snum012[needed_nodes - 1]++;
+			total_node_size += current_item_size;
+			start_bytes = 0;
+			continue;
+		}
+
+		if (current_item_size > max_node_size) {
+			/* virtual item length is longer, than max size of item in
+			   a node. It is impossible for direct item */
+			RFALSE(is_direct_le_ih(vi->vi_ih),
+			       "vs-8110: "
+			       "direct item length is %d. It can not be longer than %d",
+			       current_item_size, max_node_size);
+			/* we will try to split it */
+			flow = 1;
+		}
+
+		if (!flow) {
+			/* as we do not split items, take new node and continue */
+			needed_nodes++;
+			i--;
+			total_node_size = 0;
+			continue;
+		}
+		// calculate number of item units which fit into node being
+		// filled
+		{
+			int free_space;
+
+			free_space = max_node_size - total_node_size - IH_SIZE;
+			units =
+			    op_check_left(vi, free_space, start_bytes,
+					  skip_from_end);
+			if (units == -1) {
+				/* nothing fits into current node, take new node and continue */
+				needed_nodes++, i--, total_node_size = 0;
+				continue;
+			}
+		}
+
+		/* something fits into the current node */
+		//if (snum012[3] != -1 || needed_nodes != 1)
+		//  reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required");
+		//snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units;
+		start_bytes += units;
+		snum012[needed_nodes - 1 + 3] = units;
+
+		if (needed_nodes > 2)
+			reiserfs_warning(tb->tb_sb, "vs-8111: get_num_ver: "
+					 "split_item_position is out of boundary");
+		snum012[needed_nodes - 1]++;
+		split_item_positions[needed_nodes - 1] = i;
+		needed_nodes++;
+		/* continue from the same item with start_bytes != -1 */
+		start_item = i;
+		i--;
+		total_node_size = 0;
 	}
 
-	if (!flow) {
-	    /* as we do not split items, take new node and continue */
-	    needed_nodes ++; i --; total_node_size = 0;
-	    continue;
+	// snum012[4] (if it is not -1) contains the number of units which
+	// are to be in S1new, snum012[3] - to be in S0. They are supposed
+	// to be S1bytes and S2bytes correspondingly, so recalculate
+	if (snum012[4] > 0) {
+		int split_item_num;
+		int bytes_to_r, bytes_to_l;
+		int bytes_to_S1new;
+
+		split_item_num = split_item_positions[1];
+		bytes_to_l =
+		    ((from == split_item_num
+		      && from_bytes != -1) ? from_bytes : 0);
+		bytes_to_r =
+		    ((end_item == split_item_num
+		      && end_bytes != -1) ? end_bytes : 0);
+		bytes_to_S1new =
+		    ((split_item_positions[0] ==
+		      split_item_positions[1]) ? snum012[3] : 0);
+
+		// s2bytes
+		snum012[4] =
+		    op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
+		    bytes_to_r - bytes_to_l - bytes_to_S1new;
+
+		if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
+		    vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
+			reiserfs_warning(tb->tb_sb, "vs-8115: get_num_ver: not "
+					 "directory or indirect item");
 	}
 
-	// calculate number of item units which fit into node being
-	// filled
-	{
-	    int free_space;
-
-	    free_space = max_node_size - total_node_size - IH_SIZE;
-	    units = op_check_left (vi, free_space, start_bytes, skip_from_end);
-	    if (units == -1) {
-		/* nothing fits into current node, take new node and continue */
-		needed_nodes ++, i--, total_node_size = 0;
-		continue;
-	    }
+	/* now we know S2bytes, calculate S1bytes */
+	if (snum012[3] > 0) {
+		int split_item_num;
+		int bytes_to_r, bytes_to_l;
+		int bytes_to_S2new;
+
+		split_item_num = split_item_positions[0];
+		bytes_to_l =
+		    ((from == split_item_num
+		      && from_bytes != -1) ? from_bytes : 0);
+		bytes_to_r =
+		    ((end_item == split_item_num
+		      && end_bytes != -1) ? end_bytes : 0);
+		bytes_to_S2new =
+		    ((split_item_positions[0] == split_item_positions[1]
+		      && snum012[4] != -1) ? snum012[4] : 0);
+
+		// s1bytes
+		snum012[3] =
+		    op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
+		    bytes_to_r - bytes_to_l - bytes_to_S2new;
 	}
 
-	/* something fits into the current node */
-	//if (snum012[3] != -1 || needed_nodes != 1)
-	//  reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required");
-	//snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units;
-	start_bytes += units;
-	snum012[needed_nodes - 1 + 3] = units;
-
-	if (needed_nodes > 2)
-	    reiserfs_warning (tb->tb_sb, "vs-8111: get_num_ver: "
-			      "split_item_position is out of boundary");
-	snum012[needed_nodes - 1] ++;
-	split_item_positions[needed_nodes - 1] = i;
-	needed_nodes ++;
-	/* continue from the same item with start_bytes != -1 */
-	start_item = i;
-	i --;
-	total_node_size = 0;
-    }
-
-    // sum012[4] (if it is not -1) contains number of units of which
-    // are to be in S1new, snum012[3] - to be in S0. They are supposed
-    // to be S1bytes and S2bytes correspondingly, so recalculate
-    if (snum012[4] > 0) {
-	int split_item_num;
-	int bytes_to_r, bytes_to_l;
-	int bytes_to_S1new;
-    
-	split_item_num = split_item_positions[1];
-	bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0);
-	bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0);
-	bytes_to_S1new = ((split_item_positions[0] == split_item_positions[1]) ? snum012[3] : 0);
-
-	// s2bytes
-	snum012[4] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[4] - bytes_to_r - bytes_to_l - bytes_to_S1new;
-
-	if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
-	    vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
-	    reiserfs_warning (tb->tb_sb, "vs-8115: get_num_ver: not "
-			      "directory or indirect item");
-    }
-
-    /* now we know S2bytes, calculate S1bytes */
-    if (snum012[3] > 0) {
-	int split_item_num;
-	int bytes_to_r, bytes_to_l;
-	int bytes_to_S2new;
-    
-	split_item_num = split_item_positions[0];
-	bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0);
-	bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0);
-	bytes_to_S2new = ((split_item_positions[0] == split_item_positions[1] && snum012[4] != -1) ? snum012[4] : 0);
-
-	// s1bytes
-	snum012[3] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[3] - bytes_to_r - bytes_to_l - bytes_to_S2new;
-    }
-    
-    return needed_nodes;
+	return needed_nodes;
 }
 
-
 #ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance * cur_tb;
+extern struct tree_balance *cur_tb;
 #endif
 
-
 /* Set parameters for balancing.
  * Performs write of results of analysis of balancing into structure tb,
  * where it will later be used by the functions that actually do the balancing. 
@@ -557,131 +587,130 @@ extern struct tree_balance * cur_tb;
  *	s1bytes	number of bytes which flow to the first  new node when S[0] splits (this number is contained in s012 array)
  */
 
-static void set_parameters (struct tree_balance * tb, int h, int lnum,
-			    int rnum, int blk_num, short * s012, int lb, int rb)
+static void set_parameters(struct tree_balance *tb, int h, int lnum,
+			   int rnum, int blk_num, short *s012, int lb, int rb)
 {
 
-  tb->lnum[h] = lnum;
-  tb->rnum[h] = rnum;
-  tb->blknum[h] = blk_num;
+	tb->lnum[h] = lnum;
+	tb->rnum[h] = rnum;
+	tb->blknum[h] = blk_num;
 
-  if (h == 0)
-    {  /* only for leaf level */
-      if (s012 != NULL)
-	{
-	  tb->s0num = * s012 ++,
-	  tb->s1num = * s012 ++,
-	  tb->s2num = * s012 ++;
-	  tb->s1bytes = * s012 ++;
-	  tb->s2bytes = * s012;
+	if (h == 0) {		/* only for leaf level */
+		if (s012 != NULL) {
+			tb->s0num = *s012++,
+			    tb->s1num = *s012++, tb->s2num = *s012++;
+			tb->s1bytes = *s012++;
+			tb->s2bytes = *s012;
+		}
+		tb->lbytes = lb;
+		tb->rbytes = rb;
 	}
-      tb->lbytes = lb;
-      tb->rbytes = rb;
-    }
-  PROC_INFO_ADD( tb -> tb_sb, lnum[ h ], lnum );
-  PROC_INFO_ADD( tb -> tb_sb, rnum[ h ], rnum );
-
-  PROC_INFO_ADD( tb -> tb_sb, lbytes[ h ], lb );
-  PROC_INFO_ADD( tb -> tb_sb, rbytes[ h ], rb );
-}
-
+	PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum);
+	PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum);
 
+	PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb);
+	PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
+}
 
 /* check, does node disappear if we shift tb->lnum[0] items to left
    neighbor and tb->rnum[0] to the right one. */
-static int is_leaf_removable (struct tree_balance * tb)
+static int is_leaf_removable(struct tree_balance *tb)
 {
-  struct virtual_node * vn = tb->tb_vn;
-  int to_left, to_right;
-  int size;
-  int remain_items;
-
-  /* number of items, that will be shifted to left (right) neighbor
-     entirely */
-  to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
-  to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
-  remain_items = vn->vn_nr_item;
-
-  /* how many items remain in S[0] after shiftings to neighbors */
-  remain_items -= (to_left + to_right);
-
-  if (remain_items < 1) {
-    /* all content of node can be shifted to neighbors */
-    set_parameters (tb, 0, to_left, vn->vn_nr_item - to_left, 0, NULL, -1, -1);    
-    return 1;
-  }
-  
-  if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
-    /* S[0] is not removable */
-    return 0;
-
-  /* check, whether we can divide 1 remaining item between neighbors */
-
-  /* get size of remaining item (in item units) */
-  size = op_unit_num (&(vn->vn_vi[to_left]));
-
-  if (tb->lbytes + tb->rbytes >= size) {
-    set_parameters (tb, 0, to_left + 1, to_right + 1, 0, NULL, tb->lbytes, -1);
-    return 1;
-  }
-
-  return 0;
-}
+	struct virtual_node *vn = tb->tb_vn;
+	int to_left, to_right;
+	int size;
+	int remain_items;
+
+	/* number of items, that will be shifted to left (right) neighbor
+	   entirely */
+	to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
+	to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
+	remain_items = vn->vn_nr_item;
+
+	/* how many items remain in S[0] after shifting to neighbors */
+	remain_items -= (to_left + to_right);
+
+	if (remain_items < 1) {
+		/* all content of node can be shifted to neighbors */
+		set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
+			       NULL, -1, -1);
+		return 1;
+	}
 
+	if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
+		/* S[0] is not removable */
+		return 0;
+
+	/* check whether we can divide the 1 remaining item between neighbors */
+
+	/* get size of remaining item (in item units) */
+	size = op_unit_num(&(vn->vn_vi[to_left]));
+
+	if (tb->lbytes + tb->rbytes >= size) {
+		set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
+			       tb->lbytes, -1);
+		return 1;
+	}
+
+	return 0;
+}
 
 /* check whether L, S, R can be joined in one node */
-static int are_leaves_removable (struct tree_balance * tb, int lfree, int rfree)
+static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
 {
-  struct virtual_node * vn = tb->tb_vn;
-  int ih_size;
-  struct buffer_head *S0;
-
-  S0 = PATH_H_PBUFFER (tb->tb_path, 0);
-
-  ih_size = 0;
-  if (vn->vn_nr_item) {
-    if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
-      ih_size += IH_SIZE;
-    
-	if (vn->vn_vi[vn->vn_nr_item-1].vi_type & VI_TYPE_RIGHT_MERGEABLE)
-	    ih_size += IH_SIZE;
-    } else {
-	/* there was only one item and it will be deleted */
-	struct item_head * ih;
-    
-    RFALSE( B_NR_ITEMS (S0) != 1,
-	    "vs-8125: item number must be 1: it is %d", B_NR_ITEMS(S0));
-
-    ih = B_N_PITEM_HEAD (S0, 0);
-    if (tb->CFR[0] && !comp_short_le_keys (&(ih->ih_key), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0])))
-	if (is_direntry_le_ih (ih)) {
-	    /* Directory must be in correct state here: that is
-	       somewhere at the left side should exist first directory
-	       item. But the item being deleted can not be that first
-	       one because its right neighbor is item of the same
-	       directory. (But first item always gets deleted in last
-	       turn). So, neighbors of deleted item can be merged, so
-	       we can save ih_size */
-	    ih_size = IH_SIZE;
-	    
-	    /* we might check that left neighbor exists and is of the
-	       same directory */
-	    RFALSE(le_ih_k_offset (ih) == DOT_OFFSET,
-		"vs-8130: first directory item can not be removed until directory is not empty");
-      }
-    
-  }
-
-  if (MAX_CHILD_SIZE (S0) + vn->vn_size <= rfree + lfree + ih_size) {
-    set_parameters (tb, 0, -1, -1, -1, NULL, -1, -1);
-    PROC_INFO_INC( tb -> tb_sb, leaves_removable );
-    return 1;  
-  }
-  return 0;
-  
-}
+	struct virtual_node *vn = tb->tb_vn;
+	int ih_size;
+	struct buffer_head *S0;
+
+	S0 = PATH_H_PBUFFER(tb->tb_path, 0);
+
+	ih_size = 0;
+	if (vn->vn_nr_item) {
+		if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
+			ih_size += IH_SIZE;
+
+		if (vn->vn_vi[vn->vn_nr_item - 1].
+		    vi_type & VI_TYPE_RIGHT_MERGEABLE)
+			ih_size += IH_SIZE;
+	} else {
+		/* there was only one item and it will be deleted */
+		struct item_head *ih;
+
+		RFALSE(B_NR_ITEMS(S0) != 1,
+		       "vs-8125: item number must be 1: it is %d",
+		       B_NR_ITEMS(S0));
+
+		ih = B_N_PITEM_HEAD(S0, 0);
+		if (tb->CFR[0]
+		    && !comp_short_le_keys(&(ih->ih_key),
+					   B_N_PDELIM_KEY(tb->CFR[0],
+							  tb->rkey[0])))
+			if (is_direntry_le_ih(ih)) {
+				/* Directory must be in correct state here: that is
+				   somewhere at the left side should exist first directory
+				   item. But the item being deleted can not be that first
+				   one because its right neighbor is item of the same
+				   directory. (But first item always gets deleted in last
+				   turn). So, neighbors of deleted item can be merged, so
+				   we can save ih_size */
+				ih_size = IH_SIZE;
+
+				/* we might check that left neighbor exists and is of the
+				   same directory */
+				RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
+				       "vs-8130: first directory item can not be removed until directory is not empty");
+			}
 
+	}
+
+	if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) {
+		set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1);
+		PROC_INFO_INC(tb->tb_sb, leaves_removable);
+		return 1;
+	}
+	return 0;
 
+}
 
 /* when we do not split item, lnum and rnum are numbers of entire items */
 #define SET_PAR_SHIFT_LEFT \
@@ -704,7 +733,6 @@ else \
 		     -1, -1);\
 }
 
-
 #define SET_PAR_SHIFT_RIGHT \
 if (h)\
 {\
@@ -724,214 +752,199 @@ else \
 		  -1, -1);\
 }
 
-
-static void free_buffers_in_tb (
-		       struct tree_balance * p_s_tb
-		       ) {
-  int n_counter;
-
-  decrement_counters_in_path(p_s_tb->tb_path);
-  
-  for ( n_counter = 0; n_counter < MAX_HEIGHT; n_counter++ ) {
-    decrement_bcount(p_s_tb->L[n_counter]);
-    p_s_tb->L[n_counter] = NULL;
-    decrement_bcount(p_s_tb->R[n_counter]);
-    p_s_tb->R[n_counter] = NULL;
-    decrement_bcount(p_s_tb->FL[n_counter]);
-    p_s_tb->FL[n_counter] = NULL;
-    decrement_bcount(p_s_tb->FR[n_counter]);
-    p_s_tb->FR[n_counter] = NULL;
-    decrement_bcount(p_s_tb->CFL[n_counter]);
-    p_s_tb->CFL[n_counter] = NULL;
-    decrement_bcount(p_s_tb->CFR[n_counter]);
-    p_s_tb->CFR[n_counter] = NULL;
-  }
+static void free_buffers_in_tb(struct tree_balance *p_s_tb)
+{
+	int n_counter;
+
+	decrement_counters_in_path(p_s_tb->tb_path);
+
+	for (n_counter = 0; n_counter < MAX_HEIGHT; n_counter++) {
+		decrement_bcount(p_s_tb->L[n_counter]);
+		p_s_tb->L[n_counter] = NULL;
+		decrement_bcount(p_s_tb->R[n_counter]);
+		p_s_tb->R[n_counter] = NULL;
+		decrement_bcount(p_s_tb->FL[n_counter]);
+		p_s_tb->FL[n_counter] = NULL;
+		decrement_bcount(p_s_tb->FR[n_counter]);
+		p_s_tb->FR[n_counter] = NULL;
+		decrement_bcount(p_s_tb->CFL[n_counter]);
+		p_s_tb->CFL[n_counter] = NULL;
+		decrement_bcount(p_s_tb->CFR[n_counter]);
+		p_s_tb->CFR[n_counter] = NULL;
+	}
 }
 
-
 /* Get new buffers for storing new nodes that are created while balancing.
  * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
  *	        CARRY_ON - schedule didn't occur while the function worked;
  *	        NO_DISK_SPACE - no disk space.
  */
 /* The function is NOT SCHEDULE-SAFE! */
-static int  get_empty_nodes(
-              struct tree_balance * p_s_tb,
-              int n_h
-            ) {
-  struct buffer_head  * p_s_new_bh,
-    		      *	p_s_Sh = PATH_H_PBUFFER (p_s_tb->tb_path, n_h);
-  b_blocknr_t	      *	p_n_blocknr,
-    			a_n_blocknrs[MAX_AMOUNT_NEEDED] = {0, };
-  int       		n_counter,
-   			n_number_of_freeblk,
-                	n_amount_needed,/* number of needed empty blocks */
-			n_retval = CARRY_ON;
-  struct super_block *	p_s_sb = p_s_tb->tb_sb;
-
-
-  /* number_of_freeblk is the number of empty blocks which have been
-     acquired for use by the balancing algorithm minus the number of
-     empty blocks used in the previous levels of the analysis,
-     number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs
-     after empty blocks are acquired, and the balancing analysis is
-     then restarted, amount_needed is the number needed by this level
-     (n_h) of the balancing analysis.
-			    
-     Note that for systems with many processes writing, it would be
-     more layout optimal to calculate the total number needed by all
-     levels and then to run reiserfs_new_blocks to get all of them at once.  */
-
-  /* Initiate number_of_freeblk to the amount acquired prior to the restart of
-     the analysis or 0 if not restarted, then subtract the amount needed
-     by all of the levels of the tree below n_h. */
-  /* blknum includes S[n_h], so we subtract 1 in this calculation */
-  for ( n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum; n_counter < n_h; n_counter++ )
-    n_number_of_freeblk -= ( p_s_tb->blknum[n_counter] ) ? (p_s_tb->blknum[n_counter] - 1) : 0;
-
-  /* Allocate missing empty blocks. */
-  /* if p_s_Sh == 0  then we are getting a new root */
-  n_amount_needed = ( p_s_Sh ) ? (p_s_tb->blknum[n_h] - 1) : 1;
-  /*  Amount_needed = the amount that we need more than the amount that we have. */
-  if ( n_amount_needed > n_number_of_freeblk )
-    n_amount_needed -= n_number_of_freeblk;
-  else /* If we have enough already then there is nothing to do. */
-    return CARRY_ON;
-
-  /* No need to check quota - is not allocated for blocks used for formatted nodes */
-  if (reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs,
-                                   n_amount_needed) == NO_DISK_SPACE)
-    return NO_DISK_SPACE;
-
-  /* for each blocknumber we just got, get a buffer and stick it on FEB */
-  for ( p_n_blocknr = a_n_blocknrs, n_counter = 0; n_counter < n_amount_needed;
-	p_n_blocknr++, n_counter++ ) { 
-
-    RFALSE( ! *p_n_blocknr,
-	    "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
-
-    p_s_new_bh = sb_getblk(p_s_sb, *p_n_blocknr);
-    RFALSE (buffer_dirty (p_s_new_bh) ||
-	    buffer_journaled (p_s_new_bh) ||
-	    buffer_journal_dirty (p_s_new_bh),
-	    "PAP-8140: journlaled or dirty buffer %b for the new block", 
-	    p_s_new_bh);
-    
-    /* Put empty buffers into the array. */
-    RFALSE (p_s_tb->FEB[p_s_tb->cur_blknum],
-	    "PAP-8141: busy slot for new buffer");
-
-    set_buffer_journal_new (p_s_new_bh);
-    p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh;
-  }
-
-  if ( n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB (p_s_tb) )
-    n_retval = REPEAT_SEARCH ;
-
-  return n_retval;
-}
+static int get_empty_nodes(struct tree_balance *p_s_tb, int n_h)
+{
+	struct buffer_head *p_s_new_bh,
+	    *p_s_Sh = PATH_H_PBUFFER(p_s_tb->tb_path, n_h);
+	b_blocknr_t *p_n_blocknr, a_n_blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
+	int n_counter, n_number_of_freeblk, n_amount_needed,	/* number of needed empty blocks */
+	 n_retval = CARRY_ON;
+	struct super_block *p_s_sb = p_s_tb->tb_sb;
+
+	/* number_of_freeblk is the number of empty blocks which have been
+	   acquired for use by the balancing algorithm minus the number of
+	   empty blocks used in the previous levels of the analysis,
+	   number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs
+	   after empty blocks are acquired, and the balancing analysis is
+	   then restarted, amount_needed is the number needed by this level
+	   (n_h) of the balancing analysis.
+
+	   Note that for systems with many processes writing, it would be
+	   more layout optimal to calculate the total number needed by all
+	   levels and then to run reiserfs_new_blocks to get all of them at once.  */
+
+	/* Initialize number_of_freeblk to the amount acquired prior to the restart of
+	   the analysis or 0 if not restarted, then subtract the amount needed
+	   by all of the levels of the tree below n_h. */
+	/* blknum includes S[n_h], so we subtract 1 in this calculation */
+	for (n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum;
+	     n_counter < n_h; n_counter++)
+		n_number_of_freeblk -=
+		    (p_s_tb->blknum[n_counter]) ? (p_s_tb->blknum[n_counter] -
+						   1) : 0;
+
+	/* Allocate missing empty blocks. */
+	/* if p_s_Sh == 0  then we are getting a new root */
+	n_amount_needed = (p_s_Sh) ? (p_s_tb->blknum[n_h] - 1) : 1;
+	/*  Amount_needed = the amount that we need more than the amount that we have. */
+	if (n_amount_needed > n_number_of_freeblk)
+		n_amount_needed -= n_number_of_freeblk;
+	else			/* If we have enough already then there is nothing to do. */
+		return CARRY_ON;
+
+	/* No need to check quota - it is not allocated for blocks used for formatted nodes */
+	if (reiserfs_new_form_blocknrs(p_s_tb, a_n_blocknrs,
+				       n_amount_needed) == NO_DISK_SPACE)
+		return NO_DISK_SPACE;
+
+	/* for each blocknumber we just got, get a buffer and stick it on FEB */
+	for (p_n_blocknr = a_n_blocknrs, n_counter = 0;
+	     n_counter < n_amount_needed; p_n_blocknr++, n_counter++) {
+
+		RFALSE(!*p_n_blocknr,
+		       "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
+
+		p_s_new_bh = sb_getblk(p_s_sb, *p_n_blocknr);
+		RFALSE(buffer_dirty(p_s_new_bh) ||
+		       buffer_journaled(p_s_new_bh) ||
+		       buffer_journal_dirty(p_s_new_bh),
+		       "PAP-8140: journlaled or dirty buffer %b for the new block",
+		       p_s_new_bh);
+
+		/* Put empty buffers into the array. */
+		RFALSE(p_s_tb->FEB[p_s_tb->cur_blknum],
+		       "PAP-8141: busy slot for new buffer");
+
+		set_buffer_journal_new(p_s_new_bh);
+		p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh;
+	}
+
+	if (n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB(p_s_tb))
+		n_retval = REPEAT_SEARCH;
 
+	return n_retval;
+}
 
 /* Get free space of the left neighbor, which is stored in the parent
  * node of the left neighbor.  */
-static int get_lfree (struct tree_balance * tb, int h)
+static int get_lfree(struct tree_balance *tb, int h)
 {
-    struct buffer_head * l, * f;
-    int order;
+	struct buffer_head *l, *f;
+	int order;
 
-    if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (l = tb->FL[h]) == 0)
-	return 0;
+	if ((f = PATH_H_PPARENT(tb->tb_path, h)) == 0 || (l = tb->FL[h]) == 0)
+		return 0;
 
-    if (f == l)
-	order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) - 1;
-    else {
-	order = B_NR_ITEMS (l);
-	f = l;
-    }
+	if (f == l)
+		order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1;
+	else {
+		order = B_NR_ITEMS(l);
+		f = l;
+	}
 
-    return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f,order)));
+	return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
 }
 
-
 /* Get free space of the right neighbor,
  * which is stored in the parent node of the right neighbor.
  */
-static int get_rfree (struct tree_balance * tb, int h)
+static int get_rfree(struct tree_balance *tb, int h)
 {
-  struct buffer_head * r, * f;
-  int order;
+	struct buffer_head *r, *f;
+	int order;
 
-  if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (r = tb->FR[h]) == 0)
-    return 0;
+	if ((f = PATH_H_PPARENT(tb->tb_path, h)) == 0 || (r = tb->FR[h]) == 0)
+		return 0;
 
-  if (f == r)
-      order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) + 1;
-  else {
-      order = 0;
-      f = r;
-  }
+	if (f == r)
+		order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1;
+	else {
+		order = 0;
+		f = r;
+	}
 
-  return (MAX_CHILD_SIZE(f) - dc_size( B_N_CHILD(f,order)));
+	return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
 
 }
 
-
 /* Check whether left neighbor is in memory. */
-static int  is_left_neighbor_in_cache(
-              struct tree_balance * p_s_tb,
-              int                   n_h
-            ) {
-  struct buffer_head  * p_s_father, * left;
-  struct super_block  * p_s_sb = p_s_tb->tb_sb;
-  b_blocknr_t		n_left_neighbor_blocknr;
-  int                   n_left_neighbor_position;
-
-  if ( ! p_s_tb->FL[n_h] ) /* Father of the left neighbor does not exist. */
-    return 0;
-
-  /* Calculate father of the node to be balanced. */
-  p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1);
-
-  RFALSE( ! p_s_father || 
-	  ! B_IS_IN_TREE (p_s_father) || 
-	  ! B_IS_IN_TREE (p_s_tb->FL[n_h]) ||
-	  ! buffer_uptodate (p_s_father) || 
-	  ! buffer_uptodate (p_s_tb->FL[n_h]),
-	  "vs-8165: F[h] (%b) or FL[h] (%b) is invalid", 
-	  p_s_father, p_s_tb->FL[n_h]);
-
-
-  /* Get position of the pointer to the left neighbor into the left father. */
-  n_left_neighbor_position = ( p_s_father == p_s_tb->FL[n_h] ) ?
-                      p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]);
-  /* Get left neighbor block number. */
-  n_left_neighbor_blocknr = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position);
-  /* Look for the left neighbor in the cache. */
-  if ( (left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr)) ) {
-
-    RFALSE( buffer_uptodate (left) && ! B_IS_IN_TREE(left),
-	    "vs-8170: left neighbor (%b %z) is not in the tree", left, left);
-    put_bh(left) ;
-    return 1;
-  }
-
-  return 0;
-}
+static int is_left_neighbor_in_cache(struct tree_balance *p_s_tb, int n_h)
+{
+	struct buffer_head *p_s_father, *left;
+	struct super_block *p_s_sb = p_s_tb->tb_sb;
+	b_blocknr_t n_left_neighbor_blocknr;
+	int n_left_neighbor_position;
+
+	if (!p_s_tb->FL[n_h])	/* Father of the left neighbor does not exist. */
+		return 0;
+
+	/* Calculate father of the node to be balanced. */
+	p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1);
+
+	RFALSE(!p_s_father ||
+	       !B_IS_IN_TREE(p_s_father) ||
+	       !B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
+	       !buffer_uptodate(p_s_father) ||
+	       !buffer_uptodate(p_s_tb->FL[n_h]),
+	       "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
+	       p_s_father, p_s_tb->FL[n_h]);
+
+	/* Get position of the pointer to the left neighbor in the left father. */
+	n_left_neighbor_position = (p_s_father == p_s_tb->FL[n_h]) ?
+	    p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb->FL[n_h]);
+	/* Get left neighbor block number. */
+	n_left_neighbor_blocknr =
+	    B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position);
+	/* Look for the left neighbor in the cache. */
+	if ((left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr))) {
+
+		RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
+		       "vs-8170: left neighbor (%b %z) is not in the tree",
+		       left, left);
+		put_bh(left);
+		return 1;
+	}
 
+	return 0;
+}
 
 #define LEFT_PARENTS  'l'
 #define RIGHT_PARENTS 'r'
 
-
-static void decrement_key (struct cpu_key * p_s_key)
+static void decrement_key(struct cpu_key *p_s_key)
 {
-    // call item specific function for this key
-    item_ops[cpu_key_k_type (p_s_key)]->decrement_key (p_s_key);
+	// call item specific function for this key
+	item_ops[cpu_key_k_type(p_s_key)]->decrement_key(p_s_key);
 }
 
-
-
-
 /* Calculate far left/right parent of the left/right neighbor of the current node, that
  * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h].
  * Calculate left/right common parent of the current node and L[h]/R[h].
@@ -940,111 +953,121 @@ static void decrement_key (struct cpu_key * p_s_key)
  		SCHEDULE_OCCURRED - schedule occurred while the function worked;
  *	        CARRY_ON         - schedule didn't occur while the function worked;
  */
-static int  get_far_parent (struct tree_balance *   p_s_tb,
-			    int                     n_h,
-			    struct buffer_head  **  pp_s_father,
-			    struct buffer_head  **  pp_s_com_father,
-			    char                    c_lr_par) 
+static int get_far_parent(struct tree_balance *p_s_tb,
+			  int n_h,
+			  struct buffer_head **pp_s_father,
+			  struct buffer_head **pp_s_com_father, char c_lr_par)
 {
-    struct buffer_head  * p_s_parent;
-    INITIALIZE_PATH (s_path_to_neighbor_father);
-    struct path * p_s_path = p_s_tb->tb_path;
-    struct cpu_key	s_lr_father_key;
-    int                   n_counter,
-	n_position = INT_MAX,
-	n_first_last_position = 0,
-	n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h);
-
-    /* Starting from F[n_h] go upwards in the tree, and look for the common
-      ancestor of F[n_h], and its neighbor l/r, that should be obtained. */
-
-    n_counter = n_path_offset;
-
-    RFALSE( n_counter < FIRST_PATH_ELEMENT_OFFSET,
-	    "PAP-8180: invalid path length");
-
-  
-    for ( ; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter--  )  {
-	/* Check whether parent of the current buffer in the path is really parent in the tree. */
-	if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1)) )
-	    return REPEAT_SEARCH;
-	/* Check whether position in the parent is correct. */
-	if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_counter - 1)) > B_NR_ITEMS(p_s_parent) )
-	    return REPEAT_SEARCH;
-	/* Check whether parent at the path really points to the child. */
-	if ( B_N_CHILD_NUM(p_s_parent, n_position) !=
-	     PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr )
-	    return REPEAT_SEARCH;
-	/* Return delimiting key if position in the parent is not equal to first/last one. */
-	if ( c_lr_par == RIGHT_PARENTS )
-	    n_first_last_position = B_NR_ITEMS (p_s_parent);
-	if ( n_position != n_first_last_position ) {
-	    *pp_s_com_father = p_s_parent;
-	    get_bh(*pp_s_com_father) ;
-	    /*(*pp_s_com_father = p_s_parent)->b_count++;*/
-	    break;
+	struct buffer_head *p_s_parent;
+	INITIALIZE_PATH(s_path_to_neighbor_father);
+	struct path *p_s_path = p_s_tb->tb_path;
+	struct cpu_key s_lr_father_key;
+	int n_counter,
+	    n_position = INT_MAX,
+	    n_first_last_position = 0,
+	    n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h);
+
+	/* Starting from F[n_h] go upwards in the tree, and look for the common
+	   ancestor of F[n_h], and its neighbor l/r, that should be obtained. */
+
+	n_counter = n_path_offset;
+
+	RFALSE(n_counter < FIRST_PATH_ELEMENT_OFFSET,
+	       "PAP-8180: invalid path length");
+
+	for (; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter--) {
+		/* Check whether parent of the current buffer in the path is really parent in the tree. */
+		if (!B_IS_IN_TREE
+		    (p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1)))
+			return REPEAT_SEARCH;
+		/* Check whether position in the parent is correct. */
+		if ((n_position =
+		     PATH_OFFSET_POSITION(p_s_path,
+					  n_counter - 1)) >
+		    B_NR_ITEMS(p_s_parent))
+			return REPEAT_SEARCH;
+		/* Check whether parent at the path really points to the child. */
+		if (B_N_CHILD_NUM(p_s_parent, n_position) !=
+		    PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr)
+			return REPEAT_SEARCH;
+		/* Return delimiting key if position in the parent is not equal to first/last one. */
+		if (c_lr_par == RIGHT_PARENTS)
+			n_first_last_position = B_NR_ITEMS(p_s_parent);
+		if (n_position != n_first_last_position) {
+			*pp_s_com_father = p_s_parent;
+			get_bh(*pp_s_com_father);
+			/*(*pp_s_com_father = p_s_parent)->b_count++; */
+			break;
+		}
 	}
-    }
-
-    /* if we are in the root of the tree, then there is no common father */
-    if ( n_counter == FIRST_PATH_ELEMENT_OFFSET ) {
-	/* Check whether first buffer in the path is the root of the tree. */
-	if ( PATH_OFFSET_PBUFFER(p_s_tb->tb_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
-	     SB_ROOT_BLOCK (p_s_tb->tb_sb) ) {
-	    *pp_s_father = *pp_s_com_father = NULL;
-	    return CARRY_ON;
+
+	/* if we are in the root of the tree, then there is no common father */
+	if (n_counter == FIRST_PATH_ELEMENT_OFFSET) {
+		/* Check whether first buffer in the path is the root of the tree. */
+		if (PATH_OFFSET_PBUFFER
+		    (p_s_tb->tb_path,
+		     FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
+		    SB_ROOT_BLOCK(p_s_tb->tb_sb)) {
+			*pp_s_father = *pp_s_com_father = NULL;
+			return CARRY_ON;
+		}
+		return REPEAT_SEARCH;
 	}
-	return REPEAT_SEARCH;
-    }
 
-    RFALSE( B_LEVEL (*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL,
-	    "PAP-8185: (%b %z) level too small", 
-	    *pp_s_com_father, *pp_s_com_father);
+	RFALSE(B_LEVEL(*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL,
+	       "PAP-8185: (%b %z) level too small",
+	       *pp_s_com_father, *pp_s_com_father);
 
-    /* Check whether the common parent is locked. */
+	/* Check whether the common parent is locked. */
 
-    if ( buffer_locked (*pp_s_com_father) ) {
-	__wait_on_buffer(*pp_s_com_father);
-	if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
-	    decrement_bcount(*pp_s_com_father);
-	    return REPEAT_SEARCH;
+	if (buffer_locked(*pp_s_com_father)) {
+		__wait_on_buffer(*pp_s_com_father);
+		if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
+			decrement_bcount(*pp_s_com_father);
+			return REPEAT_SEARCH;
+		}
 	}
-    }
-
-    /* So, we got common parent of the current node and its left/right neighbor.
-     Now we are geting the parent of the left/right neighbor. */
 
-    /* Form key to get parent of the left/right neighbor. */
-    le_key2cpu_key (&s_lr_father_key, B_N_PDELIM_KEY(*pp_s_com_father, ( c_lr_par == LEFT_PARENTS ) ?
-						     (p_s_tb->lkey[n_h - 1] = n_position - 1) : (p_s_tb->rkey[n_h - 1] = n_position)));
+	/* So, we got common parent of the current node and its left/right neighbor.
+	   Now we are getting the parent of the left/right neighbor. */
 
+	/* Form key to get parent of the left/right neighbor. */
+	le_key2cpu_key(&s_lr_father_key,
+		       B_N_PDELIM_KEY(*pp_s_com_father,
+				      (c_lr_par ==
+				       LEFT_PARENTS) ? (p_s_tb->lkey[n_h - 1] =
+							n_position -
+							1) : (p_s_tb->rkey[n_h -
+									   1] =
+							      n_position)));
 
-    if ( c_lr_par == LEFT_PARENTS )
-	decrement_key(&s_lr_father_key);
+	if (c_lr_par == LEFT_PARENTS)
+		decrement_key(&s_lr_father_key);
 
-    if (search_by_key(p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, n_h + 1) == IO_ERROR)
-	// path is released
-	return IO_ERROR;
+	if (search_by_key
+	    (p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
+	     n_h + 1) == IO_ERROR)
+		// path is released
+		return IO_ERROR;
 
-    if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
-	decrement_counters_in_path(&s_path_to_neighbor_father);
-	decrement_bcount(*pp_s_com_father);
-	return REPEAT_SEARCH;
-    }
+	if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
+		decrement_counters_in_path(&s_path_to_neighbor_father);
+		decrement_bcount(*pp_s_com_father);
+		return REPEAT_SEARCH;
+	}
 
-    *pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
+	*pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
 
-    RFALSE( B_LEVEL (*pp_s_father) != n_h + 1,
-	    "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father);
-    RFALSE( s_path_to_neighbor_father.path_length < FIRST_PATH_ELEMENT_OFFSET,
-	    "PAP-8192: path length is too small");
+	RFALSE(B_LEVEL(*pp_s_father) != n_h + 1,
+	       "PAP-8190: (%b %z) level too small", *pp_s_father, *pp_s_father);
+	RFALSE(s_path_to_neighbor_father.path_length <
+	       FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");
 
-    s_path_to_neighbor_father.path_length--;
-    decrement_counters_in_path(&s_path_to_neighbor_father);
-    return CARRY_ON;
+	s_path_to_neighbor_father.path_length--;
+	decrement_counters_in_path(&s_path_to_neighbor_father);
+	return CARRY_ON;
 }
 
-
 /* Get parents of neighbors of node in the path(S[n_path_offset]) and common parents of
  * S[n_path_offset] and L[n_path_offset]/R[n_path_offset]: F[n_path_offset], FL[n_path_offset],
  * FR[n_path_offset], CFL[n_path_offset], CFR[n_path_offset].
@@ -1052,122 +1075,127 @@ static int  get_far_parent (struct tree_balance *   p_s_tb,
  * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
  *	        CARRY_ON - schedule didn't occur while the function worked;
  */
-static int  get_parents (struct tree_balance * p_s_tb, int n_h)
+static int get_parents(struct tree_balance *p_s_tb, int n_h)
 {
-    struct path         * p_s_path = p_s_tb->tb_path;
-    int                   n_position,
-	n_ret_value,
-	n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
-    struct buffer_head  * p_s_curf,
-	* p_s_curcf;
-
-    /* Current node is the root of the tree or will be root of the tree */
-    if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) {
-	/* The root can not have parents.
-	   Release nodes which previously were obtained as parents of the current node neighbors. */
+	struct path *p_s_path = p_s_tb->tb_path;
+	int n_position,
+	    n_ret_value,
+	    n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
+	struct buffer_head *p_s_curf, *p_s_curcf;
+
+	/* Current node is the root of the tree or will be root of the tree */
+	if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
+		/* The root can not have parents.
+		   Release nodes which previously were obtained as parents of the current node neighbors. */
+		decrement_bcount(p_s_tb->FL[n_h]);
+		decrement_bcount(p_s_tb->CFL[n_h]);
+		decrement_bcount(p_s_tb->FR[n_h]);
+		decrement_bcount(p_s_tb->CFR[n_h]);
+		p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] =
+		    p_s_tb->CFR[n_h] = NULL;
+		return CARRY_ON;
+	}
+
+	/* Get parent FL[n_path_offset] of L[n_path_offset]. */
+	if ((n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1))) {
+		/* Current node is not the first child of its parent. */
+		/*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */
+		p_s_curf = p_s_curcf =
+		    PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
+		get_bh(p_s_curf);
+		get_bh(p_s_curf);
+		p_s_tb->lkey[n_h] = n_position - 1;
+	} else {
+		/* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node.
+		   Calculate current common parent of L[n_path_offset] and the current node. Note that
+		   CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset].
+		   Calculate lkey[n_path_offset]. */
+		if ((n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf,
+						  &p_s_curcf,
+						  LEFT_PARENTS)) != CARRY_ON)
+			return n_ret_value;
+	}
+
 	decrement_bcount(p_s_tb->FL[n_h]);
+	p_s_tb->FL[n_h] = p_s_curf;	/* New initialization of FL[n_h]. */
 	decrement_bcount(p_s_tb->CFL[n_h]);
-	decrement_bcount(p_s_tb->FR[n_h]);
-	decrement_bcount(p_s_tb->CFR[n_h]);
-	p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] = p_s_tb->CFR[n_h] = NULL;
-	return CARRY_ON;
-    }
-  
-    /* Get parent FL[n_path_offset] of L[n_path_offset]. */
-    if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) )  {
-	/* Current node is not the first child of its parent. */
-	/*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/
-	p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
-	get_bh(p_s_curf) ;
-	get_bh(p_s_curf) ;
-	p_s_tb->lkey[n_h] = n_position - 1;
-    }
-    else  {
-	/* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node.
-	   Calculate current common parent of L[n_path_offset] and the current node. Note that
-	   CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset].
-	   Calculate lkey[n_path_offset]. */
-	if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf,
-					   &p_s_curcf, LEFT_PARENTS)) != CARRY_ON )
-	    return n_ret_value;
-    }
-
-    decrement_bcount(p_s_tb->FL[n_h]);
-    p_s_tb->FL[n_h] = p_s_curf; /* New initialization of FL[n_h]. */
-    decrement_bcount(p_s_tb->CFL[n_h]);
-    p_s_tb->CFL[n_h] = p_s_curcf; /* New initialization of CFL[n_h]. */
-
-    RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) || 
-	    (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)),
-	    "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf);
+	p_s_tb->CFL[n_h] = p_s_curcf;	/* New initialization of CFL[n_h]. */
+
+	RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) ||
+	       (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)),
+	       "PAP-8195: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf);
 
 /* Get parent FR[n_h] of R[n_h]. */
 
 /* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */
-    if ( n_position == B_NR_ITEMS (PATH_H_PBUFFER(p_s_path, n_h + 1)) ) {
+	if (n_position == B_NR_ITEMS(PATH_H_PBUFFER(p_s_path, n_h + 1))) {
 /* Calculate current parent of R[n_h], which is the right neighbor of F[n_h].
    Calculate current common parent of R[n_h] and current node. Note that CFR[n_h]
    not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. */
-	if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf,  &p_s_curcf, RIGHT_PARENTS)) != CARRY_ON )
-	    return n_ret_value;
-    }
-    else {
+		if ((n_ret_value =
+		     get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf,
+				    RIGHT_PARENTS)) != CARRY_ON)
+			return n_ret_value;
+	} else {
 /* Current node is not the last child of its parent F[n_h]. */
-	/*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/
-	p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
-	get_bh(p_s_curf) ;
-	get_bh(p_s_curf) ;
-	p_s_tb->rkey[n_h] = n_position;
-    }	
-
-    decrement_bcount(p_s_tb->FR[n_h]);
-    p_s_tb->FR[n_h] = p_s_curf; /* New initialization of FR[n_path_offset]. */
-    
-    decrement_bcount(p_s_tb->CFR[n_h]);
-    p_s_tb->CFR[n_h] = p_s_curcf; /* New initialization of CFR[n_path_offset]. */
-
-    RFALSE( (p_s_curf && !B_IS_IN_TREE (p_s_curf)) ||
-            (p_s_curcf && !B_IS_IN_TREE (p_s_curcf)),
-	    "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf);
-
-    return CARRY_ON;
-}
+		/*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2; */
+		p_s_curf = p_s_curcf =
+		    PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1);
+		get_bh(p_s_curf);
+		get_bh(p_s_curf);
+		p_s_tb->rkey[n_h] = n_position;
+	}
 
+	decrement_bcount(p_s_tb->FR[n_h]);
+	p_s_tb->FR[n_h] = p_s_curf;	/* New initialization of FR[n_path_offset]. */
+
+	decrement_bcount(p_s_tb->CFR[n_h]);
+	p_s_tb->CFR[n_h] = p_s_curcf;	/* New initialization of CFR[n_path_offset]. */
+
+	RFALSE((p_s_curf && !B_IS_IN_TREE(p_s_curf)) ||
+	       (p_s_curcf && !B_IS_IN_TREE(p_s_curcf)),
+	       "PAP-8205: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf);
+
+	return CARRY_ON;
+}
 
 /* it is possible to remove node as result of shiftings to
    neighbors even when we insert or paste item. */
-static inline int can_node_be_removed (int mode, int lfree, int sfree, int rfree, struct tree_balance * tb, int h)
+static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
+				      struct tree_balance *tb, int h)
 {
-    struct buffer_head * Sh = PATH_H_PBUFFER (tb->tb_path, h);
-    int levbytes = tb->insert_size[h];
-    struct item_head * ih;
-    struct reiserfs_key * r_key = NULL;
-
-    ih = B_N_PITEM_HEAD (Sh, 0);
-    if ( tb->CFR[h] )
-	r_key = B_N_PDELIM_KEY(tb->CFR[h],tb->rkey[h]);
-  
-    if (
-	lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
-	/* shifting may merge items which might save space */
-	- (( ! h && op_is_left_mergeable (&(ih->ih_key), Sh->b_size) ) ? IH_SIZE : 0)
-	- (( ! h && r_key && op_is_left_mergeable (r_key, Sh->b_size) ) ? IH_SIZE : 0)
-	+ (( h ) ? KEY_SIZE : 0))
-    {
-	/* node can not be removed */
-	if (sfree >= levbytes ) { /* new item fits into node S[h] without any shifting */
-	    if ( ! h )
-		tb->s0num = B_NR_ITEMS(Sh) + ((mode == M_INSERT ) ? 1 : 0);
-	    set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
-	    return NO_BALANCING_NEEDED;
+	struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h);
+	int levbytes = tb->insert_size[h];
+	struct item_head *ih;
+	struct reiserfs_key *r_key = NULL;
+
+	ih = B_N_PITEM_HEAD(Sh, 0);
+	if (tb->CFR[h])
+		r_key = B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]);
+
+	if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
+	    /* shifting may merge items which might save space */
+	    -
+	    ((!h
+	      && op_is_left_mergeable(&(ih->ih_key), Sh->b_size)) ? IH_SIZE : 0)
+	    -
+	    ((!h && r_key
+	      && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
+	    + ((h) ? KEY_SIZE : 0)) {
+		/* node can not be removed */
+		if (sfree >= levbytes) {	/* new item fits into node S[h] without any shifting */
+			if (!h)
+				tb->s0num =
+				    B_NR_ITEMS(Sh) +
+				    ((mode == M_INSERT) ? 1 : 0);
+			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
+			return NO_BALANCING_NEEDED;
+		}
 	}
-    }
-    PROC_INFO_INC( tb -> tb_sb, can_node_be_removed[ h ] );
-    return !NO_BALANCING_NEEDED;
+	PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]);
+	return !NO_BALANCING_NEEDED;
 }
 
-
-
 /* Check whether current node S[h] is balanced when increasing its size by
  * Inserting or Pasting.
  * Calculate parameters for balancing for current level h.
@@ -1182,154 +1210,157 @@ static inline int can_node_be_removed (int mode, int lfree, int sfree, int rfree
  *	       -2 - no disk space.
  */
 /* ip means Inserting or Pasting */
-static int ip_check_balance (struct tree_balance * tb, int h)
+static int ip_check_balance(struct tree_balance *tb, int h)
 {
-    struct virtual_node * vn = tb->tb_vn;
-    int levbytes,  /* Number of bytes that must be inserted into (value
-		      is negative if bytes are deleted) buffer which
-		      contains node being balanced.  The mnemonic is
-		      that the attempted change in node space used level
-		      is levbytes bytes. */
-	n_ret_value;
-
-    int lfree, sfree, rfree /* free space in L, S and R */;
-
-    /* nver is short for number of vertixes, and lnver is the number if
-       we shift to the left, rnver is the number if we shift to the
-       right, and lrnver is the number if we shift in both directions.
-       The goal is to minimize first the number of vertixes, and second,
-       the number of vertixes whose contents are changed by shifting,
-       and third the number of uncached vertixes whose contents are
-       changed by shifting and must be read from disk.  */
-    int nver, lnver, rnver, lrnver;
-
-    /* used at leaf level only, S0 = S[0] is the node being balanced,
-       sInum [ I = 0,1,2 ] is the number of items that will
-       remain in node SI after balancing.  S1 and S2 are new
-       nodes that might be created. */
-  
-    /* we perform 8 calls to get_num_ver().  For each call we calculate five parameters.
-       where 4th parameter is s1bytes and 5th - s2bytes
-    */
-    short snum012[40] = {0,};	/* s0num, s1num, s2num for 8 cases 
-				   0,1 - do not shift and do not shift but bottle
-				   2 - shift only whole item to left
-				   3 - shift to left and bottle as much as possible
-				   4,5 - shift to right	(whole items and as much as possible
-				   6,7 - shift to both directions (whole items and as much as possible)
-				*/
-
-    /* Sh is the node whose balance is currently being checked */
-    struct buffer_head * Sh;
-  
-    Sh = PATH_H_PBUFFER (tb->tb_path, h);
-    levbytes = tb->insert_size[h];
-  
-    /* Calculate balance parameters for creating new root. */
-    if ( ! Sh )  {
-	if ( ! h )
-	    reiserfs_panic (tb->tb_sb, "vs-8210: ip_check_balance: S[0] can not be 0");
-	switch ( n_ret_value = get_empty_nodes (tb, h) )  {
-	case CARRY_ON:
-	    set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
-	    return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
-
-	case NO_DISK_SPACE:
-	case REPEAT_SEARCH:
-	    return n_ret_value;
-	default:   
-	    reiserfs_panic(tb->tb_sb, "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes");
+	struct virtual_node *vn = tb->tb_vn;
+	int levbytes,		/* Number of bytes that must be inserted into (value
+				   is negative if bytes are deleted) buffer which
+				   contains node being balanced.  The mnemonic is
+				   that the attempted change in node space used level
+				   is levbytes bytes. */
+	 n_ret_value;
+
+	int lfree, sfree, rfree /* free space in L, S and R */ ;
+
+	/* nver is short for number of vertices, and lnver is the number if
+	   we shift to the left, rnver is the number if we shift to the
+	   right, and lrnver is the number if we shift in both directions.
+	   The goal is to minimize first the number of vertices, and second,
+	   the number of vertices whose contents are changed by shifting,
+	   and third the number of uncached vertices whose contents are
+	   changed by shifting and must be read from disk.  */
+	int nver, lnver, rnver, lrnver;
+
+	/* used at leaf level only, S0 = S[0] is the node being balanced,
+	   sInum [ I = 0,1,2 ] is the number of items that will
+	   remain in node SI after balancing.  S1 and S2 are new
+	   nodes that might be created. */
+
+	/* we perform 8 calls to get_num_ver().  For each call we calculate five parameters.
+	   where 4th parameter is s1bytes and 5th - s2bytes
+	 */
+	short snum012[40] = { 0, };	/* s0num, s1num, s2num for 8 cases 
+					   0,1 - do not shift and do not shift but bottle
+					   2 - shift only whole item to left
+					   3 - shift to left and bottle as much as possible
+					   4,5 - shift to right (whole items and as much as possible)
+					   6,7 - shift to both directions (whole items and as much as possible)
+					 */
+
+	/* Sh is the node whose balance is currently being checked */
+	struct buffer_head *Sh;
+
+	Sh = PATH_H_PBUFFER(tb->tb_path, h);
+	levbytes = tb->insert_size[h];
+
+	/* Calculate balance parameters for creating new root. */
+	if (!Sh) {
+		if (!h)
+			reiserfs_panic(tb->tb_sb,
+				       "vs-8210: ip_check_balance: S[0] can not be 0");
+		switch (n_ret_value = get_empty_nodes(tb, h)) {
+		case CARRY_ON:
+			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
+			return NO_BALANCING_NEEDED;	/* no balancing for higher levels needed */
+
+		case NO_DISK_SPACE:
+		case REPEAT_SEARCH:
+			return n_ret_value;
+		default:
+			reiserfs_panic(tb->tb_sb,
+				       "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes");
+		}
 	}
-    }
-  
-    if ( (n_ret_value = get_parents (tb, h)) != CARRY_ON ) /* get parents of S[h] neighbors. */
-	return n_ret_value;
-  
-    sfree = B_FREE_SPACE (Sh);
-
-    /* get free space of neighbors */
-    rfree = get_rfree (tb, h);
-    lfree = get_lfree (tb, h);
-
-    if (can_node_be_removed (vn->vn_mode, lfree, sfree, rfree, tb, h) == NO_BALANCING_NEEDED)
-	/* and new item fits into node S[h] without any shifting */
-	return NO_BALANCING_NEEDED;
-     
-    create_virtual_node (tb, h);
-
-    /*	
-	determine maximal number of items we can shift to the left neighbor (in tb structure)
-	and the maximal number of bytes that can flow to the left neighbor
-	from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
-    */
-    check_left (tb, h, lfree);
-
-    /*
-      determine maximal number of items we can shift to the right neighbor (in tb structure)
-      and the maximal number of bytes that can flow to the right neighbor
-      from the right most liquid item that cannot be shifted from S[0] entirely (returned value)
-    */
-    check_right (tb, h, rfree);
-
-
-    /* all contents of internal node S[h] can be moved into its
-       neighbors, S[h] will be removed after balancing */
-    if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
-	int to_r; 
-       
-	/* Since we are working on internal nodes, and our internal
-	   nodes have fixed size entries, then we can balance by the
-	   number of items rather than the space they consume.  In this
-	   routine we set the left node equal to the right node,
-	   allowing a difference of less than or equal to 1 child
-	   pointer. */
-	to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 - 
-	    (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
-	set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
-	return CARRY_ON;
-    }
-
-    /* this checks balance condition, that any two neighboring nodes can not fit in one node */
-    RFALSE( h && 
-	    ( tb->lnum[h] >= vn->vn_nr_item + 1 || 
-	      tb->rnum[h] >= vn->vn_nr_item + 1),
-	    "vs-8220: tree is not balanced on internal level");
-    RFALSE( ! h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
-		    (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1)) ),
-	    "vs-8225: tree is not balanced on leaf level");
-
-    /* all contents of S[0] can be moved into its neighbors
-       S[0] will be removed after balancing. */
-    if (!h && is_leaf_removable (tb))
-	return CARRY_ON;
 
+	if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)	/* get parents of S[h] neighbors. */
+		return n_ret_value;
 
-    /* why do we perform this check here rather than earlier??
-       Answer: we can win 1 node in some cases above. Moreover we
-       checked it above, when we checked, that S[0] is not removable
-       in principle */
-    if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */
-	if ( ! h )
-	    tb->s0num = vn->vn_nr_item;
-	set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
-	return NO_BALANCING_NEEDED;
-    }
+	sfree = B_FREE_SPACE(Sh);
+
+	/* get free space of neighbors */
+	rfree = get_rfree(tb, h);
+	lfree = get_lfree(tb, h);
+
+	if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
+	    NO_BALANCING_NEEDED)
+		/* and new item fits into node S[h] without any shifting */
+		return NO_BALANCING_NEEDED;
 
+	create_virtual_node(tb, h);
 
-    {
-	int lpar, rpar, nset, lset, rset, lrset;
-	/* 
-	 * regular overflowing of the node
+	/*  
+	   determine maximal number of items we can shift to the left neighbor (in tb structure)
+	   and the maximal number of bytes that can flow to the left neighbor
+	   from the left most liquid item that cannot be shifted from S[0] entirely (returned value)
 	 */
+	check_left(tb, h, lfree);
 
-	/* get_num_ver works in 2 modes (FLOW & NO_FLOW) 
-	   lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
-	   nset, lset, rset, lrset - shows, whether flowing items give better packing 
-	*/
+	/*
+	   determine maximal number of items we can shift to the right neighbor (in tb structure)
+	   and the maximal number of bytes that can flow to the right neighbor
+	   from the right most liquid item that cannot be shifted from S[0] entirely (returned value)
+	 */
+	check_right(tb, h, rfree);
+
+	/* all contents of internal node S[h] can be moved into its
+	   neighbors, S[h] will be removed after balancing */
+	if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
+		int to_r;
+
+		/* Since we are working on internal nodes, and our internal
+		   nodes have fixed size entries, then we can balance by the
+		   number of items rather than the space they consume.  In this
+		   routine we set the left node equal to the right node,
+		   allowing a difference of less than or equal to 1 child
+		   pointer. */
+		to_r =
+		    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
+		     vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
+						tb->rnum[h]);
+		set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
+			       -1, -1);
+		return CARRY_ON;
+	}
+
+	/* this checks balance condition, that any two neighboring nodes can not fit in one node */
+	RFALSE(h &&
+	       (tb->lnum[h] >= vn->vn_nr_item + 1 ||
+		tb->rnum[h] >= vn->vn_nr_item + 1),
+	       "vs-8220: tree is not balanced on internal level");
+	RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
+		      (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
+	       "vs-8225: tree is not balanced on leaf level");
+
+	/* all contents of S[0] can be moved into its neighbors
+	   S[0] will be removed after balancing. */
+	if (!h && is_leaf_removable(tb))
+		return CARRY_ON;
+
+	/* why do we perform this check here rather than earlier??
+	   Answer: we can win 1 node in some cases above. Moreover we
+	   checked it above, when we checked, that S[0] is not removable
+	   in principle */
+	if (sfree >= levbytes) {	/* new item fits into node S[h] without any shifting */
+		if (!h)
+			tb->s0num = vn->vn_nr_item;
+		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
+		return NO_BALANCING_NEEDED;
+	}
+
+	{
+		int lpar, rpar, nset, lset, rset, lrset;
+		/* 
+		 * regular overflowing of the node
+		 */
+
+		/* get_num_ver works in 2 modes (FLOW & NO_FLOW) 
+		   lpar, rpar - number of items we can shift to left/right neighbor (including splitting item)
+		   nset, lset, rset, lrset - shows, whether flowing items give better packing 
+		 */
 #define FLOW 1
-#define NO_FLOW 0	/* do not any splitting */
+#define NO_FLOW 0		/* do not any splitting */
 
-	/* we choose one the following */
+		/* we choose one of the following */
 #define NOTHING_SHIFT_NO_FLOW	0
 #define NOTHING_SHIFT_FLOW	5
 #define LEFT_SHIFT_NO_FLOW	10
@@ -1339,164 +1370,173 @@ static int ip_check_balance (struct tree_balance * tb, int h)
 #define LR_SHIFT_NO_FLOW	30
 #define LR_SHIFT_FLOW		35
 
+		lpar = tb->lnum[h];
+		rpar = tb->rnum[h];
+
+		/* calculate number of blocks S[h] must be split into when
+		   nothing is shifted to the neighbors,
+		   as well as number of items in each part of the split node (s012 numbers),
+		   and number of bytes (s1bytes) of the shared drop which flow to S1 if any */
+		nset = NOTHING_SHIFT_NO_FLOW;
+		nver = get_num_ver(vn->vn_mode, tb, h,
+				   0, -1, h ? vn->vn_nr_item : 0, -1,
+				   snum012, NO_FLOW);
+
+		if (!h) {
+			int nver1;
+
+			/* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */
+			nver1 = get_num_ver(vn->vn_mode, tb, h,
+					    0, -1, 0, -1,
+					    snum012 + NOTHING_SHIFT_FLOW, FLOW);
+			if (nver > nver1)
+				nset = NOTHING_SHIFT_FLOW, nver = nver1;
+		}
 
-	lpar = tb->lnum[h];
-	rpar = tb->rnum[h];
-
-
-	/* calculate number of blocks S[h] must be split into when
-	   nothing is shifted to the neighbors,
-	   as well as number of items in each part of the split node (s012 numbers),
-	   and number of bytes (s1bytes) of the shared drop which flow to S1 if any */
-	nset = NOTHING_SHIFT_NO_FLOW;
-	nver = get_num_ver (vn->vn_mode, tb, h,
-			    0, -1, h?vn->vn_nr_item:0, -1, 
-			    snum012, NO_FLOW);
-
-	if (!h)
-	{
-	    int nver1;
-
-	    /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */
-	    nver1 = get_num_ver (vn->vn_mode, tb, h, 
-				 0, -1, 0, -1, 
-				 snum012 + NOTHING_SHIFT_FLOW, FLOW);
-	    if (nver > nver1)
-		nset = NOTHING_SHIFT_FLOW, nver = nver1;
-	}
-       
- 
-	/* calculate number of blocks S[h] must be split into when
-	   l_shift_num first items and l_shift_bytes of the right most
-	   liquid item to be shifted are shifted to the left neighbor,
-	   as well as number of items in each part of the splitted node (s012 numbers),
-	   and number of bytes (s1bytes) of the shared drop which flow to S1 if any
-	*/
-	lset = LEFT_SHIFT_NO_FLOW;
-	lnver = get_num_ver (vn->vn_mode, tb, h, 
-			     lpar - (( h || tb->lbytes == -1 ) ? 0 : 1), -1, h ? vn->vn_nr_item:0, -1,
-			     snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
-	if (!h)
-	{
-	    int lnver1;
-
-	    lnver1 = get_num_ver (vn->vn_mode, tb, h, 
-				  lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, 0, -1,
-				  snum012 + LEFT_SHIFT_FLOW, FLOW);
-	    if (lnver > lnver1)
-		lset = LEFT_SHIFT_FLOW, lnver = lnver1;
-	}
-
-
-	/* calculate number of blocks S[h] must be split into when
-	   r_shift_num first items and r_shift_bytes of the left most
-	   liquid item to be shifted are shifted to the right neighbor,
-	   as well as number of items in each part of the splitted node (s012 numbers),
-	   and number of bytes (s1bytes) of the shared drop which flow to S1 if any
-	*/
-	rset = RIGHT_SHIFT_NO_FLOW;
-	rnver = get_num_ver (vn->vn_mode, tb, h, 
-			     0, -1, h ? (vn->vn_nr_item-rpar) : (rpar - (( tb->rbytes != -1 ) ? 1 : 0)), -1, 
-			     snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
-	if (!h)
-	{
-	    int rnver1;
-
-	    rnver1 = get_num_ver (vn->vn_mode, tb, h, 
-				  0, -1, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes, 
-				  snum012 + RIGHT_SHIFT_FLOW, FLOW);
-
-	    if (rnver > rnver1)
-		rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
-	}
-
-
-	/* calculate number of blocks S[h] must be split into when
-	   items are shifted in both directions,
-	   as well as number of items in each part of the splitted node (s012 numbers),
-	   and number of bytes (s1bytes) of the shared drop which flow to S1 if any
-	*/
-	lrset = LR_SHIFT_NO_FLOW;
-	lrnver = get_num_ver (vn->vn_mode, tb, h, 
-			      lpar - ((h || tb->lbytes == -1) ? 0 : 1), -1, h ? (vn->vn_nr_item-rpar):(rpar - ((tb->rbytes != -1) ? 1 : 0)), -1,
-			      snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
-	if (!h)
-	{
-	    int lrnver1;
-
-	    lrnver1 = get_num_ver (vn->vn_mode, tb, h, 
-				   lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes,
-				   snum012 + LR_SHIFT_FLOW, FLOW);
-	    if (lrnver > lrnver1)
-		lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
-	}
-
-
+		/* calculate number of blocks S[h] must be split into when
+		   l_shift_num first items and l_shift_bytes of the right most
+		   liquid item to be shifted are shifted to the left neighbor,
+		   as well as number of items in each part of the split node (s012 numbers),
+		   and number of bytes (s1bytes) of the shared drop which flow to S1 if any
+		 */
+		lset = LEFT_SHIFT_NO_FLOW;
+		lnver = get_num_ver(vn->vn_mode, tb, h,
+				    lpar - ((h || tb->lbytes == -1) ? 0 : 1),
+				    -1, h ? vn->vn_nr_item : 0, -1,
+				    snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
+		if (!h) {
+			int lnver1;
+
+			lnver1 = get_num_ver(vn->vn_mode, tb, h,
+					     lpar -
+					     ((tb->lbytes != -1) ? 1 : 0),
+					     tb->lbytes, 0, -1,
+					     snum012 + LEFT_SHIFT_FLOW, FLOW);
+			if (lnver > lnver1)
+				lset = LEFT_SHIFT_FLOW, lnver = lnver1;
+		}
 
-	/* Our general shifting strategy is:
-	   1) to minimized number of new nodes;
-	   2) to minimized number of neighbors involved in shifting;
-	   3) to minimized number of disk reads; */
+		/* calculate number of blocks S[h] must be split into when
+		   r_shift_num first items and r_shift_bytes of the left most
+		   liquid item to be shifted are shifted to the right neighbor,
+		   as well as number of items in each part of the split node (s012 numbers),
+		   and number of bytes (s1bytes) of the shared drop which flow to S1 if any
+		 */
+		rset = RIGHT_SHIFT_NO_FLOW;
+		rnver = get_num_ver(vn->vn_mode, tb, h,
+				    0, -1,
+				    h ? (vn->vn_nr_item - rpar) : (rpar -
+								   ((tb->
+								     rbytes !=
+								     -1) ? 1 :
+								    0)), -1,
+				    snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
+		if (!h) {
+			int rnver1;
+
+			rnver1 = get_num_ver(vn->vn_mode, tb, h,
+					     0, -1,
+					     (rpar -
+					      ((tb->rbytes != -1) ? 1 : 0)),
+					     tb->rbytes,
+					     snum012 + RIGHT_SHIFT_FLOW, FLOW);
+
+			if (rnver > rnver1)
+				rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
+		}
 
-	/* we can win TWO or ONE nodes by shifting in both directions */
-	if (lrnver < lnver && lrnver < rnver)
-	{
-	    RFALSE( h && 
-		    (tb->lnum[h] != 1 || 
-		     tb->rnum[h] != 1 || 
-		     lrnver != 1 || rnver != 2 || lnver != 2 || h != 1),
-		    "vs-8230: bad h");
-	    if (lrset == LR_SHIFT_FLOW)
-		set_parameters (tb, h, tb->lnum[h], tb->rnum[h], lrnver, snum012 + lrset,
-				tb->lbytes, tb->rbytes);
-	    else
-		set_parameters (tb, h, tb->lnum[h] - ((tb->lbytes == -1) ? 0 : 1), 
-				tb->rnum[h] - ((tb->rbytes == -1) ? 0 : 1), lrnver, snum012 + lrset, -1, -1);
-
-	    return CARRY_ON;
-	}
+		/* calculate number of blocks S[h] must be split into when
+		   items are shifted in both directions,
+		   as well as number of items in each part of the split node (s012 numbers),
+		   and number of bytes (s1bytes) of the shared drop which flow to S1 if any
+		 */
+		lrset = LR_SHIFT_NO_FLOW;
+		lrnver = get_num_ver(vn->vn_mode, tb, h,
+				     lpar - ((h || tb->lbytes == -1) ? 0 : 1),
+				     -1,
+				     h ? (vn->vn_nr_item - rpar) : (rpar -
+								    ((tb->
+								      rbytes !=
+								      -1) ? 1 :
+								     0)), -1,
+				     snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
+		if (!h) {
+			int lrnver1;
+
+			lrnver1 = get_num_ver(vn->vn_mode, tb, h,
+					      lpar -
+					      ((tb->lbytes != -1) ? 1 : 0),
+					      tb->lbytes,
+					      (rpar -
+					       ((tb->rbytes != -1) ? 1 : 0)),
+					      tb->rbytes,
+					      snum012 + LR_SHIFT_FLOW, FLOW);
+			if (lrnver > lrnver1)
+				lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
+		}
 
-	/* if shifting doesn't lead to better packing then don't shift */
-	if (nver == lrnver)
-	{
-	    set_parameters (tb, h, 0, 0, nver, snum012 + nset, -1, -1);
-	    return CARRY_ON;
-	}
+		/* Our general shifting strategy is:
+		   1) to minimize the number of new nodes;
+		   2) to minimize the number of neighbors involved in shifting;
+		   3) to minimize the number of disk reads; */
+
+		/* we can win TWO or ONE nodes by shifting in both directions */
+		if (lrnver < lnver && lrnver < rnver) {
+			RFALSE(h &&
+			       (tb->lnum[h] != 1 ||
+				tb->rnum[h] != 1 ||
+				lrnver != 1 || rnver != 2 || lnver != 2
+				|| h != 1), "vs-8230: bad h");
+			if (lrset == LR_SHIFT_FLOW)
+				set_parameters(tb, h, tb->lnum[h], tb->rnum[h],
+					       lrnver, snum012 + lrset,
+					       tb->lbytes, tb->rbytes);
+			else
+				set_parameters(tb, h,
+					       tb->lnum[h] -
+					       ((tb->lbytes == -1) ? 0 : 1),
+					       tb->rnum[h] -
+					       ((tb->rbytes == -1) ? 0 : 1),
+					       lrnver, snum012 + lrset, -1, -1);
+
+			return CARRY_ON;
+		}
 
+		/* if shifting doesn't lead to better packing then don't shift */
+		if (nver == lrnver) {
+			set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
+				       -1);
+			return CARRY_ON;
+		}
 
-	/* now we know that for better packing shifting in only one
-	   direction either to the left or to the right is required */
+		/* now we know that for better packing shifting in only one
+		   direction either to the left or to the right is required */
 
-	/*  if shifting to the left is better than shifting to the right */
-	if (lnver < rnver)
-	{
-	    SET_PAR_SHIFT_LEFT;
-	    return CARRY_ON;
-	}
+		/*  if shifting to the left is better than shifting to the right */
+		if (lnver < rnver) {
+			SET_PAR_SHIFT_LEFT;
+			return CARRY_ON;
+		}
 
-	/* if shifting to the right is better than shifting to the left */
-	if (lnver > rnver)
-	{
-	    SET_PAR_SHIFT_RIGHT;
-	    return CARRY_ON;
-	}
+		/* if shifting to the right is better than shifting to the left */
+		if (lnver > rnver) {
+			SET_PAR_SHIFT_RIGHT;
+			return CARRY_ON;
+		}
 
+		/* now shifting in either direction gives the same number
+		   of nodes and we can make use of the cached neighbors */
+		if (is_left_neighbor_in_cache(tb, h)) {
+			SET_PAR_SHIFT_LEFT;
+			return CARRY_ON;
+		}
 
-	/* now shifting in either direction gives the same number
-	   of nodes and we can make use of the cached neighbors */
-	if (is_left_neighbor_in_cache (tb,h))
-	{
-	    SET_PAR_SHIFT_LEFT;
-	    return CARRY_ON;
+		/* shift to the right regardless of whether the right neighbor is in cache or not */
+		SET_PAR_SHIFT_RIGHT;
+		return CARRY_ON;
 	}
-
-	/* shift to the right independently on whether the right neighbor in cache or not */
-	SET_PAR_SHIFT_RIGHT;
-	return CARRY_ON;
-    }
 }
 
-
 /* Check whether current node S[h] is balanced when Decreasing its size by
  * Deleting or Cutting for INTERNAL node of S+tree.
  * Calculate parameters for balancing for current level h.
@@ -1513,157 +1553,173 @@ static int ip_check_balance (struct tree_balance * tb, int h)
  * Note: Items of internal nodes have fixed size, so the balance condition for
  * the internal part of S+tree is as for the B-trees.
  */
-static int dc_check_balance_internal (struct tree_balance * tb, int h)
+static int dc_check_balance_internal(struct tree_balance *tb, int h)
 {
-  struct virtual_node * vn = tb->tb_vn;
+	struct virtual_node *vn = tb->tb_vn;
 
-  /* Sh is the node whose balance is currently being checked,
-     and Fh is its father.  */
-  struct buffer_head * Sh, * Fh;
-  int maxsize,
-      n_ret_value;
-  int lfree, rfree /* free space in L and R */;
+	/* Sh is the node whose balance is currently being checked,
+	   and Fh is its father.  */
+	struct buffer_head *Sh, *Fh;
+	int maxsize, n_ret_value;
+	int lfree, rfree /* free space in L and R */ ;
 
-  Sh = PATH_H_PBUFFER (tb->tb_path, h); 
-  Fh = PATH_H_PPARENT (tb->tb_path, h); 
+	Sh = PATH_H_PBUFFER(tb->tb_path, h);
+	Fh = PATH_H_PPARENT(tb->tb_path, h);
 
-  maxsize = MAX_CHILD_SIZE(Sh); 
+	maxsize = MAX_CHILD_SIZE(Sh);
 
 /*   using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */
 /*   new_nr_item = number of items node would have if operation is */
 /* 	performed without balancing (new_nr_item); */
-  create_virtual_node (tb, h);
+	create_virtual_node(tb, h);
 
-  if ( ! Fh )
-    {   /* S[h] is the root. */
-      if ( vn->vn_nr_item > 0 )
-	{
-	  set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
-	  return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */
+	if (!Fh) {		/* S[h] is the root. */
+		if (vn->vn_nr_item > 0) {
+			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
+			return NO_BALANCING_NEEDED;	/* no balancing for higher levels needed */
+		}
+		/* new_nr_item == 0.
+		 * Current root will be deleted resulting in
+		 * decrementing the tree height. */
+		set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
+		return CARRY_ON;
+	}
+
+	if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)
+		return n_ret_value;
+
+	/* get free space of neighbors */
+	rfree = get_rfree(tb, h);
+	lfree = get_lfree(tb, h);
+
+	/* determine maximal number of items we can fit into neighbors */
+	check_left(tb, h, lfree);
+	check_right(tb, h, rfree);
+
+	if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) {	/* Balance condition for the internal node is valid.
+						 * In this case we balance only if it leads to better packing. */
+		if (vn->vn_nr_item == MIN_NR_KEY(Sh)) {	/* Here we join S[h] with one of its neighbors,
+							 * which is impossible with greater values of new_nr_item. */
+			if (tb->lnum[h] >= vn->vn_nr_item + 1) {
+				/* All contents of S[h] can be moved to L[h]. */
+				int n;
+				int order_L;
+
+				order_L =
+				    ((n =
+				      PATH_H_B_ITEM_ORDER(tb->tb_path,
+							  h)) ==
+				     0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
+				n = dc_size(B_N_CHILD(tb->FL[h], order_L)) /
+				    (DC_SIZE + KEY_SIZE);
+				set_parameters(tb, h, -n - 1, 0, 0, NULL, -1,
+					       -1);
+				return CARRY_ON;
+			}
+
+			if (tb->rnum[h] >= vn->vn_nr_item + 1) {
+				/* All contents of S[h] can be moved to R[h]. */
+				int n;
+				int order_R;
+
+				order_R =
+				    ((n =
+				      PATH_H_B_ITEM_ORDER(tb->tb_path,
+							  h)) ==
+				     B_NR_ITEMS(Fh)) ? 0 : n + 1;
+				n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
+				    (DC_SIZE + KEY_SIZE);
+				set_parameters(tb, h, 0, -n - 1, 0, NULL, -1,
+					       -1);
+				return CARRY_ON;
+			}
+		}
+
+		if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
+			/* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
+			int to_r;
+
+			to_r =
+			    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] -
+			     tb->rnum[h] + vn->vn_nr_item + 1) / 2 -
+			    (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
+			set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r,
+				       0, NULL, -1, -1);
+			return CARRY_ON;
+		}
+
+		/* Balancing does not lead to better packing. */
+		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
+		return NO_BALANCING_NEEDED;
 	}
-      /* new_nr_item == 0.
-       * Current root will be deleted resulting in
-       * decrementing the tree height. */
-      set_parameters (tb, h, 0, 0, 0, NULL, -1, -1);
-      return CARRY_ON;
-    }
-
-  if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON )
-    return n_ret_value;
-
-
-  /* get free space of neighbors */
-  rfree = get_rfree (tb, h);
-  lfree = get_lfree (tb, h);
-		
-  /* determine maximal number of items we can fit into neighbors */
-  check_left (tb, h, lfree);
-  check_right (tb, h, rfree);
-
-
-  if ( vn->vn_nr_item >= MIN_NR_KEY(Sh) )
-    { /* Balance condition for the internal node is valid.
-       * In this case we balance only if it leads to better packing. */ 
-      if ( vn->vn_nr_item == MIN_NR_KEY(Sh) )
-	{ /* Here we join S[h] with one of its neighbors,
-	   * which is impossible with greater values of new_nr_item. */
-	  if ( tb->lnum[h] >= vn->vn_nr_item + 1 )
-	    {
-	      /* All contents of S[h] can be moved to L[h]. */
-	      int n;
-	      int order_L;
-	      
-	      order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
-	      n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE);
-	      set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1);
-	      return CARRY_ON;
-	    }
-
-	  if ( tb->rnum[h] >= vn->vn_nr_item + 1 )
-	    {
-	      /* All contents of S[h] can be moved to R[h]. */
-	      int n;
-	      int order_R;
-	    
-	      order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : n + 1;
-	      n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE);
-	      set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1);
-	      return CARRY_ON;   
-	    }
+
+	/* Current node contains an insufficient number of items. Balancing is required. */
+	/* Check whether we can merge S[h] with left neighbor. */
+	if (tb->lnum[h] >= vn->vn_nr_item + 1)
+		if (is_left_neighbor_in_cache(tb, h)
+		    || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
+			int n;
+			int order_L;
+
+			order_L =
+			    ((n =
+			      PATH_H_B_ITEM_ORDER(tb->tb_path,
+						  h)) ==
+			     0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
+			n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE +
+								      KEY_SIZE);
+			set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1);
+			return CARRY_ON;
+		}
+
+	/* Check whether we can merge S[h] with right neighbor. */
+	if (tb->rnum[h] >= vn->vn_nr_item + 1) {
+		int n;
+		int order_R;
+
+		order_R =
+		    ((n =
+		      PATH_H_B_ITEM_ORDER(tb->tb_path,
+					  h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1);
+		n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE +
+							      KEY_SIZE);
+		set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1);
+		return CARRY_ON;
 	}
 
-      if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)
-	{
-	  /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
-	  int to_r;
+	/* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
+	if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
+		int to_r;
+
+		to_r =
+		    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
+		     vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
+						tb->rnum[h]);
+		set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
+			       -1, -1);
+		return CARRY_ON;
+	}
 
-	  to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 - 
-	    (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
-	  set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
-	  return CARRY_ON;
+	/* For internal nodes try to borrow item from a neighbor */
+	RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
+
+	/* Borrow one or two items from caching neighbor */
+	if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
+		int from_l;
+
+		from_l =
+		    (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
+		     1) / 2 - (vn->vn_nr_item + 1);
+		set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
+		return CARRY_ON;
 	}
 
-      /* Balancing does not lead to better packing. */
-      set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
-      return NO_BALANCING_NEEDED;
-    }
-
-  /* Current node contain insufficient number of items. Balancing is required. */	
-  /* Check whether we can merge S[h] with left neighbor. */
-  if (tb->lnum[h] >= vn->vn_nr_item + 1)
-    if (is_left_neighbor_in_cache (tb,h) || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h])
-      {
-	int n;
-	int order_L;
-	      
-	order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
-	n = dc_size(B_N_CHILD(tb->FL[h],order_L)) / (DC_SIZE + KEY_SIZE);
-	set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1);
+	set_parameters(tb, h, 0,
+		       -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
+			  1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
 	return CARRY_ON;
-      }
-
-  /* Check whether we can merge S[h] with right neighbor. */
-  if (tb->rnum[h] >= vn->vn_nr_item + 1)
-    {
-      int n;
-      int order_R;
-	    
-      order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : (n + 1);
-      n = dc_size(B_N_CHILD(tb->FR[h],order_R)) / (DC_SIZE + KEY_SIZE);
-      set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1);
-      return CARRY_ON;   
-    }
-
-  /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
-  if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)
-    {
-      int to_r;
-	    
-      to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 - 
-	(MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
-      set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1);
-      return CARRY_ON;
-    }
-
-  /* For internal nodes try to borrow item from a neighbor */
-  RFALSE( !tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
-
-  /* Borrow one or two items from caching neighbor */
-  if (is_left_neighbor_in_cache (tb,h) || !tb->FR[h])
-    {
-      int from_l;
-		
-      from_l = (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item + 1) / 2 -  (vn->vn_nr_item + 1);
-      set_parameters (tb, h, -from_l, 0, 1, NULL, -1, -1);
-      return CARRY_ON;
-    }
-
-  set_parameters (tb, h, 0, -((MAX_NR_KEY(Sh)+1-tb->rnum[h]+vn->vn_nr_item+1)/2-(vn->vn_nr_item+1)), 1, 
-		  NULL, -1, -1);
-  return CARRY_ON;
 }
 
-
 /* Check whether current node S[h] is balanced when Decreasing its size by
  * Deleting or Truncating for LEAF node of S+tree.
  * Calculate parameters for balancing for current level h.
@@ -1677,90 +1733,86 @@ static int dc_check_balance_internal (struct tree_balance * tb, int h)
  *	       -1 - no balancing for higher levels needed;
  *	       -2 - no disk space.
  */
-static int dc_check_balance_leaf (struct tree_balance * tb, int h)
+static int dc_check_balance_leaf(struct tree_balance *tb, int h)
 {
-  struct virtual_node * vn = tb->tb_vn;
-
-  /* Number of bytes that must be deleted from
-     (value is negative if bytes are deleted) buffer which
-     contains node being balanced.  The mnemonic is that the
-     attempted change in node space used level is levbytes bytes. */
-  int levbytes;
-  /* the maximal item size */
-  int maxsize,
-      n_ret_value;
-  /* S0 is the node whose balance is currently being checked,
-     and F0 is its father.  */
-  struct buffer_head * S0, * F0;
-  int lfree, rfree /* free space in L and R */;
-
-  S0 = PATH_H_PBUFFER (tb->tb_path, 0);
-  F0 = PATH_H_PPARENT (tb->tb_path, 0);
-
-  levbytes = tb->insert_size[h];
-
-  maxsize = MAX_CHILD_SIZE(S0); 	/* maximal possible size of an item */
-
-  if ( ! F0 )
-    {  /* S[0] is the root now. */
-
-      RFALSE( -levbytes >= maxsize - B_FREE_SPACE (S0),
-	      "vs-8240: attempt to create empty buffer tree");
-
-      set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
-      return NO_BALANCING_NEEDED;
-    }
-
-  if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON )
-    return n_ret_value;
-
-  /* get free space of neighbors */
-  rfree = get_rfree (tb, h);
-  lfree = get_lfree (tb, h);		
-
-  create_virtual_node (tb, h);
-
-  /* if 3 leaves can be merge to one, set parameters and return */
-  if (are_leaves_removable (tb, lfree, rfree))
-    return CARRY_ON;
-
-  /* determine maximal number of items we can shift to the left/right  neighbor
-     and the maximal number of bytes that can flow to the left/right neighbor
-     from the left/right most liquid item that cannot be shifted from S[0] entirely
-     */
-  check_left (tb, h, lfree);
-  check_right (tb, h, rfree);   
-
-  /* check whether we can merge S with left neighbor. */
-  if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
-    if (is_left_neighbor_in_cache (tb,h) ||
-	((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */
-	!tb->FR[h]) {
-      
-      RFALSE( !tb->FL[h], "vs-8245: dc_check_balance_leaf: FL[h] must exist");
-
-      /* set parameter to merge S[0] with its left neighbor */
-      set_parameters (tb, h, -1, 0, 0, NULL, -1, -1);
-      return CARRY_ON;
-    }
-
-  /* check whether we can merge S[0] with right neighbor. */
-  if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
-    set_parameters (tb, h, 0, -1, 0, NULL, -1, -1);
-    return CARRY_ON;
-  }
-  
-  /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */
-  if (is_leaf_removable (tb))
-    return CARRY_ON;
-  
-  /* Balancing is not required. */
-  tb->s0num = vn->vn_nr_item;
-  set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
-  return NO_BALANCING_NEEDED;
-}
+	struct virtual_node *vn = tb->tb_vn;
+
+	/* Number of bytes that must be deleted from
+	   (value is negative if bytes are deleted) buffer which
+	   contains node being balanced.  The mnemonic is that the
+	   attempted change in node space used level is levbytes bytes. */
+	int levbytes;
+	/* the maximal item size */
+	int maxsize, n_ret_value;
+	/* S0 is the node whose balance is currently being checked,
+	   and F0 is its father.  */
+	struct buffer_head *S0, *F0;
+	int lfree, rfree /* free space in L and R */ ;
+
+	S0 = PATH_H_PBUFFER(tb->tb_path, 0);
+	F0 = PATH_H_PPARENT(tb->tb_path, 0);
 
+	levbytes = tb->insert_size[h];
 
+	maxsize = MAX_CHILD_SIZE(S0);	/* maximal possible size of an item */
+
+	if (!F0) {		/* S[0] is the root now. */
+
+		RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0),
+		       "vs-8240: attempt to create empty buffer tree");
+
+		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
+		return NO_BALANCING_NEEDED;
+	}
+
+	if ((n_ret_value = get_parents(tb, h)) != CARRY_ON)
+		return n_ret_value;
+
+	/* get free space of neighbors */
+	rfree = get_rfree(tb, h);
+	lfree = get_lfree(tb, h);
+
+	create_virtual_node(tb, h);
+
+	/* if 3 leaves can be merged into one, set parameters and return */
+	if (are_leaves_removable(tb, lfree, rfree))
+		return CARRY_ON;
+
+	/* determine maximal number of items we can shift to the left/right  neighbor
+	   and the maximal number of bytes that can flow to the left/right neighbor
+	   from the left/right most liquid item that cannot be shifted from S[0] entirely
+	 */
+	check_left(tb, h, lfree);
+	check_right(tb, h, rfree);
+
+	/* check whether we can merge S with left neighbor. */
+	if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
+		if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) ||	/* S can not be merged with R */
+		    !tb->FR[h]) {
+
+			RFALSE(!tb->FL[h],
+			       "vs-8245: dc_check_balance_leaf: FL[h] must exist");
+
+			/* set parameter to merge S[0] with its left neighbor */
+			set_parameters(tb, h, -1, 0, 0, NULL, -1, -1);
+			return CARRY_ON;
+		}
+
+	/* check whether we can merge S[0] with right neighbor. */
+	if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
+		set_parameters(tb, h, 0, -1, 0, NULL, -1, -1);
+		return CARRY_ON;
+	}
+
+	/* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */
+	if (is_leaf_removable(tb))
+		return CARRY_ON;
+
+	/* Balancing is not required. */
+	tb->s0num = vn->vn_nr_item;
+	set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
+	return NO_BALANCING_NEEDED;
+}
 
 /* Check whether current node S[h] is balanced when Decreasing its size by
  * Deleting or Cutting.
@@ -1775,18 +1827,17 @@ static int dc_check_balance_leaf (struct tree_balance * tb, int h)
  *	       -1 - no balancing for higher levels needed;
  *	       -2 - no disk space.
  */
-static int dc_check_balance (struct tree_balance * tb, int h)
+static int dc_check_balance(struct tree_balance *tb, int h)
 {
- RFALSE( ! (PATH_H_PBUFFER (tb->tb_path, h)), "vs-8250: S is not initialized");
+	RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
+	       "vs-8250: S is not initialized");
 
- if ( h )
-   return dc_check_balance_internal (tb, h);
- else
-   return dc_check_balance_leaf (tb, h);
+	if (h)
+		return dc_check_balance_internal(tb, h);
+	else
+		return dc_check_balance_leaf(tb, h);
 }
 
-
-
 /* Check whether current node S[h] is balanced.
  * Calculate parameters for balancing for current level h.
  * Parameters:
@@ -1805,83 +1856,80 @@ static int dc_check_balance (struct tree_balance * tb, int h)
  *	       -1 - no balancing for higher levels needed;
  *	       -2 - no disk space.
  */
-static int check_balance (int mode, 
-			  struct tree_balance * tb,
-			  int h, 
-			  int inum,
-			  int pos_in_item,
-			  struct item_head * ins_ih,
-			  const void * data
-			  )
+static int check_balance(int mode,
+			 struct tree_balance *tb,
+			 int h,
+			 int inum,
+			 int pos_in_item,
+			 struct item_head *ins_ih, const void *data)
 {
-  struct virtual_node * vn;
+	struct virtual_node *vn;
 
-  vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
-  vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
-  vn->vn_mode = mode;
-  vn->vn_affected_item_num = inum;
-  vn->vn_pos_in_item = pos_in_item;
-  vn->vn_ins_ih = ins_ih;
-  vn->vn_data = data;
+	vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
+	vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
+	vn->vn_mode = mode;
+	vn->vn_affected_item_num = inum;
+	vn->vn_pos_in_item = pos_in_item;
+	vn->vn_ins_ih = ins_ih;
+	vn->vn_data = data;
 
-  RFALSE( mode == M_INSERT && !vn->vn_ins_ih,
-	  "vs-8255: ins_ih can not be 0 in insert mode");
+	RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
+	       "vs-8255: ins_ih can not be 0 in insert mode");
 
- if ( tb->insert_size[h] > 0 )
-   /* Calculate balance parameters when size of node is increasing. */
-   return ip_check_balance (tb, h);
+	if (tb->insert_size[h] > 0)
+		/* Calculate balance parameters when size of node is increasing. */
+		return ip_check_balance(tb, h);
 
- /* Calculate balance parameters when  size of node is decreasing. */
- return dc_check_balance (tb, h);
+	/* Calculate balance parameters when  size of node is decreasing. */
+	return dc_check_balance(tb, h);
 }
 
+/* Check whether the parent in the path is really the parent of the current node. */
+static int get_direct_parent(struct tree_balance *p_s_tb, int n_h)
+{
+	struct buffer_head *p_s_bh;
+	struct path *p_s_path = p_s_tb->tb_path;
+	int n_position,
+	    n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
+
+	/* We are in the root or in the new root. */
+	if (n_path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
+
+		RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
+		       "PAP-8260: invalid offset in the path");
+
+		if (PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)->
+		    b_blocknr == SB_ROOT_BLOCK(p_s_tb->tb_sb)) {
+			/* Root is not changed. */
+			PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL;
+			PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0;
+			return CARRY_ON;
+		}
+		return REPEAT_SEARCH;	/* Root is changed and we must recalculate the path. */
+	}
+
+	if (!B_IS_IN_TREE
+	    (p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1)))
+		return REPEAT_SEARCH;	/* Parent in the path is not in the tree. */
 
+	if ((n_position =
+	     PATH_OFFSET_POSITION(p_s_path,
+				  n_path_offset - 1)) > B_NR_ITEMS(p_s_bh))
+		return REPEAT_SEARCH;
 
-/* Check whether parent at the path is the really parent of the current node.*/
-static int  get_direct_parent(
-              struct tree_balance * p_s_tb,
-              int                   n_h
-            ) {
-    struct buffer_head  * p_s_bh;
-    struct path         * p_s_path      = p_s_tb->tb_path;
-    int                   n_position,
-	n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h);
-    
-    /* We are in the root or in the new root. */
-    if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) {
-	
-	RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
-		"PAP-8260: invalid offset in the path");
-
-	if ( PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
-	     SB_ROOT_BLOCK (p_s_tb->tb_sb) ) {
-	    /* Root is not changed. */
-	    PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL;
-	    PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0;
-	    return CARRY_ON;
+	if (B_N_CHILD_NUM(p_s_bh, n_position) !=
+	    PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr)
+		/* Parent in the path is not parent of the current node in the tree. */
+		return REPEAT_SEARCH;
+
+	if (buffer_locked(p_s_bh)) {
+		__wait_on_buffer(p_s_bh);
+		if (FILESYSTEM_CHANGED_TB(p_s_tb))
+			return REPEAT_SEARCH;
 	}
-	return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */
-    }
-
-    if ( ! B_IS_IN_TREE(p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1)) )
-	return REPEAT_SEARCH; /* Parent in the path is not in the tree. */
-
-    if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) > B_NR_ITEMS(p_s_bh) )
-	return REPEAT_SEARCH;
-    
-    if ( B_N_CHILD_NUM(p_s_bh, n_position) != PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr )
-	/* Parent in the path is not parent of the current node in the tree. */
-	return REPEAT_SEARCH;
-
-    if ( buffer_locked(p_s_bh) ) {
-	__wait_on_buffer(p_s_bh);
-	if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
-	    return REPEAT_SEARCH;
-    }
-
-    return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node.  */
-}
 
+	return CARRY_ON;	/* Parent in the path is unlocked and really parent of the current node.  */
+}
 
 /* Using lnum[n_h] and rnum[n_h] we should determine what neighbors
  * of S[n_h] we
@@ -1889,356 +1937,401 @@ static int  get_direct_parent(
  * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
  *	        CARRY_ON - schedule didn't occur while the function worked;
  */
-static int  get_neighbors(
-	            struct tree_balance * p_s_tb,
-	            int 		  n_h
-	          ) {
-    int		 	n_child_position,
-	n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1);
-    unsigned long		n_son_number;
-    struct super_block  *	p_s_sb = p_s_tb->tb_sb;
-    struct buffer_head  * p_s_bh;
-
-
-    PROC_INFO_INC( p_s_sb, get_neighbors[ n_h ] );
-
-    if ( p_s_tb->lnum[n_h] ) {
-	/* We need left neighbor to balance S[n_h]. */
-	PROC_INFO_INC( p_s_sb, need_l_neighbor[ n_h ] );
-	p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
-	
-	RFALSE( p_s_bh == p_s_tb->FL[n_h] && 
-		! PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset),
-		"PAP-8270: invalid position in the parent");
-
-	n_child_position = ( p_s_bh == p_s_tb->FL[n_h] ) ? p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]);
-	n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position);
-	p_s_bh = sb_bread(p_s_sb, n_son_number);
-	if (!p_s_bh)
-	    return IO_ERROR;
-	if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
-	    decrement_bcount(p_s_bh);
-	    PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] );
-	    return REPEAT_SEARCH;
+static int get_neighbors(struct tree_balance *p_s_tb, int n_h)
+{
+	int n_child_position,
+	    n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1);
+	unsigned long n_son_number;
+	struct super_block *p_s_sb = p_s_tb->tb_sb;
+	struct buffer_head *p_s_bh;
+
+	PROC_INFO_INC(p_s_sb, get_neighbors[n_h]);
+
+	if (p_s_tb->lnum[n_h]) {
+		/* We need left neighbor to balance S[n_h]. */
+		PROC_INFO_INC(p_s_sb, need_l_neighbor[n_h]);
+		p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
+
+		RFALSE(p_s_bh == p_s_tb->FL[n_h] &&
+		       !PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset),
+		       "PAP-8270: invalid position in the parent");
+
+		n_child_position =
+		    (p_s_bh ==
+		     p_s_tb->FL[n_h]) ? p_s_tb->lkey[n_h] : B_NR_ITEMS(p_s_tb->
+								       FL[n_h]);
+		n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position);
+		p_s_bh = sb_bread(p_s_sb, n_son_number);
+		if (!p_s_bh)
+			return IO_ERROR;
+		if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
+			decrement_bcount(p_s_bh);
+			PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]);
+			return REPEAT_SEARCH;
+		}
+
+		RFALSE(!B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
+		       n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) ||
+		       B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) !=
+		       p_s_bh->b_blocknr, "PAP-8275: invalid parent");
+		RFALSE(!B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child");
+		RFALSE(!n_h &&
+		       B_FREE_SPACE(p_s_bh) !=
+		       MAX_CHILD_SIZE(p_s_bh) -
+		       dc_size(B_N_CHILD(p_s_tb->FL[0], n_child_position)),
+		       "PAP-8290: invalid child size of left neighbor");
+
+		decrement_bcount(p_s_tb->L[n_h]);
+		p_s_tb->L[n_h] = p_s_bh;
 	}
-	
-	RFALSE( ! B_IS_IN_TREE(p_s_tb->FL[n_h]) ||
-                n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) ||
-	        B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) !=
-                p_s_bh->b_blocknr, "PAP-8275: invalid parent");
-	RFALSE( ! B_IS_IN_TREE(p_s_bh), "PAP-8280: invalid child");
-	RFALSE( ! n_h &&
-                B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FL[0],n_child_position)),
-                "PAP-8290: invalid child size of left neighbor");
-
-	decrement_bcount(p_s_tb->L[n_h]);
-	p_s_tb->L[n_h] = p_s_bh;
-    }
-
-
-    if ( p_s_tb->rnum[n_h] ) { /* We need right neighbor to balance S[n_path_offset]. */
-	PROC_INFO_INC( p_s_sb, need_r_neighbor[ n_h ] );
-	p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
-	
-	RFALSE( p_s_bh == p_s_tb->FR[n_h] && 
-		PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset) >= B_NR_ITEMS(p_s_bh),
-		"PAP-8295: invalid position in the parent");
-
-	n_child_position = ( p_s_bh == p_s_tb->FR[n_h] ) ? p_s_tb->rkey[n_h] + 1 : 0;
-	n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position);
-	p_s_bh = sb_bread(p_s_sb, n_son_number);
-	if (!p_s_bh)
-	    return IO_ERROR;
-	if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
-	    decrement_bcount(p_s_bh);
-	    PROC_INFO_INC( p_s_sb, get_neighbors_restart[ n_h ] );
-	    return REPEAT_SEARCH;
+
+	if (p_s_tb->rnum[n_h]) {	/* We need right neighbor to balance S[n_path_offset]. */
+		PROC_INFO_INC(p_s_sb, need_r_neighbor[n_h]);
+		p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset);
+
+		RFALSE(p_s_bh == p_s_tb->FR[n_h] &&
+		       PATH_OFFSET_POSITION(p_s_tb->tb_path,
+					    n_path_offset) >=
+		       B_NR_ITEMS(p_s_bh),
+		       "PAP-8295: invalid position in the parent");
+
+		n_child_position =
+		    (p_s_bh == p_s_tb->FR[n_h]) ? p_s_tb->rkey[n_h] + 1 : 0;
+		n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position);
+		p_s_bh = sb_bread(p_s_sb, n_son_number);
+		if (!p_s_bh)
+			return IO_ERROR;
+		if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
+			decrement_bcount(p_s_bh);
+			PROC_INFO_INC(p_s_sb, get_neighbors_restart[n_h]);
+			return REPEAT_SEARCH;
+		}
+		decrement_bcount(p_s_tb->R[n_h]);
+		p_s_tb->R[n_h] = p_s_bh;
+
+		RFALSE(!n_h
+		       && B_FREE_SPACE(p_s_bh) !=
+		       MAX_CHILD_SIZE(p_s_bh) -
+		       dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position)),
+		       "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
+		       B_FREE_SPACE(p_s_bh), MAX_CHILD_SIZE(p_s_bh),
+		       dc_size(B_N_CHILD(p_s_tb->FR[0], n_child_position)));
+
 	}
-	decrement_bcount(p_s_tb->R[n_h]);
-	p_s_tb->R[n_h] = p_s_bh;
-
-	RFALSE( ! n_h && B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position)),
-                "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
-                B_FREE_SPACE (p_s_bh), MAX_CHILD_SIZE (p_s_bh),
-                dc_size(B_N_CHILD (p_s_tb->FR[0],n_child_position)));
-	
-    }
-    return CARRY_ON;
+	return CARRY_ON;
 }
 
 #ifdef CONFIG_REISERFS_CHECK
-void * reiserfs_kmalloc (size_t size, int flags, struct super_block * s)
+void *reiserfs_kmalloc(size_t size, int flags, struct super_block *s)
 {
-    void * vp;
-    static size_t malloced;
-
-
-    vp = kmalloc (size, flags);
-    if (vp) {
-	REISERFS_SB(s)->s_kmallocs += size;
-	if (REISERFS_SB(s)->s_kmallocs > malloced + 200000) {
-	    reiserfs_warning (s,
-			      "vs-8301: reiserfs_kmalloc: allocated memory %d",
-			      REISERFS_SB(s)->s_kmallocs);
-	    malloced = REISERFS_SB(s)->s_kmallocs;
+	void *vp;
+	static size_t malloced;
+
+	vp = kmalloc(size, flags);
+	if (vp) {
+		REISERFS_SB(s)->s_kmallocs += size;
+		if (REISERFS_SB(s)->s_kmallocs > malloced + 200000) {
+			reiserfs_warning(s,
+					 "vs-8301: reiserfs_kmalloc: allocated memory %d",
+					 REISERFS_SB(s)->s_kmallocs);
+			malloced = REISERFS_SB(s)->s_kmallocs;
+		}
 	}
-    }
-    return vp;
+	return vp;
 }
 
-void reiserfs_kfree (const void * vp, size_t size, struct super_block * s)
+void reiserfs_kfree(const void *vp, size_t size, struct super_block *s)
 {
-    kfree (vp);
-  
-    REISERFS_SB(s)->s_kmallocs -= size;
-    if (REISERFS_SB(s)->s_kmallocs < 0)
-	reiserfs_warning (s, "vs-8302: reiserfs_kfree: allocated memory %d",
-			  REISERFS_SB(s)->s_kmallocs);
+	kfree(vp);
+
+	REISERFS_SB(s)->s_kmallocs -= size;
+	if (REISERFS_SB(s)->s_kmallocs < 0)
+		reiserfs_warning(s,
+				 "vs-8302: reiserfs_kfree: allocated memory %d",
+				 REISERFS_SB(s)->s_kmallocs);
 
 }
 #endif
 
-
-static int get_virtual_node_size (struct super_block * sb, struct buffer_head * bh)
+static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
 {
-    int max_num_of_items;
-    int max_num_of_entries;
-    unsigned long blocksize = sb->s_blocksize;
+	int max_num_of_items;
+	int max_num_of_entries;
+	unsigned long blocksize = sb->s_blocksize;
 
 #define MIN_NAME_LEN 1
 
-    max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
-    max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) / 
-                         (DEH_SIZE + MIN_NAME_LEN);
+	max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
+	max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
+	    (DEH_SIZE + MIN_NAME_LEN);
 
-    return sizeof(struct virtual_node) + 
-           max(max_num_of_items * sizeof (struct virtual_item),
-	       sizeof (struct virtual_item) + sizeof(struct direntry_uarea) + 
-               (max_num_of_entries - 1) * sizeof (__u16));
+	return sizeof(struct virtual_node) +
+	    max(max_num_of_items * sizeof(struct virtual_item),
+		sizeof(struct virtual_item) + sizeof(struct direntry_uarea) +
+		(max_num_of_entries - 1) * sizeof(__u16));
 }
 
-
-
 /* maybe we should fail balancing we are going to perform when kmalloc
    fails several times. But now it will loop until kmalloc gets
    required memory */
-static int get_mem_for_virtual_node (struct tree_balance * tb)
+static int get_mem_for_virtual_node(struct tree_balance *tb)
 {
-    int check_fs = 0;
-    int size;
-    char * buf;
-
-    size = get_virtual_node_size (tb->tb_sb, PATH_PLAST_BUFFER (tb->tb_path));
-
-    if (size > tb->vn_buf_size) {
-	/* we have to allocate more memory for virtual node */
-	if (tb->vn_buf) {
-	    /* free memory allocated before */
-	    reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
-	    /* this is not needed if kfree is atomic */
-            check_fs = 1;
-	}
+	int check_fs = 0;
+	int size;
+	char *buf;
+
+	size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
+
+	if (size > tb->vn_buf_size) {
+		/* we have to allocate more memory for virtual node */
+		if (tb->vn_buf) {
+			/* free memory allocated before */
+			reiserfs_kfree(tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
+			/* this is not needed if kfree is atomic */
+			check_fs = 1;
+		}
 
-	/* virtual node requires now more memory */
-	tb->vn_buf_size = size;
-
-	/* get memory for virtual item */
-	buf = reiserfs_kmalloc(size, GFP_ATOMIC | __GFP_NOWARN, tb->tb_sb);
-	if ( ! buf ) {
-	    /* getting memory with GFP_KERNEL priority may involve
-               balancing now (due to indirect_to_direct conversion on
-               dcache shrinking). So, release path and collected
-               resources here */
-	    free_buffers_in_tb (tb);
-	    buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb);
-	    if ( !buf ) {
+		/* virtual node requires now more memory */
+		tb->vn_buf_size = size;
+
+		/* get memory for virtual item */
+		buf =
+		    reiserfs_kmalloc(size, GFP_ATOMIC | __GFP_NOWARN,
+				     tb->tb_sb);
+		if (!buf) {
+			/* getting memory with GFP_KERNEL priority may involve
+			   balancing now (due to indirect_to_direct conversion on
+			   dcache shrinking). So, release path and collected
+			   resources here */
+			free_buffers_in_tb(tb);
+			buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb);
+			if (!buf) {
 #ifdef CONFIG_REISERFS_CHECK
-		reiserfs_warning (tb->tb_sb,
-				  "vs-8345: get_mem_for_virtual_node: "
-				  "kmalloc failed. reiserfs kmalloced %d bytes",
-				  REISERFS_SB(tb->tb_sb)->s_kmallocs);
+				reiserfs_warning(tb->tb_sb,
+						 "vs-8345: get_mem_for_virtual_node: "
+						 "kmalloc failed. reiserfs kmalloced %d bytes",
+						 REISERFS_SB(tb->tb_sb)->
+						 s_kmallocs);
 #endif
-		tb->vn_buf_size = 0;
-	    }
-	    tb->vn_buf = buf;
-	    schedule() ;
-	    return REPEAT_SEARCH;
-	}
+				tb->vn_buf_size = 0;
+			}
+			tb->vn_buf = buf;
+			schedule();
+			return REPEAT_SEARCH;
+		}
 
-	tb->vn_buf = buf;
-    }
+		tb->vn_buf = buf;
+	}
 
-    if ( check_fs && FILESYSTEM_CHANGED_TB (tb) )
-        return REPEAT_SEARCH;
+	if (check_fs && FILESYSTEM_CHANGED_TB(tb))
+		return REPEAT_SEARCH;
 
-    return CARRY_ON;
+	return CARRY_ON;
 }
 
-
 #ifdef CONFIG_REISERFS_CHECK
-static void tb_buffer_sanity_check (struct super_block * p_s_sb,
-				    struct buffer_head * p_s_bh, 
-				    const char *descr, int level) {
-  if (p_s_bh) {
-    if (atomic_read (&(p_s_bh->b_count)) <= 0) {
-
-      reiserfs_panic (p_s_sb, "jmacd-1: tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n", descr, level, p_s_bh);
-    }
-
-    if ( ! buffer_uptodate (p_s_bh) ) {
-      reiserfs_panic (p_s_sb, "jmacd-2: tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n", descr, level, p_s_bh);
-    }
-
-    if ( ! B_IS_IN_TREE (p_s_bh) ) {
-      reiserfs_panic (p_s_sb, "jmacd-3: tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n", descr, level, p_s_bh);
-    }
-
-    if (p_s_bh->b_bdev != p_s_sb->s_bdev) {
-	reiserfs_panic (p_s_sb, "jmacd-4: tb_buffer_sanity_check(): buffer has wrong device %s[%d] (%b)\n", descr, level, p_s_bh);
-    }
-
-    if (p_s_bh->b_size != p_s_sb->s_blocksize) {
-	reiserfs_panic (p_s_sb, "jmacd-5: tb_buffer_sanity_check(): buffer has wrong blocksize %s[%d] (%b)\n", descr, level, p_s_bh);
-    }
-
-    if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) {
-	reiserfs_panic (p_s_sb, "jmacd-6: tb_buffer_sanity_check(): buffer block number too high %s[%d] (%b)\n", descr, level, p_s_bh);
-    }
-  }
-}
-#else
-static void tb_buffer_sanity_check (struct super_block * p_s_sb,
-				    struct buffer_head * p_s_bh, 
-				    const char *descr, int level)
-{;}
-#endif
-
-static int clear_all_dirty_bits(struct super_block *s,
-                                 struct buffer_head *bh) {
-  return reiserfs_prepare_for_journal(s, bh, 0) ;
-}
-
-static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
+static void tb_buffer_sanity_check(struct super_block *p_s_sb,
+				   struct buffer_head *p_s_bh,
+				   const char *descr, int level)
 {
-    struct buffer_head * locked;
-#ifdef CONFIG_REISERFS_CHECK
-    int repeat_counter = 0;
-#endif
-    int i;
+	if (p_s_bh) {
+		if (atomic_read(&(p_s_bh->b_count)) <= 0) {
 
-    do {
-
-	locked = NULL;
-
-	for ( i = p_s_tb->tb_path->path_length; !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i-- ) {
-	    if ( PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i) ) {
-		/* if I understand correctly, we can only be sure the last buffer
-		** in the path is in the tree --clm
-		*/
-#ifdef CONFIG_REISERFS_CHECK
-		if (PATH_PLAST_BUFFER(p_s_tb->tb_path) ==
-		    PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
-		    tb_buffer_sanity_check (p_s_tb->tb_sb, 
-					    PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i), 
-					    "S", 
-					    p_s_tb->tb_path->path_length - i);
+			reiserfs_panic(p_s_sb,
+				       "jmacd-1: tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n",
+				       descr, level, p_s_bh);
 		}
-#endif
-		if (!clear_all_dirty_bits(p_s_tb->tb_sb,
-				     PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)))
-		{
-		    locked = PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i);
-		}
-	    }
-	}
 
-	for ( i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i]; i++ ) { 
+		if (!buffer_uptodate(p_s_bh)) {
+			reiserfs_panic(p_s_sb,
+				       "jmacd-2: tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n",
+				       descr, level, p_s_bh);
+		}
 
-	    if (p_s_tb->lnum[i] ) {
+		if (!B_IS_IN_TREE(p_s_bh)) {
+			reiserfs_panic(p_s_sb,
+				       "jmacd-3: tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n",
+				       descr, level, p_s_bh);
+		}
 
-		if ( p_s_tb->L[i] ) {
-		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->L[i], "L", i);
-		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]))
-			locked = p_s_tb->L[i];
+		if (p_s_bh->b_bdev != p_s_sb->s_bdev) {
+			reiserfs_panic(p_s_sb,
+				       "jmacd-4: tb_buffer_sanity_check(): buffer has wrong device %s[%d] (%b)\n",
+				       descr, level, p_s_bh);
 		}
 
-		if ( !locked && p_s_tb->FL[i] ) {
-		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FL[i], "FL", i);
-		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i]))
-			locked = p_s_tb->FL[i];
+		if (p_s_bh->b_size != p_s_sb->s_blocksize) {
+			reiserfs_panic(p_s_sb,
+				       "jmacd-5: tb_buffer_sanity_check(): buffer has wrong blocksize %s[%d] (%b)\n",
+				       descr, level, p_s_bh);
 		}
 
-		if ( !locked && p_s_tb->CFL[i] ) {
-		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFL[i], "CFL", i);
-		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i]))
-			locked = p_s_tb->CFL[i];
+		if (p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) {
+			reiserfs_panic(p_s_sb,
+				       "jmacd-6: tb_buffer_sanity_check(): buffer block number too high %s[%d] (%b)\n",
+				       descr, level, p_s_bh);
 		}
+	}
+}
+#else
+static void tb_buffer_sanity_check(struct super_block *p_s_sb,
+				   struct buffer_head *p_s_bh,
+				   const char *descr, int level)
+{;
+}
+#endif
 
-	    }
+static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
+{
+	return reiserfs_prepare_for_journal(s, bh, 0);
+}
 
-	    if ( !locked && (p_s_tb->rnum[i]) ) {
+static int wait_tb_buffers_until_unlocked(struct tree_balance *p_s_tb)
+{
+	struct buffer_head *locked;
+#ifdef CONFIG_REISERFS_CHECK
+	int repeat_counter = 0;
+#endif
+	int i;
 
-		if ( p_s_tb->R[i] ) {
-		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->R[i], "R", i);
-		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i]))
-			locked = p_s_tb->R[i];
-		}
+	do {
 
-       
-		if ( !locked && p_s_tb->FR[i] ) {
-		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FR[i], "FR", i);
-		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i]))
-			locked = p_s_tb->FR[i];
+		locked = NULL;
+
+		for (i = p_s_tb->tb_path->path_length;
+		     !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
+			if (PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
+				/* if I understand correctly, we can only be sure the last buffer
+				 ** in the path is in the tree --clm
+				 */
+#ifdef CONFIG_REISERFS_CHECK
+				if (PATH_PLAST_BUFFER(p_s_tb->tb_path) ==
+				    PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) {
+					tb_buffer_sanity_check(p_s_tb->tb_sb,
+							       PATH_OFFSET_PBUFFER
+							       (p_s_tb->tb_path,
+								i), "S",
+							       p_s_tb->tb_path->
+							       path_length - i);
+				}
+#endif
+				if (!clear_all_dirty_bits(p_s_tb->tb_sb,
+							  PATH_OFFSET_PBUFFER
+							  (p_s_tb->tb_path,
+							   i))) {
+					locked =
+					    PATH_OFFSET_PBUFFER(p_s_tb->tb_path,
+								i);
+				}
+			}
 		}
 
-		if ( !locked && p_s_tb->CFR[i] ) {
-		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFR[i], "CFR", i);
-		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i]))
-			locked = p_s_tb->CFR[i];
+		for (i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i];
+		     i++) {
+
+			if (p_s_tb->lnum[i]) {
+
+				if (p_s_tb->L[i]) {
+					tb_buffer_sanity_check(p_s_tb->tb_sb,
+							       p_s_tb->L[i],
+							       "L", i);
+					if (!clear_all_dirty_bits
+					    (p_s_tb->tb_sb, p_s_tb->L[i]))
+						locked = p_s_tb->L[i];
+				}
+
+				if (!locked && p_s_tb->FL[i]) {
+					tb_buffer_sanity_check(p_s_tb->tb_sb,
+							       p_s_tb->FL[i],
+							       "FL", i);
+					if (!clear_all_dirty_bits
+					    (p_s_tb->tb_sb, p_s_tb->FL[i]))
+						locked = p_s_tb->FL[i];
+				}
+
+				if (!locked && p_s_tb->CFL[i]) {
+					tb_buffer_sanity_check(p_s_tb->tb_sb,
+							       p_s_tb->CFL[i],
+							       "CFL", i);
+					if (!clear_all_dirty_bits
+					    (p_s_tb->tb_sb, p_s_tb->CFL[i]))
+						locked = p_s_tb->CFL[i];
+				}
+
+			}
+
+			if (!locked && (p_s_tb->rnum[i])) {
+
+				if (p_s_tb->R[i]) {
+					tb_buffer_sanity_check(p_s_tb->tb_sb,
+							       p_s_tb->R[i],
+							       "R", i);
+					if (!clear_all_dirty_bits
+					    (p_s_tb->tb_sb, p_s_tb->R[i]))
+						locked = p_s_tb->R[i];
+				}
+
+				if (!locked && p_s_tb->FR[i]) {
+					tb_buffer_sanity_check(p_s_tb->tb_sb,
+							       p_s_tb->FR[i],
+							       "FR", i);
+					if (!clear_all_dirty_bits
+					    (p_s_tb->tb_sb, p_s_tb->FR[i]))
+						locked = p_s_tb->FR[i];
+				}
+
+				if (!locked && p_s_tb->CFR[i]) {
+					tb_buffer_sanity_check(p_s_tb->tb_sb,
+							       p_s_tb->CFR[i],
+							       "CFR", i);
+					if (!clear_all_dirty_bits
+					    (p_s_tb->tb_sb, p_s_tb->CFR[i]))
+						locked = p_s_tb->CFR[i];
+				}
+			}
+		}
+		/* as far as I can tell, this is not required.  The FEB list seems
+		 ** to be full of newly allocated nodes, which will never be locked,
+		 ** dirty, or anything else.
+		 ** To be safe, I'm putting the checks and waits in.  For the moment,
+		 ** they are needed to keep the code in journal.c from complaining
+		 ** about the buffer.  That code is inside CONFIG_REISERFS_CHECK as well.
+		 ** --clm
+		 */
+		for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
+			if (p_s_tb->FEB[i]) {
+				if (!clear_all_dirty_bits
+				    (p_s_tb->tb_sb, p_s_tb->FEB[i]))
+					locked = p_s_tb->FEB[i];
+			}
 		}
-	    }
-	}
-	/* as far as I can tell, this is not required.  The FEB list seems
-	** to be full of newly allocated nodes, which will never be locked,
-	** dirty, or anything else.
-	** To be safe, I'm putting in the checks and waits in.  For the moment,
-	** they are needed to keep the code in journal.c from complaining
-	** about the buffer.  That code is inside CONFIG_REISERFS_CHECK as well.
-	** --clm
-	*/
-	for ( i = 0; !locked && i < MAX_FEB_SIZE; i++ ) { 
-	    if ( p_s_tb->FEB[i] ) {
-		if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i]))
-		    locked = p_s_tb->FEB[i] ;
-	    }
-	}
 
-	if (locked) {
+		if (locked) {
 #ifdef CONFIG_REISERFS_CHECK
-	    repeat_counter++;
-	    if ( (repeat_counter % 10000) == 0) {
-		reiserfs_warning (p_s_tb->tb_sb,
-				  "wait_tb_buffers_until_released(): too many "
-				  "iterations waiting for buffer to unlock "
-				  "(%b)", locked);
-
-		/* Don't loop forever.  Try to recover from possible error. */
-
-		return ( FILESYSTEM_CHANGED_TB (p_s_tb) ) ? REPEAT_SEARCH : CARRY_ON;
-	    }
+			repeat_counter++;
+			if ((repeat_counter % 10000) == 0) {
+				reiserfs_warning(p_s_tb->tb_sb,
+						 "wait_tb_buffers_until_released(): too many "
+						 "iterations waiting for buffer to unlock "
+						 "(%b)", locked);
+
+				/* Don't loop forever.  Try to recover from possible error. */
+
+				return (FILESYSTEM_CHANGED_TB(p_s_tb)) ?
+				    REPEAT_SEARCH : CARRY_ON;
+			}
 #endif
-	    __wait_on_buffer (locked);
-	    if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) {
-		return REPEAT_SEARCH;
-	    }
-	}
+			__wait_on_buffer(locked);
+			if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
+				return REPEAT_SEARCH;
+			}
+		}
 
-    } while (locked);
+	} while (locked);
 
-    return CARRY_ON;
+	return CARRY_ON;
 }
 
-
 /* Prepare for balancing, that is
  *	get all necessary parents, and neighbors;
  *	analyze what and where should be moved;
@@ -2267,252 +2360,266 @@ static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
  *             -1 - if no_disk_space 
  */
 
+int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb, struct item_head *p_s_ins_ih,	// item head of item being inserted
+	      const void *data	// inserted item or data to be pasted
+    )
+{
+	int n_ret_value, n_h, n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path);
+	int n_pos_in_item;
 
-int fix_nodes (int n_op_mode,
-	       struct tree_balance * 	p_s_tb,
-	       struct item_head * p_s_ins_ih, // item head of item being inserted
-	       const void * data // inserted item or data to be pasted
-    ) {
-    int	n_ret_value,
-    	n_h,
-    	n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path);
-    int n_pos_in_item;
-
-    /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
-    ** during wait_tb_buffers_run
-    */
-    int wait_tb_buffers_run = 0 ; 
-    struct buffer_head  * p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path);
-
-    ++ REISERFS_SB(p_s_tb -> tb_sb) -> s_fix_nodes;
-
-    n_pos_in_item = p_s_tb->tb_path->pos_in_item;
-
-
-    p_s_tb->fs_gen = get_generation (p_s_tb->tb_sb);
-
-    /* we prepare and log the super here so it will already be in the
-    ** transaction when do_balance needs to change it.
-    ** This way do_balance won't have to schedule when trying to prepare
-    ** the super for logging
-    */
-    reiserfs_prepare_for_journal(p_s_tb->tb_sb, 
-                                 SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1) ;
-    journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb, 
-                       SB_BUFFER_WITH_SB(p_s_tb->tb_sb)) ;
-    if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
-	return REPEAT_SEARCH;
-
-    /* if it possible in indirect_to_direct conversion */
-    if (buffer_locked (p_s_tbS0)) {
-        __wait_on_buffer (p_s_tbS0);
-        if ( FILESYSTEM_CHANGED_TB (p_s_tb) )
-            return REPEAT_SEARCH;
-    }
+	/* we set wait_tb_buffers_run when we have to restore any dirty bits cleared
+	 ** during wait_tb_buffers_until_unlocked
+	 */
+	int wait_tb_buffers_run = 0;
+	struct buffer_head *p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path);
 
-#ifdef CONFIG_REISERFS_CHECK
-    if ( cur_tb ) {
-	print_cur_tb ("fix_nodes");
-	reiserfs_panic(p_s_tb->tb_sb,"PAP-8305: fix_nodes:  there is pending do_balance");
-    }
-
-    if (!buffer_uptodate (p_s_tbS0) || !B_IS_IN_TREE (p_s_tbS0)) {
-	reiserfs_panic (p_s_tb->tb_sb, "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate "
-			"at the beginning of fix_nodes or not in tree (mode %c)", p_s_tbS0, p_s_tbS0, n_op_mode);
-    }
-
-    /* Check parameters. */
-    switch (n_op_mode) {
-    case M_INSERT:
-	if ( n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0) )
-	    reiserfs_panic(p_s_tb->tb_sb,"PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert",
-			   n_item_num, B_NR_ITEMS(p_s_tbS0));
-	break;
-    case M_PASTE:
-    case M_DELETE:
-    case M_CUT:
-	if ( n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0) ) {
-	    print_block (p_s_tbS0, 0, -1, -1);
-	    reiserfs_panic(p_s_tb->tb_sb,"PAP-8335: fix_nodes: Incorrect item number(%d); mode = %c insert_size = %d\n", n_item_num, n_op_mode, p_s_tb->insert_size[0]);
-	}
-	break;
-    default:
-	reiserfs_panic(p_s_tb->tb_sb,"PAP-8340: fix_nodes: Incorrect mode of operation");
-    }
-#endif
+	++REISERFS_SB(p_s_tb->tb_sb)->s_fix_nodes;
+
+	n_pos_in_item = p_s_tb->tb_path->pos_in_item;
+
+	p_s_tb->fs_gen = get_generation(p_s_tb->tb_sb);
 
-    if (get_mem_for_virtual_node (p_s_tb) == REPEAT_SEARCH)
-	// FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
-	return REPEAT_SEARCH;
+	/* we prepare and log the super here so it will already be in the
+	 ** transaction when do_balance needs to change it.
+	 ** This way do_balance won't have to schedule when trying to prepare
+	 ** the super for logging
+	 */
+	reiserfs_prepare_for_journal(p_s_tb->tb_sb,
+				     SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1);
+	journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb,
+			   SB_BUFFER_WITH_SB(p_s_tb->tb_sb));
+	if (FILESYSTEM_CHANGED_TB(p_s_tb))
+		return REPEAT_SEARCH;
 
+	/* this is possible in indirect_to_direct conversion */
+	if (buffer_locked(p_s_tbS0)) {
+		__wait_on_buffer(p_s_tbS0);
+		if (FILESYSTEM_CHANGED_TB(p_s_tb))
+			return REPEAT_SEARCH;
+	}
+#ifdef CONFIG_REISERFS_CHECK
+	if (cur_tb) {
+		print_cur_tb("fix_nodes");
+		reiserfs_panic(p_s_tb->tb_sb,
+			       "PAP-8305: fix_nodes:  there is pending do_balance");
+	}
 
-    /* Starting from the leaf level; for all levels n_h of the tree. */
-    for ( n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++ ) { 
-	if ( (n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON ) {
-	    goto repeat;
+	if (!buffer_uptodate(p_s_tbS0) || !B_IS_IN_TREE(p_s_tbS0)) {
+		reiserfs_panic(p_s_tb->tb_sb,
+			       "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate "
+			       "at the beginning of fix_nodes or not in tree (mode %c)",
+			       p_s_tbS0, p_s_tbS0, n_op_mode);
 	}
 
-	if ( (n_ret_value = check_balance (n_op_mode, p_s_tb, n_h, n_item_num,
-					   n_pos_in_item, p_s_ins_ih, data)) != CARRY_ON ) {
-	    if ( n_ret_value == NO_BALANCING_NEEDED ) {
-		/* No balancing for higher levels needed. */
-		if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) {
-		    goto repeat;
+	/* Check parameters. */
+	switch (n_op_mode) {
+	case M_INSERT:
+		if (n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0))
+			reiserfs_panic(p_s_tb->tb_sb,
+				       "PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert",
+				       n_item_num, B_NR_ITEMS(p_s_tbS0));
+		break;
+	case M_PASTE:
+	case M_DELETE:
+	case M_CUT:
+		if (n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0)) {
+			print_block(p_s_tbS0, 0, -1, -1);
+			reiserfs_panic(p_s_tb->tb_sb,
+				       "PAP-8335: fix_nodes: Incorrect item number(%d); mode = %c insert_size = %d\n",
+				       n_item_num, n_op_mode,
+				       p_s_tb->insert_size[0]);
 		}
-		if ( n_h != MAX_HEIGHT - 1 )  
-		    p_s_tb->insert_size[n_h + 1] = 0;
-		/* ok, analysis and resource gathering are complete */
 		break;
-	    }
-	    goto repeat;
+	default:
+		reiserfs_panic(p_s_tb->tb_sb,
+			       "PAP-8340: fix_nodes: Incorrect mode of operation");
 	}
+#endif
 
-	if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) {
-	    goto repeat;
-	}
+	if (get_mem_for_virtual_node(p_s_tb) == REPEAT_SEARCH)
+		// FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat
+		return REPEAT_SEARCH;
 
-	if ( (n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON ) {
-	    goto repeat;        /* No disk space, or schedule occurred and
-				   analysis may be invalid and needs to be redone. */
-	}
-    
-	if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h) ) {
-	    /* We have a positive insert size but no nodes exist on this
-	       level, this means that we are creating a new root. */
+	/* Starting from the leaf level; for all levels n_h of the tree. */
+	for (n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++) {
+		if ((n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON) {
+			goto repeat;
+		}
 
-	    RFALSE( p_s_tb->blknum[n_h] != 1,
-		    "PAP-8350: creating new empty root");
+		if ((n_ret_value =
+		     check_balance(n_op_mode, p_s_tb, n_h, n_item_num,
+				   n_pos_in_item, p_s_ins_ih,
+				   data)) != CARRY_ON) {
+			if (n_ret_value == NO_BALANCING_NEEDED) {
+				/* No balancing for higher levels needed. */
+				if ((n_ret_value =
+				     get_neighbors(p_s_tb, n_h)) != CARRY_ON) {
+					goto repeat;
+				}
+				if (n_h != MAX_HEIGHT - 1)
+					p_s_tb->insert_size[n_h + 1] = 0;
+				/* ok, analysis and resource gathering are complete */
+				break;
+			}
+			goto repeat;
+		}
 
-	    if ( n_h < MAX_HEIGHT - 1 )
-		p_s_tb->insert_size[n_h + 1] = 0;
-	}
-	else
-	    if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1) ) {
-		if ( p_s_tb->blknum[n_h] > 1 ) {
-		    /* The tree needs to be grown, so this node S[n_h]
-		       which is the root node is split into two nodes,
-		       and a new node (S[n_h+1]) will be created to
-		       become the root node.  */
-	  
-		    RFALSE( n_h == MAX_HEIGHT - 1,
-			    "PAP-8355: attempt to create too high of a tree");
-
-		    p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) + DC_SIZE;
+		if ((n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON) {
+			goto repeat;
 		}
-		else
-		    if ( n_h < MAX_HEIGHT - 1 )
-			p_s_tb->insert_size[n_h + 1] = 0;
-	    }
-	    else
-		p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1);
-    }
-
-    if ((n_ret_value = wait_tb_buffers_until_unlocked (p_s_tb)) == CARRY_ON) {
-	if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
-	    wait_tb_buffers_run = 1 ;
-	    n_ret_value = REPEAT_SEARCH ;
-	    goto repeat; 
-	} else {
-	    return CARRY_ON;
+
+		if ((n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON) {
+			goto repeat;	/* No disk space, or schedule occurred and
+					   analysis may be invalid and needs to be redone. */
+		}
+
+		if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h)) {
+			/* We have a positive insert size but no nodes exist on this
+			   level, this means that we are creating a new root. */
+
+			RFALSE(p_s_tb->blknum[n_h] != 1,
+			       "PAP-8350: creating new empty root");
+
+			if (n_h < MAX_HEIGHT - 1)
+				p_s_tb->insert_size[n_h + 1] = 0;
+		} else if (!PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1)) {
+			if (p_s_tb->blknum[n_h] > 1) {
+				/* The tree needs to be grown, so this node S[n_h]
+				   which is the root node is split into two nodes,
+				   and a new node (S[n_h+1]) will be created to
+				   become the root node.  */
+
+				RFALSE(n_h == MAX_HEIGHT - 1,
+				       "PAP-8355: attempt to create too high of a tree");
+
+				p_s_tb->insert_size[n_h + 1] =
+				    (DC_SIZE +
+				     KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) +
+				    DC_SIZE;
+			} else if (n_h < MAX_HEIGHT - 1)
+				p_s_tb->insert_size[n_h + 1] = 0;
+		} else
+			p_s_tb->insert_size[n_h + 1] =
+			    (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1);
 	}
-    } else {
-	wait_tb_buffers_run = 1 ;
-	goto repeat; 
-    }
-
- repeat:
-    // fix_nodes was unable to perform its calculation due to
-    // filesystem got changed under us, lack of free disk space or i/o
-    // failure. If the first is the case - the search will be
-    // repeated. For now - free all resources acquired so far except
-    // for the new allocated nodes
-    {
-	int i;
 
-	/* Release path buffers. */
-	if (wait_tb_buffers_run) {
-	    pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path) ;
+	if ((n_ret_value = wait_tb_buffers_until_unlocked(p_s_tb)) == CARRY_ON) {
+		if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
+			wait_tb_buffers_run = 1;
+			n_ret_value = REPEAT_SEARCH;
+			goto repeat;
+		} else {
+			return CARRY_ON;
+		}
 	} else {
-	    pathrelse (p_s_tb->tb_path);
-        }	
-	/* brelse all resources collected for balancing */
-	for ( i = 0; i < MAX_HEIGHT; i++ ) {
-	    if (wait_tb_buffers_run) {
-		reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->L[i]);
-		reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->R[i]);
-		reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FL[i]);
-		reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FR[i]);
-		reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFL[i]);
-		reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFR[i]);
-	    }
-
-	    brelse (p_s_tb->L[i]);p_s_tb->L[i] = NULL;
-	    brelse (p_s_tb->R[i]);p_s_tb->R[i] = NULL;
-	    brelse (p_s_tb->FL[i]);p_s_tb->FL[i] = NULL;
-	    brelse (p_s_tb->FR[i]);p_s_tb->FR[i] = NULL;
-	    brelse (p_s_tb->CFL[i]);p_s_tb->CFL[i] = NULL;
-	    brelse (p_s_tb->CFR[i]);p_s_tb->CFR[i] = NULL;
+		wait_tb_buffers_run = 1;
+		goto repeat;
 	}
 
-	if (wait_tb_buffers_run) {
-	    for ( i = 0; i < MAX_FEB_SIZE; i++ ) { 
-		if ( p_s_tb->FEB[i] ) {
-		    reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, 
-						     p_s_tb->FEB[i]) ;
+      repeat:
+	// fix_nodes was unable to perform its calculation because the
+	// filesystem changed under us, free disk space ran out, or an i/o
+	// failure occurred. In the first case the search will be
+	// repeated. For now, free all resources acquired so far except
+	// for the newly allocated nodes
+	{
+		int i;
+
+		/* Release path buffers. */
+		if (wait_tb_buffers_run) {
+			pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path);
+		} else {
+			pathrelse(p_s_tb->tb_path);
+		}
+		/* brelse all resources collected for balancing */
+		for (i = 0; i < MAX_HEIGHT; i++) {
+			if (wait_tb_buffers_run) {
+				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
+								 p_s_tb->L[i]);
+				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
+								 p_s_tb->R[i]);
+				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
+								 p_s_tb->FL[i]);
+				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
+								 p_s_tb->FR[i]);
+				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
+								 p_s_tb->
+								 CFL[i]);
+				reiserfs_restore_prepared_buffer(p_s_tb->tb_sb,
+								 p_s_tb->
+								 CFR[i]);
+			}
+
+			brelse(p_s_tb->L[i]);
+			p_s_tb->L[i] = NULL;
+			brelse(p_s_tb->R[i]);
+			p_s_tb->R[i] = NULL;
+			brelse(p_s_tb->FL[i]);
+			p_s_tb->FL[i] = NULL;
+			brelse(p_s_tb->FR[i]);
+			p_s_tb->FR[i] = NULL;
+			brelse(p_s_tb->CFL[i]);
+			p_s_tb->CFL[i] = NULL;
+			brelse(p_s_tb->CFR[i]);
+			p_s_tb->CFR[i] = NULL;
+		}
+
+		if (wait_tb_buffers_run) {
+			for (i = 0; i < MAX_FEB_SIZE; i++) {
+				if (p_s_tb->FEB[i]) {
+					reiserfs_restore_prepared_buffer
+					    (p_s_tb->tb_sb, p_s_tb->FEB[i]);
+				}
+			}
 		}
-	    }
+		return n_ret_value;
 	}
-	return n_ret_value;
-    }
 
 }
 
-
 /* Anatoly will probably forgive me renaming p_s_tb to tb. I just
    wanted to make lines shorter */
-void unfix_nodes (struct tree_balance * tb)
+void unfix_nodes(struct tree_balance *tb)
 {
-    int	i;
-
-    /* Release path buffers. */
-    pathrelse_and_restore (tb->tb_sb, tb->tb_path);
-
-    /* brelse all resources collected for balancing */
-    for ( i = 0; i < MAX_HEIGHT; i++ ) {
-	reiserfs_restore_prepared_buffer (tb->tb_sb, tb->L[i]);
-	reiserfs_restore_prepared_buffer (tb->tb_sb, tb->R[i]);
-	reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FL[i]);
-	reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FR[i]);
-	reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFL[i]);
-	reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFR[i]);
-
-	brelse (tb->L[i]);
-	brelse (tb->R[i]);
-	brelse (tb->FL[i]);
-	brelse (tb->FR[i]);
-	brelse (tb->CFL[i]);
-	brelse (tb->CFR[i]);
-    }
-
-    /* deal with list of allocated (used and unused) nodes */
-    for ( i = 0; i < MAX_FEB_SIZE; i++ ) {
-	if ( tb->FEB[i] ) {
-	    b_blocknr_t blocknr  = tb->FEB[i]->b_blocknr ;
-	    /* de-allocated block which was not used by balancing and
-               bforget about buffer for it */
-	    brelse (tb->FEB[i]);
-	    reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0);
-	}
-	if (tb->used[i]) {
-	    /* release used as new nodes including a new root */
-	    brelse (tb->used[i]);
-	}
-    }
+	int i;
 
-    if (tb->vn_buf) 
-    reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
+	/* Release path buffers. */
+	pathrelse_and_restore(tb->tb_sb, tb->tb_path);
 
-} 
+	/* brelse all resources collected for balancing */
+	for (i = 0; i < MAX_HEIGHT; i++) {
+		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]);
+		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]);
+		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]);
+		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]);
+		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]);
+		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]);
+
+		brelse(tb->L[i]);
+		brelse(tb->R[i]);
+		brelse(tb->FL[i]);
+		brelse(tb->FR[i]);
+		brelse(tb->CFL[i]);
+		brelse(tb->CFR[i]);
+	}
 
+	/* deal with list of allocated (used and unused) nodes */
+	for (i = 0; i < MAX_FEB_SIZE; i++) {
+		if (tb->FEB[i]) {
+			b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
+			/* de-allocate the block which was not used by balancing and
+			   release the buffer for it */
+			brelse(tb->FEB[i]);
+			reiserfs_free_block(tb->transaction_handle, NULL,
+					    blocknr, 0);
+		}
+		if (tb->used[i]) {
+			/* release used as new nodes including a new root */
+			brelse(tb->used[i]);
+		}
+	}
 
+	if (tb->vn_buf)
+		reiserfs_kfree(tb->vn_buf, tb->vn_buf_size, tb->tb_sb);
 
+}
diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
index 08d0508c2d39..37c1306eb9b7 100644
--- a/fs/reiserfs/hashes.c
+++ b/fs/reiserfs/hashes.c
@@ -22,7 +22,6 @@
 #include <asm/types.h>
 #include <asm/bug.h>
 
-
 #define DELTA 0x9E3779B9
 #define FULLROUNDS 10		/* 32 is overkill, 16 is strong crypto */
 #define PARTROUNDS 6		/* 6 gets complete mixing */
@@ -48,105 +47,75 @@
 		h1 += b1;						\
 	} while(0)
 
-
 u32 keyed_hash(const signed char *msg, int len)
 {
-	u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3}; 
+	u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3 };
 
 	u32 h0 = k[0], h1 = k[1];
 	u32 a, b, c, d;
 	u32 pad;
 	int i;
- 
-	//	assert(len >= 0 && len < 256);
 
-	pad = (u32)len | ((u32)len << 8);
+	//      assert(len >= 0 && len < 256);
+
+	pad = (u32) len | ((u32) len << 8);
 	pad |= pad << 16;
 
-	while(len >= 16)
-	{
-		a = (u32)msg[ 0]      |
-		    (u32)msg[ 1] << 8 |
-		    (u32)msg[ 2] << 16|
-		    (u32)msg[ 3] << 24;
-		b = (u32)msg[ 4]      |
-		    (u32)msg[ 5] << 8 |
-		    (u32)msg[ 6] << 16|
-		    (u32)msg[ 7] << 24;
-		c = (u32)msg[ 8]      |
-		    (u32)msg[ 9] << 8 |
-		    (u32)msg[10] << 16|
-		    (u32)msg[11] << 24;
-		d = (u32)msg[12]      |
-		    (u32)msg[13] << 8 |
-		    (u32)msg[14] << 16|
-		    (u32)msg[15] << 24;
-		
+	while (len >= 16) {
+		a = (u32) msg[0] |
+		    (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
+		b = (u32) msg[4] |
+		    (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
+		c = (u32) msg[8] |
+		    (u32) msg[9] << 8 |
+		    (u32) msg[10] << 16 | (u32) msg[11] << 24;
+		d = (u32) msg[12] |
+		    (u32) msg[13] << 8 |
+		    (u32) msg[14] << 16 | (u32) msg[15] << 24;
+
 		TEACORE(PARTROUNDS);
 
 		len -= 16;
 		msg += 16;
 	}
 
-	if (len >= 12)
-	{
-		a = (u32)msg[ 0]      |
-		    (u32)msg[ 1] << 8 |
-		    (u32)msg[ 2] << 16|
-		    (u32)msg[ 3] << 24;
-		b = (u32)msg[ 4]      |
-		    (u32)msg[ 5] << 8 |
-		    (u32)msg[ 6] << 16|
-		    (u32)msg[ 7] << 24;
-		c = (u32)msg[ 8]      |
-		    (u32)msg[ 9] << 8 |
-		    (u32)msg[10] << 16|
-		    (u32)msg[11] << 24;
+	if (len >= 12) {
+		a = (u32) msg[0] |
+		    (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
+		b = (u32) msg[4] |
+		    (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
+		c = (u32) msg[8] |
+		    (u32) msg[9] << 8 |
+		    (u32) msg[10] << 16 | (u32) msg[11] << 24;
 
 		d = pad;
-		for(i = 12; i < len; i++)
-		{
+		for (i = 12; i < len; i++) {
 			d <<= 8;
 			d |= msg[i];
 		}
-	}
-	else if (len >= 8)
-	{
-		a = (u32)msg[ 0]      |
-		    (u32)msg[ 1] << 8 |
-		    (u32)msg[ 2] << 16|
-		    (u32)msg[ 3] << 24;
-		b = (u32)msg[ 4]      |
-		    (u32)msg[ 5] << 8 |
-		    (u32)msg[ 6] << 16|
-		    (u32)msg[ 7] << 24;
+	} else if (len >= 8) {
+		a = (u32) msg[0] |
+		    (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
+		b = (u32) msg[4] |
+		    (u32) msg[5] << 8 | (u32) msg[6] << 16 | (u32) msg[7] << 24;
 
 		c = d = pad;
-		for(i = 8; i < len; i++)
-		{
+		for (i = 8; i < len; i++) {
 			c <<= 8;
 			c |= msg[i];
 		}
-	}
-	else if (len >= 4)
-	{
-		a = (u32)msg[ 0]      |
-		    (u32)msg[ 1] << 8 |
-		    (u32)msg[ 2] << 16|
-		    (u32)msg[ 3] << 24;
+	} else if (len >= 4) {
+		a = (u32) msg[0] |
+		    (u32) msg[1] << 8 | (u32) msg[2] << 16 | (u32) msg[3] << 24;
 
 		b = c = d = pad;
-		for(i = 4; i < len; i++)
-		{
+		for (i = 4; i < len; i++) {
 			b <<= 8;
 			b |= msg[i];
 		}
-	}
-	else
-	{
+	} else {
 		a = b = c = d = pad;
-		for(i = 0; i < len; i++)
-		{
+		for (i = 0; i < len; i++) {
 			a <<= 8;
 			a |= msg[i];
 		}
@@ -155,55 +124,59 @@ u32 keyed_hash(const signed char *msg, int len)
 	TEACORE(FULLROUNDS);
 
 /*	return 0;*/
-	return h0^h1;
+	return h0 ^ h1;
 }
 
 /* What follows in this file is copyright 2000 by Hans Reiser, and the
  * licensing of what follows is governed by reiserfs/README */
 
-u32 yura_hash (const signed char *msg, int len)
+u32 yura_hash(const signed char *msg, int len)
 {
-    int j, pow;
-    u32 a, c;
-    int i;
-    
-    for (pow=1,i=1; i < len; i++) pow = pow * 10; 
-    
-    if (len == 1) 
-	a = msg[0]-48;
-    else
-	a = (msg[0] - 48) * pow;
-    
-    for (i=1; i < len; i++) {
-	c = msg[i] - 48; 
-	for (pow=1,j=i; j < len-1; j++) pow = pow * 10; 
-	a = a + c * pow;
-    }
-    
-    for (; i < 40; i++) {
-	c = '0' - 48; 
-	for (pow=1,j=i; j < len-1; j++) pow = pow * 10; 
-	a = a + c * pow;
-    }
-    
-    for (; i < 256; i++) {
-	c = i; 
-	for (pow=1,j=i; j < len-1; j++) pow = pow * 10; 
-	a = a + c * pow;
-    }
-    
-    a = a << 7;
-    return a;
+	int j, pow;
+	u32 a, c;
+	int i;
+
+	for (pow = 1, i = 1; i < len; i++)
+		pow = pow * 10;
+
+	if (len == 1)
+		a = msg[0] - 48;
+	else
+		a = (msg[0] - 48) * pow;
+
+	for (i = 1; i < len; i++) {
+		c = msg[i] - 48;
+		for (pow = 1, j = i; j < len - 1; j++)
+			pow = pow * 10;
+		a = a + c * pow;
+	}
+
+	for (; i < 40; i++) {
+		c = '0' - 48;
+		for (pow = 1, j = i; j < len - 1; j++)
+			pow = pow * 10;
+		a = a + c * pow;
+	}
+
+	for (; i < 256; i++) {
+		c = i;
+		for (pow = 1, j = i; j < len - 1; j++)
+			pow = pow * 10;
+		a = a + c * pow;
+	}
+
+	a = a << 7;
+	return a;
 }
 
-u32 r5_hash (const signed char *msg, int len)
+u32 r5_hash(const signed char *msg, int len)
 {
-  u32 a=0;
-  while(*msg) { 
-    a += *msg << 4;
-    a += *msg >> 4;
-    a *= 11;
-    msg++;
-   } 
-  return a;
+	u32 a = 0;
+	while (*msg) {
+		a += *msg << 4;
+		a += *msg >> 4;
+		a *= 11;
+		msg++;
+	}
+	return a;
 }
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
index a362125da0d8..6c5a726fd34b 100644
--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -10,13 +10,8 @@
 #include <linux/buffer_head.h>
 
 /* this is one and only function that is used outside (do_balance.c) */
-int	balance_internal (
-			  struct tree_balance * ,
-			  int,
-			  int,
-			  struct item_head * ,
-			  struct buffer_head ** 
-			  );
+int balance_internal(struct tree_balance *,
+		     int, int, struct item_head *, struct buffer_head **);
 
 /* modes of internal_shift_left, internal_shift_right and internal_insert_childs */
 #define INTERNAL_SHIFT_FROM_S_TO_L 0
@@ -27,464 +22,474 @@ int	balance_internal (
 #define INTERNAL_INSERT_TO_L 5
 #define INTERNAL_INSERT_TO_R 6
 
-static void	internal_define_dest_src_infos (
-						int shift_mode,
-						struct tree_balance * tb,
-						int h,
-						struct buffer_info * dest_bi,
-						struct buffer_info * src_bi,
-						int * d_key,
-						struct buffer_head ** cf
-						)
+static void internal_define_dest_src_infos(int shift_mode,
+					   struct tree_balance *tb,
+					   int h,
+					   struct buffer_info *dest_bi,
+					   struct buffer_info *src_bi,
+					   int *d_key, struct buffer_head **cf)
 {
-    memset (dest_bi, 0, sizeof (struct buffer_info));
-    memset (src_bi, 0, sizeof (struct buffer_info));
-    /* define dest, src, dest parent, dest position */
-    switch (shift_mode) {
-    case INTERNAL_SHIFT_FROM_S_TO_L:	/* used in internal_shift_left */
-	src_bi->tb = tb;
-	src_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h);
-	src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h);
-	src_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
-	dest_bi->tb = tb;
-	dest_bi->bi_bh = tb->L[h];
-	dest_bi->bi_parent = tb->FL[h];
-	dest_bi->bi_position = get_left_neighbor_position (tb, h);
-	*d_key = tb->lkey[h];
-	*cf = tb->CFL[h];
-	break;
-    case INTERNAL_SHIFT_FROM_L_TO_S:
-	src_bi->tb = tb;
-	src_bi->bi_bh = tb->L[h];
-	src_bi->bi_parent = tb->FL[h];
-	src_bi->bi_position = get_left_neighbor_position (tb, h);
-	dest_bi->tb = tb;
-	dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h);
-	dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h);
-	dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); /* dest position is analog of dest->b_item_order */
-	*d_key = tb->lkey[h];
-	*cf = tb->CFL[h];
-	break;
-      
-    case INTERNAL_SHIFT_FROM_R_TO_S:	/* used in internal_shift_left */
-	src_bi->tb = tb;
-	src_bi->bi_bh = tb->R[h];
-	src_bi->bi_parent = tb->FR[h];
-	src_bi->bi_position = get_right_neighbor_position (tb, h);
-	dest_bi->tb = tb;
-	dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h);
-	dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h);
-	dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
-	*d_key = tb->rkey[h];
-	*cf = tb->CFR[h];
-	break;
-
-    case INTERNAL_SHIFT_FROM_S_TO_R:
-	src_bi->tb = tb;
-	src_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h);
-	src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h);
-	src_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
-	dest_bi->tb = tb;
-	dest_bi->bi_bh = tb->R[h];
-	dest_bi->bi_parent = tb->FR[h];
-	dest_bi->bi_position = get_right_neighbor_position (tb, h);
-	*d_key = tb->rkey[h];
-	*cf = tb->CFR[h];
-	break;
-
-    case INTERNAL_INSERT_TO_L:
-	dest_bi->tb = tb;
-	dest_bi->bi_bh = tb->L[h];
-	dest_bi->bi_parent = tb->FL[h];
-	dest_bi->bi_position = get_left_neighbor_position (tb, h);
-	break;
-	
-    case INTERNAL_INSERT_TO_S:
-	dest_bi->tb = tb;
-	dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h);
-	dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h);
-	dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
-	break;
-
-    case INTERNAL_INSERT_TO_R:
-	dest_bi->tb = tb;
-	dest_bi->bi_bh = tb->R[h];
-	dest_bi->bi_parent = tb->FR[h];
-	dest_bi->bi_position = get_right_neighbor_position (tb, h);
-	break;
-
-    default:
-	reiserfs_panic (tb->tb_sb, "internal_define_dest_src_infos: shift type is unknown (%d)", shift_mode);
-    }
+	memset(dest_bi, 0, sizeof(struct buffer_info));
+	memset(src_bi, 0, sizeof(struct buffer_info));
+	/* define dest, src, dest parent, dest position */
+	switch (shift_mode) {
+	case INTERNAL_SHIFT_FROM_S_TO_L:	/* used in internal_shift_left */
+		src_bi->tb = tb;
+		src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
+		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
+		src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->L[h];
+		dest_bi->bi_parent = tb->FL[h];
+		dest_bi->bi_position = get_left_neighbor_position(tb, h);
+		*d_key = tb->lkey[h];
+		*cf = tb->CFL[h];
+		break;
+	case INTERNAL_SHIFT_FROM_L_TO_S:
+		src_bi->tb = tb;
+		src_bi->bi_bh = tb->L[h];
+		src_bi->bi_parent = tb->FL[h];
+		src_bi->bi_position = get_left_neighbor_position(tb, h);
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
+		dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
+		dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);	/* dest position is analog of dest->b_item_order */
+		*d_key = tb->lkey[h];
+		*cf = tb->CFL[h];
+		break;
+
+	case INTERNAL_SHIFT_FROM_R_TO_S:	/* used in internal_shift_left */
+		src_bi->tb = tb;
+		src_bi->bi_bh = tb->R[h];
+		src_bi->bi_parent = tb->FR[h];
+		src_bi->bi_position = get_right_neighbor_position(tb, h);
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
+		dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
+		dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
+		*d_key = tb->rkey[h];
+		*cf = tb->CFR[h];
+		break;
+
+	case INTERNAL_SHIFT_FROM_S_TO_R:
+		src_bi->tb = tb;
+		src_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
+		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
+		src_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->R[h];
+		dest_bi->bi_parent = tb->FR[h];
+		dest_bi->bi_position = get_right_neighbor_position(tb, h);
+		*d_key = tb->rkey[h];
+		*cf = tb->CFR[h];
+		break;
+
+	case INTERNAL_INSERT_TO_L:
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->L[h];
+		dest_bi->bi_parent = tb->FL[h];
+		dest_bi->bi_position = get_left_neighbor_position(tb, h);
+		break;
+
+	case INTERNAL_INSERT_TO_S:
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = PATH_H_PBUFFER(tb->tb_path, h);
+		dest_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, h);
+		dest_bi->bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
+		break;
+
+	case INTERNAL_INSERT_TO_R:
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->R[h];
+		dest_bi->bi_parent = tb->FR[h];
+		dest_bi->bi_position = get_right_neighbor_position(tb, h);
+		break;
+
+	default:
+		reiserfs_panic(tb->tb_sb,
+			       "internal_define_dest_src_infos: shift type is unknown (%d)",
+			       shift_mode);
+	}
 }
 
-
-
 /* Insert count node pointers into buffer cur before position to + 1.
  * Insert count items into buffer cur before position to.
  * Items and node pointers are specified by inserted and bh respectively.
- */ 
-static void internal_insert_childs (struct buffer_info * cur_bi,
-				    int to, int count,
-				    struct item_head * inserted,
-				    struct buffer_head ** bh
-    )
+ */
+static void internal_insert_childs(struct buffer_info *cur_bi,
+				   int to, int count,
+				   struct item_head *inserted,
+				   struct buffer_head **bh)
 {
-    struct buffer_head * cur = cur_bi->bi_bh;
-    struct block_head * blkh;
-    int nr;
-    struct reiserfs_key * ih;
-    struct disk_child new_dc[2];
-    struct disk_child * dc;
-    int i;
-
-    if (count <= 0)
-	return;
-
-    blkh = B_BLK_HEAD(cur);
-    nr = blkh_nr_item(blkh);
-
-    RFALSE( count > 2,
-	    "too many children (%d) are to be inserted", count);
-    RFALSE( B_FREE_SPACE (cur) < count * (KEY_SIZE + DC_SIZE),
-	    "no enough free space (%d), needed %d bytes", 
-	    B_FREE_SPACE (cur), count * (KEY_SIZE + DC_SIZE));
-
-    /* prepare space for count disk_child */
-    dc = B_N_CHILD(cur,to+1);
-
-    memmove (dc + count, dc, (nr+1-(to+1)) * DC_SIZE);
-
-    /* copy to_be_insert disk children */
-    for (i = 0; i < count; i ++) {
-	put_dc_size( &(new_dc[i]), MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i]));
-	put_dc_block_number( &(new_dc[i]), bh[i]->b_blocknr );
-    }
-    memcpy (dc, new_dc, DC_SIZE * count);
-
-  
-    /* prepare space for count items  */
-    ih = B_N_PDELIM_KEY (cur, ((to == -1) ? 0 : to));
-
-    memmove (ih + count, ih, (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE);
-
-    /* copy item headers (keys) */
-    memcpy (ih, inserted, KEY_SIZE);
-    if ( count > 1 )
-	memcpy (ih + 1, inserted + 1, KEY_SIZE);
-
-    /* sizes, item number */
-    set_blkh_nr_item( blkh, blkh_nr_item(blkh) + count );
-    set_blkh_free_space( blkh,
-                        blkh_free_space(blkh) - count * (DC_SIZE + KEY_SIZE ) );
-
-    do_balance_mark_internal_dirty (cur_bi->tb, cur,0);
-
-    /*&&&&&&&&&&&&&&&&&&&&&&&&*/
-    check_internal (cur);
-    /*&&&&&&&&&&&&&&&&&&&&&&&&*/
-
-    if (cur_bi->bi_parent) {
-	struct disk_child *t_dc = B_N_CHILD (cur_bi->bi_parent,cur_bi->bi_position);
-	put_dc_size( t_dc, dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE)));
-	do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, 0);
-
-	/*&&&&&&&&&&&&&&&&&&&&&&&&*/
-	check_internal (cur_bi->bi_parent);
-	/*&&&&&&&&&&&&&&&&&&&&&&&&*/   
-    }
+	struct buffer_head *cur = cur_bi->bi_bh;
+	struct block_head *blkh;
+	int nr;
+	struct reiserfs_key *ih;
+	struct disk_child new_dc[2];
+	struct disk_child *dc;
+	int i;
+
+	if (count <= 0)
+		return;
+
+	blkh = B_BLK_HEAD(cur);
+	nr = blkh_nr_item(blkh);
+
+	RFALSE(count > 2, "too many children (%d) are to be inserted", count);
+	RFALSE(B_FREE_SPACE(cur) < count * (KEY_SIZE + DC_SIZE),
+	       "no enough free space (%d), needed %d bytes",
+	       B_FREE_SPACE(cur), count * (KEY_SIZE + DC_SIZE));
+
+	/* prepare space for count disk_child */
+	dc = B_N_CHILD(cur, to + 1);
+
+	memmove(dc + count, dc, (nr + 1 - (to + 1)) * DC_SIZE);
+
+	/* copy to_be_insert disk children */
+	for (i = 0; i < count; i++) {
+		put_dc_size(&(new_dc[i]),
+			    MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE(bh[i]));
+		put_dc_block_number(&(new_dc[i]), bh[i]->b_blocknr);
+	}
+	memcpy(dc, new_dc, DC_SIZE * count);
+
+	/* prepare space for count items  */
+	ih = B_N_PDELIM_KEY(cur, ((to == -1) ? 0 : to));
+
+	memmove(ih + count, ih,
+		(nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE);
+
+	/* copy item headers (keys) */
+	memcpy(ih, inserted, KEY_SIZE);
+	if (count > 1)
+		memcpy(ih + 1, inserted + 1, KEY_SIZE);
+
+	/* sizes, item number */
+	set_blkh_nr_item(blkh, blkh_nr_item(blkh) + count);
+	set_blkh_free_space(blkh,
+			    blkh_free_space(blkh) - count * (DC_SIZE +
+							     KEY_SIZE));
+
+	do_balance_mark_internal_dirty(cur_bi->tb, cur, 0);
+
+	/*&&&&&&&&&&&&&&&&&&&&&&&& */
+	check_internal(cur);
+	/*&&&&&&&&&&&&&&&&&&&&&&&& */
+
+	if (cur_bi->bi_parent) {
+		struct disk_child *t_dc =
+		    B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position);
+		put_dc_size(t_dc,
+			    dc_size(t_dc) + (count * (DC_SIZE + KEY_SIZE)));
+		do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent,
+					       0);
+
+		/*&&&&&&&&&&&&&&&&&&&&&&&& */
+		check_internal(cur_bi->bi_parent);
+		/*&&&&&&&&&&&&&&&&&&&&&&&& */
+	}
 
 }
 
-
 /* Delete del_num items and node pointers from buffer cur starting from *
  * the first_i'th item and first_p'th pointers respectively.		*/
-static void	internal_delete_pointers_items (
-						struct buffer_info * cur_bi,
-						int first_p, 
-						int first_i, 
-						int del_num
-						)
+static void internal_delete_pointers_items(struct buffer_info *cur_bi,
+					   int first_p,
+					   int first_i, int del_num)
 {
-  struct buffer_head * cur = cur_bi->bi_bh;
-  int nr;
-  struct block_head * blkh;
-  struct reiserfs_key * key;
-  struct disk_child * dc;
-
-  RFALSE( cur == NULL, "buffer is 0");
-  RFALSE( del_num < 0,
-          "negative number of items (%d) can not be deleted", del_num);
-  RFALSE( first_p < 0 || first_p + del_num > B_NR_ITEMS (cur) + 1 || first_i < 0,
-          "first pointer order (%d) < 0 or "
-          "no so many pointers (%d), only (%d) or "
-          "first key order %d < 0", first_p, 
-          first_p + del_num, B_NR_ITEMS (cur) + 1, first_i);
-  if ( del_num == 0 )
-    return;
-
-  blkh = B_BLK_HEAD(cur);
-  nr = blkh_nr_item(blkh);
-
-  if ( first_p == 0 && del_num == nr + 1 ) {
-    RFALSE( first_i != 0, "1st deleted key must have order 0, not %d", first_i);
-    make_empty_node (cur_bi);
-    return;
-  }
-
-  RFALSE( first_i + del_num > B_NR_ITEMS (cur),
-          "first_i = %d del_num = %d "
-          "no so many keys (%d) in the node (%b)(%z)",
-          first_i, del_num, first_i + del_num, cur, cur);
-
-
-  /* deleting */
-  dc = B_N_CHILD (cur, first_p);
-
-  memmove (dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE);
-  key = B_N_PDELIM_KEY (cur, first_i);
-  memmove (key, key + del_num, (nr - first_i - del_num) * KEY_SIZE + (nr + 1 - del_num) * DC_SIZE);
-
-
-  /* sizes, item number */
-  set_blkh_nr_item( blkh, blkh_nr_item(blkh) - del_num );
-  set_blkh_free_space( blkh,
-                    blkh_free_space(blkh) + (del_num * (KEY_SIZE + DC_SIZE) ) );
-
-  do_balance_mark_internal_dirty (cur_bi->tb, cur, 0);
-  /*&&&&&&&&&&&&&&&&&&&&&&&*/
-  check_internal (cur);
-  /*&&&&&&&&&&&&&&&&&&&&&&&*/
- 
-  if (cur_bi->bi_parent) {
-    struct disk_child *t_dc;
-    t_dc = B_N_CHILD (cur_bi->bi_parent, cur_bi->bi_position);
-    put_dc_size( t_dc, dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE) ) );
-
-    do_balance_mark_internal_dirty (cur_bi->tb, cur_bi->bi_parent,0);
-    /*&&&&&&&&&&&&&&&&&&&&&&&&*/
-    check_internal (cur_bi->bi_parent);
-    /*&&&&&&&&&&&&&&&&&&&&&&&&*/   
-  }
-}
+	struct buffer_head *cur = cur_bi->bi_bh;
+	int nr;
+	struct block_head *blkh;
+	struct reiserfs_key *key;
+	struct disk_child *dc;
+
+	RFALSE(cur == NULL, "buffer is 0");
+	RFALSE(del_num < 0,
+	       "negative number of items (%d) can not be deleted", del_num);
+	RFALSE(first_p < 0 || first_p + del_num > B_NR_ITEMS(cur) + 1
+	       || first_i < 0,
+	       "first pointer order (%d) < 0 or "
+	       "no so many pointers (%d), only (%d) or "
+	       "first key order %d < 0", first_p, first_p + del_num,
+	       B_NR_ITEMS(cur) + 1, first_i);
+	if (del_num == 0)
+		return;
+
+	blkh = B_BLK_HEAD(cur);
+	nr = blkh_nr_item(blkh);
+
+	if (first_p == 0 && del_num == nr + 1) {
+		RFALSE(first_i != 0,
+		       "1st deleted key must have order 0, not %d", first_i);
+		make_empty_node(cur_bi);
+		return;
+	}
 
+	RFALSE(first_i + del_num > B_NR_ITEMS(cur),
+	       "first_i = %d del_num = %d "
+	       "no so many keys (%d) in the node (%b)(%z)",
+	       first_i, del_num, first_i + del_num, cur, cur);
+
+	/* deleting */
+	dc = B_N_CHILD(cur, first_p);
+
+	memmove(dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE);
+	key = B_N_PDELIM_KEY(cur, first_i);
+	memmove(key, key + del_num,
+		(nr - first_i - del_num) * KEY_SIZE + (nr + 1 -
+						       del_num) * DC_SIZE);
+
+	/* sizes, item number */
+	set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num);
+	set_blkh_free_space(blkh,
+			    blkh_free_space(blkh) +
+			    (del_num * (KEY_SIZE + DC_SIZE)));
+
+	do_balance_mark_internal_dirty(cur_bi->tb, cur, 0);
+	/*&&&&&&&&&&&&&&&&&&&&&&& */
+	check_internal(cur);
+	/*&&&&&&&&&&&&&&&&&&&&&&& */
+
+	if (cur_bi->bi_parent) {
+		struct disk_child *t_dc;
+		t_dc = B_N_CHILD(cur_bi->bi_parent, cur_bi->bi_position);
+		put_dc_size(t_dc,
+			    dc_size(t_dc) - (del_num * (KEY_SIZE + DC_SIZE)));
+
+		do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent,
+					       0);
+		/*&&&&&&&&&&&&&&&&&&&&&&&& */
+		check_internal(cur_bi->bi_parent);
+		/*&&&&&&&&&&&&&&&&&&&&&&&& */
+	}
+}
 
 /* delete n node pointers and items starting from given position */
-static void  internal_delete_childs (struct buffer_info * cur_bi, 
-				     int from, int n)
+static void internal_delete_childs(struct buffer_info *cur_bi, int from, int n)
 {
-  int i_from;
+	int i_from;
 
-  i_from = (from == 0) ? from : from - 1;
+	i_from = (from == 0) ? from : from - 1;
 
-  /* delete n pointers starting from `from' position in CUR;
-     delete n keys starting from 'i_from' position in CUR;
-     */
-  internal_delete_pointers_items (cur_bi, from, i_from, n);
+	/* delete n pointers starting at position 'from' in CUR;
+	   delete n keys starting at position 'i_from' in CUR
+	 */
+	internal_delete_pointers_items(cur_bi, from, i_from, n);
 }
 
-
 /* copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest
 * last_first == FIRST_TO_LAST means, that we copy first items from src to tail of dest
  * last_first == LAST_TO_FIRST means, that we copy last items from src to head of dest 
  */
-static void internal_copy_pointers_items (
-					  struct buffer_info * dest_bi,
-					  struct buffer_head * src,
-					  int last_first, int cpy_num
-					  )
+static void internal_copy_pointers_items(struct buffer_info *dest_bi,
+					 struct buffer_head *src,
+					 int last_first, int cpy_num)
 {
-  /* ATTENTION! Number of node pointers in DEST is equal to number of items in DEST *
-   * as delimiting key have already inserted to buffer dest.*/
-  struct buffer_head * dest = dest_bi->bi_bh;
-  int nr_dest, nr_src;
-  int dest_order, src_order;
-  struct block_head * blkh;
-  struct reiserfs_key * key;
-  struct disk_child * dc;
-
-  nr_src = B_NR_ITEMS (src);
-
-  RFALSE( dest == NULL || src == NULL, 
-	  "src (%p) or dest (%p) buffer is 0", src, dest);
-  RFALSE( last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
-	  "invalid last_first parameter (%d)", last_first);
-  RFALSE( nr_src < cpy_num - 1, 
-	  "no so many items (%d) in src (%d)", cpy_num, nr_src);
-  RFALSE( cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num);
-  RFALSE( cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest),
-	  "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)",
-	  cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest));
-
-  if ( cpy_num == 0 )
-    return;
+	/* ATTENTION! Number of node pointers in DEST is equal to number of items in DEST,
+	 * as the delimiting key has already been inserted into buffer dest. */
+	struct buffer_head *dest = dest_bi->bi_bh;
+	int nr_dest, nr_src;
+	int dest_order, src_order;
+	struct block_head *blkh;
+	struct reiserfs_key *key;
+	struct disk_child *dc;
+
+	nr_src = B_NR_ITEMS(src);
+
+	RFALSE(dest == NULL || src == NULL,
+	       "src (%p) or dest (%p) buffer is 0", src, dest);
+	RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
+	       "invalid last_first parameter (%d)", last_first);
+	RFALSE(nr_src < cpy_num - 1,
+	       "no so many items (%d) in src (%d)", cpy_num, nr_src);
+	RFALSE(cpy_num < 0, "cpy_num less than 0 (%d)", cpy_num);
+	RFALSE(cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest),
+	       "cpy_num (%d) + item number in dest (%d) can not be > MAX_NR_KEY(%d)",
+	       cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest));
+
+	if (cpy_num == 0)
+		return;
 
 	/* coping */
-  blkh = B_BLK_HEAD(dest);
-  nr_dest = blkh_nr_item(blkh);
+	blkh = B_BLK_HEAD(dest);
+	nr_dest = blkh_nr_item(blkh);
 
-  /*dest_order = (last_first == LAST_TO_FIRST) ? 0 : nr_dest;*/
-  /*src_order = (last_first == LAST_TO_FIRST) ? (nr_src - cpy_num + 1) : 0;*/
-  (last_first == LAST_TO_FIRST) ?	(dest_order = 0, src_order = nr_src - cpy_num + 1) :
-    (dest_order = nr_dest, src_order = 0);
+	/*dest_order = (last_first == LAST_TO_FIRST) ? 0 : nr_dest; */
+	/*src_order = (last_first == LAST_TO_FIRST) ? (nr_src - cpy_num + 1) : 0; */
+	(last_first == LAST_TO_FIRST) ? (dest_order = 0, src_order =
+					 nr_src - cpy_num + 1) : (dest_order =
+								  nr_dest,
+								  src_order =
+								  0);
 
-  /* prepare space for cpy_num pointers */
-  dc = B_N_CHILD (dest, dest_order);
+	/* prepare space for cpy_num pointers */
+	dc = B_N_CHILD(dest, dest_order);
 
-  memmove (dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE);
+	memmove(dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE);
 
 	/* insert pointers */
-  memcpy (dc, B_N_CHILD (src, src_order), DC_SIZE * cpy_num);
-
-
-  /* prepare space for cpy_num - 1 item headers */
-  key = B_N_PDELIM_KEY(dest, dest_order);
-  memmove (key + cpy_num - 1, key,
-	   KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest + cpy_num));
-
-
-  /* insert headers */
-  memcpy (key, B_N_PDELIM_KEY (src, src_order), KEY_SIZE * (cpy_num - 1));
-
-  /* sizes, item number */
-  set_blkh_nr_item( blkh, blkh_nr_item(blkh) + (cpy_num - 1 ) );
-  set_blkh_free_space( blkh,
-      blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) + DC_SIZE * cpy_num ) );
-
-  do_balance_mark_internal_dirty (dest_bi->tb, dest, 0);
-
-  /*&&&&&&&&&&&&&&&&&&&&&&&&*/
-  check_internal (dest);
-  /*&&&&&&&&&&&&&&&&&&&&&&&&*/
-
-  if (dest_bi->bi_parent) {
-    struct disk_child *t_dc;
-    t_dc = B_N_CHILD(dest_bi->bi_parent,dest_bi->bi_position);
-    put_dc_size( t_dc, dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) + DC_SIZE * cpy_num) );
-
-    do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent,0);
-    /*&&&&&&&&&&&&&&&&&&&&&&&&*/
-    check_internal (dest_bi->bi_parent);
-    /*&&&&&&&&&&&&&&&&&&&&&&&&*/   
-  }
+	memcpy(dc, B_N_CHILD(src, src_order), DC_SIZE * cpy_num);
+
+	/* prepare space for cpy_num - 1 item headers */
+	key = B_N_PDELIM_KEY(dest, dest_order);
+	memmove(key + cpy_num - 1, key,
+		KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest +
+							       cpy_num));
+
+	/* insert headers */
+	memcpy(key, B_N_PDELIM_KEY(src, src_order), KEY_SIZE * (cpy_num - 1));
+
+	/* sizes, item number */
+	set_blkh_nr_item(blkh, blkh_nr_item(blkh) + (cpy_num - 1));
+	set_blkh_free_space(blkh,
+			    blkh_free_space(blkh) - (KEY_SIZE * (cpy_num - 1) +
+						     DC_SIZE * cpy_num));
+
+	do_balance_mark_internal_dirty(dest_bi->tb, dest, 0);
+
+	/*&&&&&&&&&&&&&&&&&&&&&&&& */
+	check_internal(dest);
+	/*&&&&&&&&&&&&&&&&&&&&&&&& */
+
+	if (dest_bi->bi_parent) {
+		struct disk_child *t_dc;
+		t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
+		put_dc_size(t_dc,
+			    dc_size(t_dc) + (KEY_SIZE * (cpy_num - 1) +
+					     DC_SIZE * cpy_num));
+
+		do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
+					       0);
+		/*&&&&&&&&&&&&&&&&&&&&&&&& */
+		check_internal(dest_bi->bi_parent);
+		/*&&&&&&&&&&&&&&&&&&&&&&&& */
+	}
 
 }
 
-
 /* Copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest.
  * Delete cpy_num - del_par items and node pointers from buffer src.
  * last_first == FIRST_TO_LAST means, that we copy/delete first items from src.
  * last_first == LAST_TO_FIRST means, that we copy/delete last items from src.
  */
-static void internal_move_pointers_items (struct buffer_info * dest_bi, 
-					  struct buffer_info * src_bi, 
-					  int last_first, int cpy_num, int del_par)
+static void internal_move_pointers_items(struct buffer_info *dest_bi,
+					 struct buffer_info *src_bi,
+					 int last_first, int cpy_num,
+					 int del_par)
 {
-    int first_pointer;
-    int first_item;
-    
-    internal_copy_pointers_items (dest_bi, src_bi->bi_bh, last_first, cpy_num);
-
-    if (last_first == FIRST_TO_LAST) {	/* shift_left occurs */
-	first_pointer = 0;
-	first_item = 0;
-	/* delete cpy_num - del_par pointers and keys starting for pointers with first_pointer, 
-	   for key - with first_item */
-	internal_delete_pointers_items (src_bi, first_pointer, first_item, cpy_num - del_par);
-    } else {			/* shift_right occurs */
-	int i, j;
-
-	i = ( cpy_num - del_par == ( j = B_NR_ITEMS(src_bi->bi_bh)) + 1 ) ? 0 : j - cpy_num + del_par;
-
-	internal_delete_pointers_items (src_bi, j + 1 - cpy_num + del_par, i, cpy_num - del_par);
-    }
+	int first_pointer;
+	int first_item;
+
+	internal_copy_pointers_items(dest_bi, src_bi->bi_bh, last_first,
+				     cpy_num);
+
+	if (last_first == FIRST_TO_LAST) {	/* shift_left occurs */
+		first_pointer = 0;
+		first_item = 0;
+		/* delete cpy_num - del_par pointers and keys: pointers starting at first_pointer,
+		   keys starting at first_item */
+		internal_delete_pointers_items(src_bi, first_pointer,
+					       first_item, cpy_num - del_par);
+	} else {		/* shift_right occurs */
+		int i, j;
+
+		i = (cpy_num - del_par ==
+		     (j =
+		      B_NR_ITEMS(src_bi->bi_bh)) + 1) ? 0 : j - cpy_num +
+		    del_par;
+
+		internal_delete_pointers_items(src_bi,
+					       j + 1 - cpy_num + del_par, i,
+					       cpy_num - del_par);
+	}
 }
 
 /* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */
-static void internal_insert_key (struct buffer_info * dest_bi, 
-				 int dest_position_before,                 /* insert key before key with n_dest number */
-				 struct buffer_head * src, 
-				 int src_position)
+static void internal_insert_key(struct buffer_info *dest_bi, int dest_position_before,	/* insert key before key with n_dest number */
+				struct buffer_head *src, int src_position)
 {
-    struct buffer_head * dest = dest_bi->bi_bh;
-    int nr;
-    struct block_head * blkh;
-    struct reiserfs_key * key;
-
-    RFALSE( dest == NULL || src == NULL,
-	    "source(%p) or dest(%p) buffer is 0", src, dest);
-    RFALSE( dest_position_before < 0 || src_position < 0,
-	    "source(%d) or dest(%d) key number less than 0", 
-	    src_position, dest_position_before);
-    RFALSE( dest_position_before > B_NR_ITEMS (dest) || 
-	    src_position >= B_NR_ITEMS(src),
-	    "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))",
-	    dest_position_before, B_NR_ITEMS (dest), 
-	    src_position, B_NR_ITEMS(src));
-    RFALSE( B_FREE_SPACE (dest) < KEY_SIZE,
-	    "no enough free space (%d) in dest buffer", B_FREE_SPACE (dest));
-
-    blkh = B_BLK_HEAD(dest);
-    nr = blkh_nr_item(blkh);
-
-    /* prepare space for inserting key */
-    key = B_N_PDELIM_KEY (dest, dest_position_before);
-    memmove (key + 1, key, (nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE);
-
-    /* insert key */
-    memcpy (key, B_N_PDELIM_KEY(src, src_position), KEY_SIZE);
-
-    /* Change dirt, free space, item number fields. */
-
-    set_blkh_nr_item( blkh, blkh_nr_item(blkh) + 1 );
-    set_blkh_free_space( blkh, blkh_free_space(blkh) - KEY_SIZE );
-
-    do_balance_mark_internal_dirty (dest_bi->tb, dest, 0);
-
-    if (dest_bi->bi_parent) {
-	struct disk_child *t_dc;
-	t_dc = B_N_CHILD(dest_bi->bi_parent,dest_bi->bi_position);
-	put_dc_size( t_dc, dc_size(t_dc) + KEY_SIZE );
-
-	do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent,0);
-    }
+	struct buffer_head *dest = dest_bi->bi_bh;
+	int nr;
+	struct block_head *blkh;
+	struct reiserfs_key *key;
+
+	RFALSE(dest == NULL || src == NULL,
+	       "source(%p) or dest(%p) buffer is 0", src, dest);
+	RFALSE(dest_position_before < 0 || src_position < 0,
+	       "source(%d) or dest(%d) key number less than 0",
+	       src_position, dest_position_before);
+	RFALSE(dest_position_before > B_NR_ITEMS(dest) ||
+	       src_position >= B_NR_ITEMS(src),
+	       "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))",
+	       dest_position_before, B_NR_ITEMS(dest),
+	       src_position, B_NR_ITEMS(src));
+	RFALSE(B_FREE_SPACE(dest) < KEY_SIZE,
+	       "no enough free space (%d) in dest buffer", B_FREE_SPACE(dest));
+
+	blkh = B_BLK_HEAD(dest);
+	nr = blkh_nr_item(blkh);
+
+	/* prepare space for inserting key */
+	key = B_N_PDELIM_KEY(dest, dest_position_before);
+	memmove(key + 1, key,
+		(nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE);
+
+	/* insert key */
+	memcpy(key, B_N_PDELIM_KEY(src, src_position), KEY_SIZE);
+
+	/* Update the item count and free space, then mark the buffer dirty. */
+
+	set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1);
+	set_blkh_free_space(blkh, blkh_free_space(blkh) - KEY_SIZE);
+
+	do_balance_mark_internal_dirty(dest_bi->tb, dest, 0);
+
+	if (dest_bi->bi_parent) {
+		struct disk_child *t_dc;
+		t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
+		put_dc_size(t_dc, dc_size(t_dc) + KEY_SIZE);
+
+		do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
+					       0);
+	}
 }
 
-
-
 /* Insert d_key'th (delimiting) key from buffer cfl to tail of dest. 
  * Copy pointer_amount node pointers and pointer_amount - 1 items from buffer src to buffer dest.
  * Replace  d_key'th key in buffer cfl.
  * Delete pointer_amount items and node pointers from buffer src.
  */
 /* this can be invoked both to shift from S to L and from R to S */
-static void	internal_shift_left (
-				     int mode,	/* INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S */
-				     struct tree_balance * tb,
-				     int h,
-				     int pointer_amount
-				     )
+static void internal_shift_left(int mode,	/* INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S */
+				struct tree_balance *tb,
+				int h, int pointer_amount)
 {
-  struct buffer_info dest_bi, src_bi;
-  struct buffer_head * cf;
-  int d_key_position;
-
-  internal_define_dest_src_infos (mode, tb, h, &dest_bi, &src_bi, &d_key_position, &cf);
-
-  /*printk("pointer_amount = %d\n",pointer_amount);*/
-
-  if (pointer_amount) {
-    /* insert delimiting key from common father of dest and src to node dest into position B_NR_ITEM(dest) */
-    internal_insert_key (&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, d_key_position);
-
-    if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) {
-      if (src_bi.bi_position/*src->b_item_order*/ == 0)
-	replace_key (tb, cf, d_key_position, src_bi.bi_parent/*src->b_parent*/, 0);
-    } else
-      replace_key (tb, cf, d_key_position, src_bi.bi_bh, pointer_amount - 1);
-  }
-  /* last parameter is del_parameter */
-  internal_move_pointers_items (&dest_bi, &src_bi, FIRST_TO_LAST, pointer_amount, 0);
+	struct buffer_info dest_bi, src_bi;
+	struct buffer_head *cf;
+	int d_key_position;
+
+	internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi,
+				       &d_key_position, &cf);
+
+	/*printk("pointer_amount = %d\n",pointer_amount); */
+
+	if (pointer_amount) {
+		/* insert the delimiting key from the common father of dest and src into node dest at position B_NR_ITEMS(dest) */
+		internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
+				    d_key_position);
+
+		if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) {
+			if (src_bi.bi_position /*src->b_item_order */  == 0)
+				replace_key(tb, cf, d_key_position,
+					    src_bi.
+					    bi_parent /*src->b_parent */ , 0);
+		} else
+			replace_key(tb, cf, d_key_position, src_bi.bi_bh,
+				    pointer_amount - 1);
+	}
+	/* last parameter is del_parameter */
+	internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST,
+				     pointer_amount, 0);
 
 }
 
@@ -493,67 +498,66 @@ static void	internal_shift_left (
  * Delete n - 1 items and node pointers from buffer S[h].
  */
 /* it always shifts from S[h] to L[h] */
-static void	internal_shift1_left (
-				      struct tree_balance * tb, 
-				      int h, 
-				      int pointer_amount
-				      )
+static void internal_shift1_left(struct tree_balance *tb,
+				 int h, int pointer_amount)
 {
-  struct buffer_info dest_bi, src_bi;
-  struct buffer_head * cf;
-  int d_key_position;
+	struct buffer_info dest_bi, src_bi;
+	struct buffer_head *cf;
+	int d_key_position;
 
-  internal_define_dest_src_infos (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, &dest_bi, &src_bi, &d_key_position, &cf);
+	internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
+				       &dest_bi, &src_bi, &d_key_position, &cf);
 
-  if ( pointer_amount > 0 ) /* insert lkey[h]-th key  from CFL[h] to left neighbor L[h] */
-    internal_insert_key (&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, d_key_position);
-  /*		internal_insert_key (tb->L[h], B_NR_ITEM(tb->L[h]), tb->CFL[h], tb->lkey[h]);*/
+	if (pointer_amount > 0)	/* insert lkey[h]-th key  from CFL[h] to left neighbor L[h] */
+		internal_insert_key(&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf,
+				    d_key_position);
+	/*            internal_insert_key (tb->L[h], B_NR_ITEM(tb->L[h]), tb->CFL[h], tb->lkey[h]); */
 
-  /* last parameter is del_parameter */
-  internal_move_pointers_items (&dest_bi, &src_bi, FIRST_TO_LAST, pointer_amount, 1);
-  /*	internal_move_pointers_items (tb->L[h], tb->S[h], FIRST_TO_LAST, pointer_amount, 1);*/
+	/* last parameter is del_parameter */
+	internal_move_pointers_items(&dest_bi, &src_bi, FIRST_TO_LAST,
+				     pointer_amount, 1);
+	/*    internal_move_pointers_items (tb->L[h], tb->S[h], FIRST_TO_LAST, pointer_amount, 1); */
 }
 
-
 /* Insert d_key'th (delimiting) key from buffer cfr to head of dest. 
  * Copy n node pointers and n - 1 items from buffer src to buffer dest.
  * Replace  d_key'th key in buffer cfr.
  * Delete n items and node pointers from buffer src.
  */
-static void internal_shift_right (
-				  int mode,	/* INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S */
-				  struct tree_balance * tb,
-				  int h,
-				  int pointer_amount
-				  )
+static void internal_shift_right(int mode,	/* INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S */
+				 struct tree_balance *tb,
+				 int h, int pointer_amount)
 {
-  struct buffer_info dest_bi, src_bi;
-  struct buffer_head * cf;
-  int d_key_position;
-  int nr;
-
-
-  internal_define_dest_src_infos (mode, tb, h, &dest_bi, &src_bi, &d_key_position, &cf);
-
-  nr = B_NR_ITEMS (src_bi.bi_bh);
-
-  if (pointer_amount > 0) {
-    /* insert delimiting key from common father of dest and src to dest node into position 0 */
-    internal_insert_key (&dest_bi, 0, cf, d_key_position);
-    if (nr == pointer_amount - 1) {
-	 RFALSE( src_bi.bi_bh != PATH_H_PBUFFER (tb->tb_path, h)/*tb->S[h]*/ || 
-		 dest_bi.bi_bh != tb->R[h],
-		 "src (%p) must be == tb->S[h](%p) when it disappears",
-		 src_bi.bi_bh, PATH_H_PBUFFER (tb->tb_path, h));
-      /* when S[h] disappers replace left delemiting key as well */
-      if (tb->CFL[h])
-	replace_key (tb, cf, d_key_position, tb->CFL[h], tb->lkey[h]);
-    } else
-      replace_key (tb, cf, d_key_position, src_bi.bi_bh, nr - pointer_amount);
-  }      
-
-  /* last parameter is del_parameter */
-  internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, pointer_amount, 0);
+	struct buffer_info dest_bi, src_bi;
+	struct buffer_head *cf;
+	int d_key_position;
+	int nr;
+
+	internal_define_dest_src_infos(mode, tb, h, &dest_bi, &src_bi,
+				       &d_key_position, &cf);
+
+	nr = B_NR_ITEMS(src_bi.bi_bh);
+
+	if (pointer_amount > 0) {
+		/* insert delimiting key from common father of dest and src to dest node into position 0 */
+		internal_insert_key(&dest_bi, 0, cf, d_key_position);
+		if (nr == pointer_amount - 1) {
+			RFALSE(src_bi.bi_bh != PATH_H_PBUFFER(tb->tb_path, h) /*tb->S[h] */ ||
+			       dest_bi.bi_bh != tb->R[h],
+			       "src (%p) must be == tb->S[h](%p) when it disappears",
+			       src_bi.bi_bh, PATH_H_PBUFFER(tb->tb_path, h));
+			/* when S[h] disappers replace left delemiting key as well */
+			if (tb->CFL[h])
+				replace_key(tb, cf, d_key_position, tb->CFL[h],
+					    tb->lkey[h]);
+		} else
+			replace_key(tb, cf, d_key_position, src_bi.bi_bh,
+				    nr - pointer_amount);
+	}
+
+	/* last parameter is del_parameter */
+	internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST,
+				     pointer_amount, 0);
 }
 
 /* Insert delimiting key to R[h].
@@ -561,498 +565,526 @@ static void internal_shift_right (
  * Delete n - 1 items and node pointers from buffer S[h].
  */
 /* it always shift from S[h] to R[h] */
-static void	internal_shift1_right (
-				       struct tree_balance * tb, 
-				       int h, 
-				       int pointer_amount
-				       )
+static void internal_shift1_right(struct tree_balance *tb,
+				  int h, int pointer_amount)
 {
-  struct buffer_info dest_bi, src_bi;
-  struct buffer_head * cf;
-  int d_key_position;
-
-  internal_define_dest_src_infos (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, &dest_bi, &src_bi, &d_key_position, &cf);
-
-  if (pointer_amount > 0) /* insert rkey from CFR[h] to right neighbor R[h] */
-    internal_insert_key (&dest_bi, 0, cf, d_key_position);
-  /*		internal_insert_key (tb->R[h], 0, tb->CFR[h], tb->rkey[h]);*/
-	
-  /* last parameter is del_parameter */
-  internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, pointer_amount, 1);
-  /*	internal_move_pointers_items (tb->R[h], tb->S[h], LAST_TO_FIRST, pointer_amount, 1);*/
-}
+	struct buffer_info dest_bi, src_bi;
+	struct buffer_head *cf;
+	int d_key_position;
+
+	internal_define_dest_src_infos(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
+				       &dest_bi, &src_bi, &d_key_position, &cf);
+
+	if (pointer_amount > 0)	/* insert rkey from CFR[h] to right neighbor R[h] */
+		internal_insert_key(&dest_bi, 0, cf, d_key_position);
+	/*            internal_insert_key (tb->R[h], 0, tb->CFR[h], tb->rkey[h]); */
 
+	/* last parameter is del_parameter */
+	internal_move_pointers_items(&dest_bi, &src_bi, LAST_TO_FIRST,
+				     pointer_amount, 1);
+	/*    internal_move_pointers_items (tb->R[h], tb->S[h], LAST_TO_FIRST, pointer_amount, 1); */
+}
 
 /* Delete insert_num node pointers together with their left items
  * and balance current node.*/
-static void balance_internal_when_delete (struct tree_balance * tb, 
-					  int h, int child_pos)
+static void balance_internal_when_delete(struct tree_balance *tb,
+					 int h, int child_pos)
 {
-    int insert_num;
-    int n;
-    struct buffer_head * tbSh = PATH_H_PBUFFER (tb->tb_path, h);
-    struct buffer_info bi;
-
-    insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE));
-  
-    /* delete child-node-pointer(s) together with their left item(s) */
-    bi.tb = tb;
-    bi.bi_bh = tbSh;
-    bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h);
-    bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
-
-    internal_delete_childs (&bi, child_pos, -insert_num);
-
-    RFALSE( tb->blknum[h] > 1,
-	    "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]);
-
-    n = B_NR_ITEMS(tbSh);
-
-    if ( tb->lnum[h] == 0 && tb->rnum[h] == 0 ) {
-	if ( tb->blknum[h] == 0 ) {
-	    /* node S[h] (root of the tree) is empty now */
-	    struct buffer_head *new_root;
-
-	    RFALSE( n || B_FREE_SPACE (tbSh) != MAX_CHILD_SIZE(tbSh) - DC_SIZE,
-		    "buffer must have only 0 keys (%d)", n);
-	    RFALSE( bi.bi_parent, "root has parent (%p)", bi.bi_parent);
-		
-	    /* choose a new root */
-	    if ( ! tb->L[h-1] || ! B_NR_ITEMS(tb->L[h-1]) )
-		new_root = tb->R[h-1];
-	    else
-		new_root = tb->L[h-1];
-	    /* switch super block's tree root block number to the new value */
-            PUT_SB_ROOT_BLOCK( tb->tb_sb, new_root->b_blocknr );
-	    //REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --;
-            PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) - 1 );
-
-	    do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1);
-	    /*&&&&&&&&&&&&&&&&&&&&&&*/
-	    if (h > 1)
-		/* use check_internal if new root is an internal node */
-		check_internal (new_root);
-	    /*&&&&&&&&&&&&&&&&&&&&&&*/
-
-	    /* do what is needed for buffer thrown from tree */
-	    reiserfs_invalidate_buffer(tb, tbSh);
-	    return;
+	int insert_num;
+	int n;
+	struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h);
+	struct buffer_info bi;
+
+	insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE));
+
+	/* delete child-node-pointer(s) together with their left item(s) */
+	bi.tb = tb;
+	bi.bi_bh = tbSh;
+	bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
+	bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
+
+	internal_delete_childs(&bi, child_pos, -insert_num);
+
+	RFALSE(tb->blknum[h] > 1,
+	       "tb->blknum[%d]=%d when insert_size < 0", h, tb->blknum[h]);
+
+	n = B_NR_ITEMS(tbSh);
+
+	if (tb->lnum[h] == 0 && tb->rnum[h] == 0) {
+		if (tb->blknum[h] == 0) {
+			/* node S[h] (root of the tree) is empty now */
+			struct buffer_head *new_root;
+
+			RFALSE(n
+			       || B_FREE_SPACE(tbSh) !=
+			       MAX_CHILD_SIZE(tbSh) - DC_SIZE,
+			       "buffer must have only 0 keys (%d)", n);
+			RFALSE(bi.bi_parent, "root has parent (%p)",
+			       bi.bi_parent);
+
+			/* choose a new root */
+			if (!tb->L[h - 1] || !B_NR_ITEMS(tb->L[h - 1]))
+				new_root = tb->R[h - 1];
+			else
+				new_root = tb->L[h - 1];
+			/* switch super block's tree root block number to the new value */
+			PUT_SB_ROOT_BLOCK(tb->tb_sb, new_root->b_blocknr);
+			//REISERFS_SB(tb->tb_sb)->s_rs->s_tree_height --;
+			PUT_SB_TREE_HEIGHT(tb->tb_sb,
+					   SB_TREE_HEIGHT(tb->tb_sb) - 1);
+
+			do_balance_mark_sb_dirty(tb,
+						 REISERFS_SB(tb->tb_sb)->s_sbh,
+						 1);
+			/*&&&&&&&&&&&&&&&&&&&&&& */
+			if (h > 1)
+				/* use check_internal if new root is an internal node */
+				check_internal(new_root);
+			/*&&&&&&&&&&&&&&&&&&&&&& */
+
+			/* do what is needed for buffer thrown from tree */
+			reiserfs_invalidate_buffer(tb, tbSh);
+			return;
+		}
+		return;
+	}
+
+	if (tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1) {	/* join S[h] with L[h] */
+
+		RFALSE(tb->rnum[h] != 0,
+		       "invalid tb->rnum[%d]==%d when joining S[h] with L[h]",
+		       h, tb->rnum[h]);
+
+		internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1);
+		reiserfs_invalidate_buffer(tb, tbSh);
+
+		return;
+	}
+
+	if (tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1) {	/* join S[h] with R[h] */
+		RFALSE(tb->lnum[h] != 0,
+		       "invalid tb->lnum[%d]==%d when joining S[h] with R[h]",
+		       h, tb->lnum[h]);
+
+		internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1);
+
+		reiserfs_invalidate_buffer(tb, tbSh);
+		return;
 	}
-	return;
-    }
-
-    if ( tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1 ) { /* join S[h] with L[h] */
-
-	RFALSE( tb->rnum[h] != 0,
-		"invalid tb->rnum[%d]==%d when joining S[h] with L[h]",
-		h, tb->rnum[h]);
-
-	internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1);
-	reiserfs_invalidate_buffer(tb, tbSh);
-
-	return;
-    }
-
-    if ( tb->R[h] &&  tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1 ) { /* join S[h] with R[h] */
-	RFALSE( tb->lnum[h] != 0,
-		"invalid tb->lnum[%d]==%d when joining S[h] with R[h]",
-		h, tb->lnum[h]);
-
-	internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1);
-
-	reiserfs_invalidate_buffer(tb,tbSh);
-	return;
-    }
-
-    if ( tb->lnum[h] < 0 ) { /* borrow from left neighbor L[h] */
-	RFALSE( tb->rnum[h] != 0,
-		"wrong tb->rnum[%d]==%d when borrow from L[h]", h, tb->rnum[h]);
-	/*internal_shift_right (tb, h, tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], -tb->lnum[h]);*/
-	internal_shift_right (INTERNAL_SHIFT_FROM_L_TO_S, tb, h, -tb->lnum[h]);
-	return;
-    }
-
-    if ( tb->rnum[h] < 0 ) { /* borrow from right neighbor R[h] */
-	 RFALSE( tb->lnum[h] != 0,
-		 "invalid tb->lnum[%d]==%d when borrow from R[h]", 
-		 h, tb->lnum[h]);
-	internal_shift_left (INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]);/*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]);*/
-	return;
-    }
-
-    if ( tb->lnum[h] > 0 ) { /* split S[h] into two parts and put them into neighbors */
-	RFALSE( tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1,
-		"invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them",
-		h, tb->lnum[h], h, tb->rnum[h], n);
-
-	internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]);/*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]);*/
-	internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h]);
-
-	reiserfs_invalidate_buffer (tb, tbSh);
-
-	return;
-    }
-    reiserfs_panic (tb->tb_sb, "balance_internal_when_delete: unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d",
-		    h, tb->lnum[h], h, tb->rnum[h]);
-}
 
+	if (tb->lnum[h] < 0) {	/* borrow from left neighbor L[h] */
+		RFALSE(tb->rnum[h] != 0,
+		       "wrong tb->rnum[%d]==%d when borrow from L[h]", h,
+		       tb->rnum[h]);
+		/*internal_shift_right (tb, h, tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], -tb->lnum[h]); */
+		internal_shift_right(INTERNAL_SHIFT_FROM_L_TO_S, tb, h,
+				     -tb->lnum[h]);
+		return;
+	}
+
+	if (tb->rnum[h] < 0) {	/* borrow from right neighbor R[h] */
+		RFALSE(tb->lnum[h] != 0,
+		       "invalid tb->lnum[%d]==%d when borrow from R[h]",
+		       h, tb->lnum[h]);
+		internal_shift_left(INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]);	/*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]); */
+		return;
+	}
+
+	if (tb->lnum[h] > 0) {	/* split S[h] into two parts and put them into neighbors */
+		RFALSE(tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1,
+		       "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them",
+		       h, tb->lnum[h], h, tb->rnum[h], n);
+
+		internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]);	/*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]); */
+		internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
+				     tb->rnum[h]);
+
+		reiserfs_invalidate_buffer(tb, tbSh);
+
+		return;
+	}
+	reiserfs_panic(tb->tb_sb,
+		       "balance_internal_when_delete: unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d",
+		       h, tb->lnum[h], h, tb->rnum[h]);
+}
 
 /* Replace delimiting key of buffers L[h] and S[h] by the given key.*/
-static void replace_lkey (
-		      struct tree_balance * tb,
-		      int h,
-		      struct item_head * key
-		      )
+static void replace_lkey(struct tree_balance *tb, int h, struct item_head *key)
 {
-   RFALSE( tb->L[h] == NULL || tb->CFL[h] == NULL,
-	   "L[h](%p) and CFL[h](%p) must exist in replace_lkey", 
-	   tb->L[h], tb->CFL[h]);
+	RFALSE(tb->L[h] == NULL || tb->CFL[h] == NULL,
+	       "L[h](%p) and CFL[h](%p) must exist in replace_lkey",
+	       tb->L[h], tb->CFL[h]);
 
-  if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0)
-    return;
+	if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0)
+		return;
 
-  memcpy (B_N_PDELIM_KEY(tb->CFL[h],tb->lkey[h]), key, KEY_SIZE);
+	memcpy(B_N_PDELIM_KEY(tb->CFL[h], tb->lkey[h]), key, KEY_SIZE);
 
-  do_balance_mark_internal_dirty (tb, tb->CFL[h],0);
+	do_balance_mark_internal_dirty(tb, tb->CFL[h], 0);
 }
 
-
 /* Replace delimiting key of buffers S[h] and R[h] by the given key.*/
-static void replace_rkey (
-		      struct tree_balance * tb,
-		      int h,
-		      struct item_head * key
-		      )
+static void replace_rkey(struct tree_balance *tb, int h, struct item_head *key)
 {
-  RFALSE( tb->R[h] == NULL || tb->CFR[h] == NULL,
-	  "R[h](%p) and CFR[h](%p) must exist in replace_rkey", 
-	  tb->R[h], tb->CFR[h]);
-  RFALSE( B_NR_ITEMS(tb->R[h]) == 0,
-	  "R[h] can not be empty if it exists (item number=%d)", 
-	  B_NR_ITEMS(tb->R[h]));
+	RFALSE(tb->R[h] == NULL || tb->CFR[h] == NULL,
+	       "R[h](%p) and CFR[h](%p) must exist in replace_rkey",
+	       tb->R[h], tb->CFR[h]);
+	RFALSE(B_NR_ITEMS(tb->R[h]) == 0,
+	       "R[h] can not be empty if it exists (item number=%d)",
+	       B_NR_ITEMS(tb->R[h]));
 
-  memcpy (B_N_PDELIM_KEY(tb->CFR[h],tb->rkey[h]), key, KEY_SIZE);
+	memcpy(B_N_PDELIM_KEY(tb->CFR[h], tb->rkey[h]), key, KEY_SIZE);
 
-  do_balance_mark_internal_dirty (tb, tb->CFR[h], 0);
+	do_balance_mark_internal_dirty(tb, tb->CFR[h], 0);
 }
 
-
-int balance_internal (struct tree_balance * tb,			/* tree_balance structure 		*/
-		      int h,					/* level of the tree 			*/
-		      int child_pos,
-		      struct item_head * insert_key,		/* key for insertion on higher level   	*/
-		      struct buffer_head ** insert_ptr	/* node for insertion on higher level*/
+int balance_internal(struct tree_balance *tb,	/* tree_balance structure               */
+		     int h,	/* level of the tree                    */
+		     int child_pos, struct item_head *insert_key,	/* key for insertion on higher level    */
+		     struct buffer_head **insert_ptr	/* node for insertion on higher level */
     )
     /* if inserting/pasting
        {
-       child_pos is the position of the node-pointer in S[h] that	 *
-       pointed to S[h-1] before balancing of the h-1 level;		 *
+       child_pos is the position of the node-pointer in S[h] that        *
+       pointed to S[h-1] before balancing of the h-1 level;              *
        this means that new pointers and items must be inserted AFTER *
        child_pos
        }
        else 
        {
-   it is the position of the leftmost pointer that must be deleted (together with
-   its corresponding key to the left of the pointer)
-   as a result of the previous level's balancing.
-   }
-*/
+       it is the position of the leftmost pointer that must be deleted (together with
+       its corresponding key to the left of the pointer)
+       as a result of the previous level's balancing.
+       }
+     */
 {
-    struct buffer_head * tbSh = PATH_H_PBUFFER (tb->tb_path, h);
-    struct buffer_info bi;
-    int order;		/* we return this: it is 0 if there is no S[h], else it is tb->S[h]->b_item_order */
-    int insert_num, n, k;
-    struct buffer_head * S_new;
-    struct item_head new_insert_key;
-    struct buffer_head * new_insert_ptr = NULL;
-    struct item_head * new_insert_key_addr = insert_key;
-
-    RFALSE( h < 1, "h (%d) can not be < 1 on internal level", h);
-
-    PROC_INFO_INC( tb -> tb_sb, balance_at[ h ] );
-
-    order = ( tbSh ) ? PATH_H_POSITION (tb->tb_path, h + 1)/*tb->S[h]->b_item_order*/ : 0;
-
-  /* Using insert_size[h] calculate the number insert_num of items
-     that must be inserted to or deleted from S[h]. */
-    insert_num = tb->insert_size[h]/((int)(KEY_SIZE + DC_SIZE));
-
-    /* Check whether insert_num is proper **/
-    RFALSE( insert_num < -2  ||  insert_num > 2,
-	    "incorrect number of items inserted to the internal node (%d)", 
-	    insert_num);
-    RFALSE( h > 1  && (insert_num > 1 || insert_num < -1),
-	    "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level", 
-	    insert_num, h);
-
-    /* Make balance in case insert_num < 0 */
-    if ( insert_num < 0 ) {
-	balance_internal_when_delete (tb, h, child_pos);
-	return order;
-    }
- 
-    k = 0;
-    if ( tb->lnum[h] > 0 ) {
-	/* shift lnum[h] items from S[h] to the left neighbor L[h].
-	   check how many of new items fall into L[h] or CFL[h] after
-	   shifting */
-	n = B_NR_ITEMS (tb->L[h]); /* number of items in L[h] */
-	if ( tb->lnum[h] <= child_pos ) {
-	    /* new items don't fall into L[h] or CFL[h] */
-	    internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]);
-	    /*internal_shift_left (tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,tb->lnum[h]);*/
-	    child_pos -= tb->lnum[h];
-	} else if ( tb->lnum[h] > child_pos + insert_num ) {
-	    /* all new items fall into L[h] */
-	    internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h] - insert_num);
-	    /*			internal_shift_left(tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,
-				tb->lnum[h]-insert_num);
-	    */
-	    /* insert insert_num keys and node-pointers into L[h] */
-	    bi.tb = tb;
-	    bi.bi_bh = tb->L[h];
-	    bi.bi_parent = tb->FL[h];
-	    bi.bi_position = get_left_neighbor_position (tb, h);
-	    internal_insert_childs (&bi,/*tb->L[h], tb->S[h-1]->b_next*/ n + child_pos + 1,
-				    insert_num,insert_key,insert_ptr);
-
-	    insert_num = 0; 
-	} else {
-	    struct disk_child * dc;
-
-	    /* some items fall into L[h] or CFL[h], but some don't fall */
-	    internal_shift1_left(tb,h,child_pos+1);
-	    /* calculate number of new items that fall into L[h] */
-	    k = tb->lnum[h] - child_pos - 1;
-	    bi.tb = tb;
-	    bi.bi_bh = tb->L[h];
-	    bi.bi_parent = tb->FL[h];
-	    bi.bi_position = get_left_neighbor_position (tb, h);
-	    internal_insert_childs (&bi,/*tb->L[h], tb->S[h-1]->b_next,*/ n + child_pos + 1,k,
-				    insert_key,insert_ptr);
-
-	    replace_lkey(tb,h,insert_key + k);
-
-	    /* replace the first node-ptr in S[h] by node-ptr to insert_ptr[k] */
-	    dc = B_N_CHILD(tbSh, 0);
-	    put_dc_size( dc, MAX_CHILD_SIZE(insert_ptr[k]) - B_FREE_SPACE (insert_ptr[k]));
-	    put_dc_block_number( dc, insert_ptr[k]->b_blocknr );
-
-	    do_balance_mark_internal_dirty (tb, tbSh, 0);
-
-	    k++;
-	    insert_key += k;
-	    insert_ptr += k;
-	    insert_num -= k;
-	    child_pos = 0;
+	struct buffer_head *tbSh = PATH_H_PBUFFER(tb->tb_path, h);
+	struct buffer_info bi;
+	int order;		/* we return this: it is 0 if there is no S[h], else it is tb->S[h]->b_item_order */
+	int insert_num, n, k;
+	struct buffer_head *S_new;
+	struct item_head new_insert_key;
+	struct buffer_head *new_insert_ptr = NULL;
+	struct item_head *new_insert_key_addr = insert_key;
+
+	RFALSE(h < 1, "h (%d) can not be < 1 on internal level", h);
+
+	PROC_INFO_INC(tb->tb_sb, balance_at[h]);
+
+	order =
+	    (tbSh) ? PATH_H_POSITION(tb->tb_path,
+				     h + 1) /*tb->S[h]->b_item_order */ : 0;
+
+	/* Using insert_size[h] calculate the number insert_num of items
+	   that must be inserted to or deleted from S[h]. */
+	insert_num = tb->insert_size[h] / ((int)(KEY_SIZE + DC_SIZE));
+
+	/* Check whether insert_num is proper * */
+	RFALSE(insert_num < -2 || insert_num > 2,
+	       "incorrect number of items inserted to the internal node (%d)",
+	       insert_num);
+	RFALSE(h > 1 && (insert_num > 1 || insert_num < -1),
+	       "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level",
+	       insert_num, h);
+
+	/* Make balance in case insert_num < 0 */
+	if (insert_num < 0) {
+		balance_internal_when_delete(tb, h, child_pos);
+		return order;
 	}
-    }	/* tb->lnum[h] > 0 */
-
-    if ( tb->rnum[h] > 0 ) {
-	/*shift rnum[h] items from S[h] to the right neighbor R[h]*/
-	/* check how many of new items fall into R or CFR after shifting */
-	n = B_NR_ITEMS (tbSh); /* number of items in S[h] */
-	if ( n - tb->rnum[h] >= child_pos )
-	    /* new items fall into S[h] */
-	    /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],tb->rnum[h]);*/
-	    internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h]);
-	else
-	    if ( n + insert_num - tb->rnum[h] < child_pos )
-	    {
-		/* all new items fall into R[h] */
-		/*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],
-	    tb->rnum[h] - insert_num);*/
-		internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h] - insert_num);
-
-		/* insert insert_num keys and node-pointers into R[h] */
-		bi.tb = tb;
-		bi.bi_bh = tb->R[h];
-		bi.bi_parent = tb->FR[h];
-		bi.bi_position = get_right_neighbor_position (tb, h);
-		internal_insert_childs (&bi, /*tb->R[h],tb->S[h-1]->b_next*/ child_pos - n - insert_num + tb->rnum[h] - 1,
-					insert_num,insert_key,insert_ptr);
-		insert_num = 0;
-	    }
-	    else
-	    {
-		struct disk_child * dc;
-
-		/* one of the items falls into CFR[h] */
-		internal_shift1_right(tb,h,n - child_pos + 1);
-		/* calculate number of new items that fall into R[h] */
-		k = tb->rnum[h] - n + child_pos - 1;
-		bi.tb = tb;
-		bi.bi_bh = tb->R[h];
-		bi.bi_parent = tb->FR[h];
-		bi.bi_position = get_right_neighbor_position (tb, h);
-		internal_insert_childs (&bi, /*tb->R[h], tb->R[h]->b_child,*/ 0, k, insert_key + 1, insert_ptr + 1);
 
-		replace_rkey(tb,h,insert_key + insert_num - k - 1);
+	k = 0;
+	if (tb->lnum[h] > 0) {
+		/* shift lnum[h] items from S[h] to the left neighbor L[h].
+		   check how many of new items fall into L[h] or CFL[h] after
+		   shifting */
+		n = B_NR_ITEMS(tb->L[h]);	/* number of items in L[h] */
+		if (tb->lnum[h] <= child_pos) {
+			/* new items don't fall into L[h] or CFL[h] */
+			internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
+					    tb->lnum[h]);
+			/*internal_shift_left (tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,tb->lnum[h]); */
+			child_pos -= tb->lnum[h];
+		} else if (tb->lnum[h] > child_pos + insert_num) {
+			/* all new items fall into L[h] */
+			internal_shift_left(INTERNAL_SHIFT_FROM_S_TO_L, tb, h,
+					    tb->lnum[h] - insert_num);
+			/*                  internal_shift_left(tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,
+			   tb->lnum[h]-insert_num);
+			 */
+			/* insert insert_num keys and node-pointers into L[h] */
+			bi.tb = tb;
+			bi.bi_bh = tb->L[h];
+			bi.bi_parent = tb->FL[h];
+			bi.bi_position = get_left_neighbor_position(tb, h);
+			internal_insert_childs(&bi,
+					       /*tb->L[h], tb->S[h-1]->b_next */
+					       n + child_pos + 1,
+					       insert_num, insert_key,
+					       insert_ptr);
+
+			insert_num = 0;
+		} else {
+			struct disk_child *dc;
+
+			/* some items fall into L[h] or CFL[h], but some don't fall */
+			internal_shift1_left(tb, h, child_pos + 1);
+			/* calculate number of new items that fall into L[h] */
+			k = tb->lnum[h] - child_pos - 1;
+			bi.tb = tb;
+			bi.bi_bh = tb->L[h];
+			bi.bi_parent = tb->FL[h];
+			bi.bi_position = get_left_neighbor_position(tb, h);
+			internal_insert_childs(&bi,
+					       /*tb->L[h], tb->S[h-1]->b_next, */
+					       n + child_pos + 1, k,
+					       insert_key, insert_ptr);
+
+			replace_lkey(tb, h, insert_key + k);
+
+			/* replace the first node-ptr in S[h] by node-ptr to insert_ptr[k] */
+			dc = B_N_CHILD(tbSh, 0);
+			put_dc_size(dc,
+				    MAX_CHILD_SIZE(insert_ptr[k]) -
+				    B_FREE_SPACE(insert_ptr[k]));
+			put_dc_block_number(dc, insert_ptr[k]->b_blocknr);
+
+			do_balance_mark_internal_dirty(tb, tbSh, 0);
+
+			k++;
+			insert_key += k;
+			insert_ptr += k;
+			insert_num -= k;
+			child_pos = 0;
+		}
+	}
+	/* tb->lnum[h] > 0 */
+	if (tb->rnum[h] > 0) {
+		/*shift rnum[h] items from S[h] to the right neighbor R[h] */
+		/* check how many of new items fall into R or CFR after shifting */
+		n = B_NR_ITEMS(tbSh);	/* number of items in S[h] */
+		if (n - tb->rnum[h] >= child_pos)
+			/* new items fall into S[h] */
+			/*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],tb->rnum[h]); */
+			internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
+					     tb->rnum[h]);
+		else if (n + insert_num - tb->rnum[h] < child_pos) {
+			/* all new items fall into R[h] */
+			/*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],
+			   tb->rnum[h] - insert_num); */
+			internal_shift_right(INTERNAL_SHIFT_FROM_S_TO_R, tb, h,
+					     tb->rnum[h] - insert_num);
+
+			/* insert insert_num keys and node-pointers into R[h] */
+			bi.tb = tb;
+			bi.bi_bh = tb->R[h];
+			bi.bi_parent = tb->FR[h];
+			bi.bi_position = get_right_neighbor_position(tb, h);
+			internal_insert_childs(&bi,
+					       /*tb->R[h],tb->S[h-1]->b_next */
+					       child_pos - n - insert_num +
+					       tb->rnum[h] - 1,
+					       insert_num, insert_key,
+					       insert_ptr);
+			insert_num = 0;
+		} else {
+			struct disk_child *dc;
+
+			/* one of the items falls into CFR[h] */
+			internal_shift1_right(tb, h, n - child_pos + 1);
+			/* calculate number of new items that fall into R[h] */
+			k = tb->rnum[h] - n + child_pos - 1;
+			bi.tb = tb;
+			bi.bi_bh = tb->R[h];
+			bi.bi_parent = tb->FR[h];
+			bi.bi_position = get_right_neighbor_position(tb, h);
+			internal_insert_childs(&bi,
+					       /*tb->R[h], tb->R[h]->b_child, */
+					       0, k, insert_key + 1,
+					       insert_ptr + 1);
+
+			replace_rkey(tb, h, insert_key + insert_num - k - 1);
+
+			/* replace the first node-ptr in R[h] by node-ptr insert_ptr[insert_num-k-1] */
+			dc = B_N_CHILD(tb->R[h], 0);
+			put_dc_size(dc,
+				    MAX_CHILD_SIZE(insert_ptr
+						   [insert_num - k - 1]) -
+				    B_FREE_SPACE(insert_ptr
+						 [insert_num - k - 1]));
+			put_dc_block_number(dc,
+					    insert_ptr[insert_num - k -
+						       1]->b_blocknr);
+
+			do_balance_mark_internal_dirty(tb, tb->R[h], 0);
+
+			insert_num -= (k + 1);
+		}
+	}
 
-		/* replace the first node-ptr in R[h] by node-ptr insert_ptr[insert_num-k-1]*/
-		dc = B_N_CHILD(tb->R[h], 0);
-		put_dc_size( dc, MAX_CHILD_SIZE(insert_ptr[insert_num-k-1]) -
-    				    B_FREE_SPACE (insert_ptr[insert_num-k-1]));
-		put_dc_block_number( dc, insert_ptr[insert_num-k-1]->b_blocknr );
+    /** Fill new node that appears instead of S[h] **/
+	RFALSE(tb->blknum[h] > 2, "blknum can not be > 2 for internal level");
+	RFALSE(tb->blknum[h] < 0, "blknum can not be < 0");
 
-		do_balance_mark_internal_dirty (tb, tb->R[h],0);
+	if (!tb->blknum[h]) {	/* node S[h] is empty now */
+		RFALSE(!tbSh, "S[h] is equal NULL");
 
-		insert_num -= (k + 1);
-	    }
-    }
+		/* do what is needed for buffer thrown from tree */
+		reiserfs_invalidate_buffer(tb, tbSh);
+		return order;
+	}
 
-    /** Fill new node that appears instead of S[h] **/
-    RFALSE( tb->blknum[h] > 2, "blknum can not be > 2 for internal level");
-    RFALSE( tb->blknum[h] < 0, "blknum can not be < 0");
+	if (!tbSh) {
+		/* create new root */
+		struct disk_child *dc;
+		struct buffer_head *tbSh_1 = PATH_H_PBUFFER(tb->tb_path, h - 1);
+		struct block_head *blkh;
 
-    if ( ! tb->blknum[h] )
-    { /* node S[h] is empty now */
-	RFALSE( ! tbSh, "S[h] is equal NULL");
+		if (tb->blknum[h] != 1)
+			reiserfs_panic(NULL,
+				       "balance_internal: One new node required for creating the new root");
+		/* S[h] = empty buffer from the list FEB. */
+		tbSh = get_FEB(tb);
+		blkh = B_BLK_HEAD(tbSh);
+		set_blkh_level(blkh, h + 1);
 
-	/* do what is needed for buffer thrown from tree */
-	reiserfs_invalidate_buffer(tb,tbSh);
-	return order;
-    }
-
-    if ( ! tbSh ) {
-	/* create new root */
-	struct disk_child  * dc;
-	struct buffer_head * tbSh_1 = PATH_H_PBUFFER (tb->tb_path, h - 1);
-        struct block_head *  blkh;
-
-
-	if ( tb->blknum[h] != 1 )
-	    reiserfs_panic(NULL, "balance_internal: One new node required for creating the new root");
-	/* S[h] = empty buffer from the list FEB. */
-	tbSh = get_FEB (tb);
-        blkh = B_BLK_HEAD(tbSh);
-        set_blkh_level( blkh, h + 1 );
-
-	/* Put the unique node-pointer to S[h] that points to S[h-1]. */
-
-	dc = B_N_CHILD(tbSh, 0);
-	put_dc_block_number( dc, tbSh_1->b_blocknr );
-	put_dc_size( dc, (MAX_CHILD_SIZE (tbSh_1) - B_FREE_SPACE (tbSh_1)));
-
-	tb->insert_size[h] -= DC_SIZE;
-        set_blkh_free_space( blkh, blkh_free_space(blkh) - DC_SIZE );
-
-	do_balance_mark_internal_dirty (tb, tbSh, 0);
-
-	/*&&&&&&&&&&&&&&&&&&&&&&&&*/
-	check_internal (tbSh);
-	/*&&&&&&&&&&&&&&&&&&&&&&&&*/
-    
-    /* put new root into path structure */
-	PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) = tbSh;
-
-	/* Change root in structure super block. */
-        PUT_SB_ROOT_BLOCK( tb->tb_sb, tbSh->b_blocknr );
-        PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1 );
-	do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1);
-    }
-	
-    if ( tb->blknum[h] == 2 ) {
-	int snum;
-	struct buffer_info dest_bi, src_bi;
+		/* Put the unique node-pointer to S[h] that points to S[h-1]. */
+
+		dc = B_N_CHILD(tbSh, 0);
+		put_dc_block_number(dc, tbSh_1->b_blocknr);
+		put_dc_size(dc,
+			    (MAX_CHILD_SIZE(tbSh_1) - B_FREE_SPACE(tbSh_1)));
+
+		tb->insert_size[h] -= DC_SIZE;
+		set_blkh_free_space(blkh, blkh_free_space(blkh) - DC_SIZE);
 
+		do_balance_mark_internal_dirty(tb, tbSh, 0);
 
-	/* S_new = free buffer from list FEB */
-	S_new = get_FEB(tb);
-
-        set_blkh_level( B_BLK_HEAD(S_new), h + 1 );
-
-	dest_bi.tb = tb;
-	dest_bi.bi_bh = S_new;
-	dest_bi.bi_parent = NULL;
-	dest_bi.bi_position = 0;
-	src_bi.tb = tb;
-	src_bi.bi_bh = tbSh;
-	src_bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h);
-	src_bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
-		
-	n = B_NR_ITEMS (tbSh); /* number of items in S[h] */
-	snum = (insert_num + n + 1)/2;
-	if ( n - snum >= child_pos ) {
-	    /* new items don't fall into S_new */
-	    /*	store the delimiting key for the next level */
-	    /* new_insert_key = (n - snum)'th key in S[h] */
-	    memcpy (&new_insert_key,B_N_PDELIM_KEY(tbSh,n - snum),
-		    KEY_SIZE);
-	    /* last parameter is del_par */
-	    internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, snum, 0);
-	    /*            internal_move_pointers_items(S_new, tbSh, LAST_TO_FIRST, snum, 0);*/
-	} else if ( n + insert_num - snum < child_pos ) {
-	    /* all new items fall into S_new */
-	    /*	store the delimiting key for the next level */
-	    /* new_insert_key = (n + insert_item - snum)'th key in S[h] */
-	    memcpy(&new_insert_key,B_N_PDELIM_KEY(tbSh,n + insert_num - snum),
-		   KEY_SIZE);
-	    /* last parameter is del_par */
-	    internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, snum - insert_num, 0);
-	    /*			internal_move_pointers_items(S_new,tbSh,1,snum - insert_num,0);*/
-
-	    /* insert insert_num keys and node-pointers into S_new */
-	    internal_insert_childs (&dest_bi, /*S_new,tb->S[h-1]->b_next,*/child_pos - n - insert_num + snum - 1,
-				    insert_num,insert_key,insert_ptr);
-
-	    insert_num = 0;
-	} else {
-	    struct disk_child * dc;
-
-	    /* some items fall into S_new, but some don't fall */
-	    /* last parameter is del_par */
-	    internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, n - child_pos + 1, 1);
-	    /*			internal_move_pointers_items(S_new,tbSh,1,n - child_pos + 1,1);*/
-	    /* calculate number of new items that fall into S_new */
-	    k = snum - n + child_pos - 1;
-
-	    internal_insert_childs (&dest_bi, /*S_new,*/ 0, k, insert_key + 1, insert_ptr+1);
-
-	    /* new_insert_key = insert_key[insert_num - k - 1] */
-	    memcpy(&new_insert_key,insert_key + insert_num - k - 1,
-		   KEY_SIZE);
-	    /* replace first node-ptr in S_new by node-ptr to insert_ptr[insert_num-k-1] */
-
-	    dc = B_N_CHILD(S_new,0);
-	    put_dc_size( dc, (MAX_CHILD_SIZE(insert_ptr[insert_num-k-1]) -
-				B_FREE_SPACE(insert_ptr[insert_num-k-1])) );
-	    put_dc_block_number( dc, insert_ptr[insert_num-k-1]->b_blocknr );
-
-	    do_balance_mark_internal_dirty (tb, S_new,0);
-			
-	    insert_num -= (k + 1);
+		/*&&&&&&&&&&&&&&&&&&&&&&&& */
+		check_internal(tbSh);
+		/*&&&&&&&&&&&&&&&&&&&&&&&& */
+
+		/* put new root into path structure */
+		PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) =
+		    tbSh;
+
+		/* Change root in structure super block. */
+		PUT_SB_ROOT_BLOCK(tb->tb_sb, tbSh->b_blocknr);
+		PUT_SB_TREE_HEIGHT(tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1);
+		do_balance_mark_sb_dirty(tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1);
 	}
-	/* new_insert_ptr = node_pointer to S_new */
-	new_insert_ptr = S_new;
-
-	RFALSE (!buffer_journaled(S_new) || buffer_journal_dirty(S_new) ||
-		buffer_dirty (S_new),
-		"cm-00001: bad S_new (%b)", S_new);
-
-	// S_new is released in unfix_nodes
-    }
-
-    n = B_NR_ITEMS (tbSh); /*number of items in S[h] */
-
-	if ( 0 <= child_pos && child_pos <= n && insert_num > 0 ) {
-	    bi.tb = tb;
-	    bi.bi_bh = tbSh;
-	    bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h);
-	    bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1);
-		internal_insert_childs (
-		    &bi,/*tbSh,*/
-		    /*		( tb->S[h-1]->b_parent == tb->S[h] ) ? tb->S[h-1]->b_next :  tb->S[h]->b_child->b_next,*/
-		    child_pos,insert_num,insert_key,insert_ptr
-		    );
+
+	if (tb->blknum[h] == 2) {
+		int snum;
+		struct buffer_info dest_bi, src_bi;
+
+		/* S_new = free buffer from list FEB */
+		S_new = get_FEB(tb);
+
+		set_blkh_level(B_BLK_HEAD(S_new), h + 1);
+
+		dest_bi.tb = tb;
+		dest_bi.bi_bh = S_new;
+		dest_bi.bi_parent = NULL;
+		dest_bi.bi_position = 0;
+		src_bi.tb = tb;
+		src_bi.bi_bh = tbSh;
+		src_bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
+		src_bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
+
+		n = B_NR_ITEMS(tbSh);	/* number of items in S[h] */
+		snum = (insert_num + n + 1) / 2;
+		if (n - snum >= child_pos) {
+			/* new items don't fall into S_new */
+			/*  store the delimiting key for the next level */
+			/* new_insert_key = (n - snum)'th key in S[h] */
+			memcpy(&new_insert_key, B_N_PDELIM_KEY(tbSh, n - snum),
+			       KEY_SIZE);
+			/* last parameter is del_par */
+			internal_move_pointers_items(&dest_bi, &src_bi,
+						     LAST_TO_FIRST, snum, 0);
+			/*            internal_move_pointers_items(S_new, tbSh, LAST_TO_FIRST, snum, 0); */
+		} else if (n + insert_num - snum < child_pos) {
+			/* all new items fall into S_new */
+			/*  store the delimiting key for the next level */
+			/* new_insert_key = (n + insert_item - snum)'th key in S[h] */
+			memcpy(&new_insert_key,
+			       B_N_PDELIM_KEY(tbSh, n + insert_num - snum),
+			       KEY_SIZE);
+			/* last parameter is del_par */
+			internal_move_pointers_items(&dest_bi, &src_bi,
+						     LAST_TO_FIRST,
+						     snum - insert_num, 0);
+			/*                  internal_move_pointers_items(S_new,tbSh,1,snum - insert_num,0); */
+
+			/* insert insert_num keys and node-pointers into S_new */
+			internal_insert_childs(&dest_bi,
+					       /*S_new,tb->S[h-1]->b_next, */
+					       child_pos - n - insert_num +
+					       snum - 1,
+					       insert_num, insert_key,
+					       insert_ptr);
+
+			insert_num = 0;
+		} else {
+			struct disk_child *dc;
+
+			/* some items fall into S_new, but some don't fall */
+			/* last parameter is del_par */
+			internal_move_pointers_items(&dest_bi, &src_bi,
+						     LAST_TO_FIRST,
+						     n - child_pos + 1, 1);
+			/*                  internal_move_pointers_items(S_new,tbSh,1,n - child_pos + 1,1); */
+			/* calculate number of new items that fall into S_new */
+			k = snum - n + child_pos - 1;
+
+			internal_insert_childs(&dest_bi, /*S_new, */ 0, k,
+					       insert_key + 1, insert_ptr + 1);
+
+			/* new_insert_key = insert_key[insert_num - k - 1] */
+			memcpy(&new_insert_key, insert_key + insert_num - k - 1,
+			       KEY_SIZE);
+			/* replace first node-ptr in S_new by node-ptr to insert_ptr[insert_num-k-1] */
+
+			dc = B_N_CHILD(S_new, 0);
+			put_dc_size(dc,
+				    (MAX_CHILD_SIZE
+				     (insert_ptr[insert_num - k - 1]) -
+				     B_FREE_SPACE(insert_ptr
+						  [insert_num - k - 1])));
+			put_dc_block_number(dc,
+					    insert_ptr[insert_num - k -
+						       1]->b_blocknr);
+
+			do_balance_mark_internal_dirty(tb, S_new, 0);
+
+			insert_num -= (k + 1);
+		}
+		/* new_insert_ptr = node_pointer to S_new */
+		new_insert_ptr = S_new;
+
+		RFALSE(!buffer_journaled(S_new) || buffer_journal_dirty(S_new)
+		       || buffer_dirty(S_new), "cm-00001: bad S_new (%b)",
+		       S_new);
+
+		// S_new is released in unfix_nodes
 	}
 
+	n = B_NR_ITEMS(tbSh);	/*number of items in S[h] */
 
-	memcpy (new_insert_key_addr,&new_insert_key,KEY_SIZE);
+	if (0 <= child_pos && child_pos <= n && insert_num > 0) {
+		bi.tb = tb;
+		bi.bi_bh = tbSh;
+		bi.bi_parent = PATH_H_PPARENT(tb->tb_path, h);
+		bi.bi_position = PATH_H_POSITION(tb->tb_path, h + 1);
+		internal_insert_childs(&bi,	/*tbSh, */
+				       /*          ( tb->S[h-1]->b_parent == tb->S[h] ) ? tb->S[h-1]->b_next :  tb->S[h]->b_child->b_next, */
+				       child_pos, insert_num, insert_key,
+				       insert_ptr);
+	}
+
+	memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE);
 	insert_ptr[0] = new_insert_ptr;
 
 	return order;
-    }
-
-  
-    
+}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 289d864fe731..1aaf2c7d44e6 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -18,107 +18,109 @@
 #include <linux/writeback.h>
 #include <linux/quotaops.h>
 
-extern int reiserfs_default_io_size; /* default io size devuned in super.c */
+extern int reiserfs_default_io_size;	/* default io size devuned in super.c */
 
 static int reiserfs_commit_write(struct file *f, struct page *page,
-                                 unsigned from, unsigned to);
+				 unsigned from, unsigned to);
 static int reiserfs_prepare_write(struct file *f, struct page *page,
 				  unsigned from, unsigned to);
 
-void reiserfs_delete_inode (struct inode * inode)
+void reiserfs_delete_inode(struct inode *inode)
 {
-    /* We need blocks for transaction + (user+group) quota update (possibly delete) */
-    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
-    struct reiserfs_transaction_handle th ;
-  
-    reiserfs_write_lock(inode->i_sb);
+	/* We need blocks for transaction + (user+group) quota update (possibly delete) */
+	int jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 2 +
+	    2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb);
+	struct reiserfs_transaction_handle th;
 
-    /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
-    if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */
-	down (&inode->i_sem); 
+	reiserfs_write_lock(inode->i_sb);
 
-	reiserfs_delete_xattrs (inode);
+	/* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */
+	if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) {	/* also handles bad_inode case */
+		down(&inode->i_sem);
 
-	if (journal_begin(&th, inode->i_sb, jbegin_count)) {
-	    up (&inode->i_sem);
-	    goto out;
-	}
-	reiserfs_update_inode_transaction(inode) ;
+		reiserfs_delete_xattrs(inode);
 
-	if (reiserfs_delete_object (&th, inode)) {
-	    up (&inode->i_sem);
-	    goto out;
-	}
+		if (journal_begin(&th, inode->i_sb, jbegin_count)) {
+			up(&inode->i_sem);
+			goto out;
+		}
+		reiserfs_update_inode_transaction(inode);
 
-	/* Do quota update inside a transaction for journaled quotas. We must do that
-	 * after delete_object so that quota updates go into the same transaction as
-	 * stat data deletion */
-	DQUOT_FREE_INODE(inode);
+		if (reiserfs_delete_object(&th, inode)) {
+			up(&inode->i_sem);
+			goto out;
+		}
 
-	if (journal_end(&th, inode->i_sb, jbegin_count)) {
-	    up (&inode->i_sem);
-	    goto out;
-	}
+		/* Do quota update inside a transaction for journaled quotas. We must do that
+		 * after delete_object so that quota updates go into the same transaction as
+		 * stat data deletion */
+		DQUOT_FREE_INODE(inode);
+
+		if (journal_end(&th, inode->i_sb, jbegin_count)) {
+			up(&inode->i_sem);
+			goto out;
+		}
 
-        up (&inode->i_sem);
+		up(&inode->i_sem);
 
-        /* all items of file are deleted, so we can remove "save" link */
-	remove_save_link (inode, 0/* not truncate */); /* we can't do anything
-                                                        * about an error here */
-    } else {
-	/* no object items are in the tree */
-	;
-    }
-out:
-    clear_inode (inode); /* note this must go after the journal_end to prevent deadlock */
-    inode->i_blocks = 0;
-    reiserfs_write_unlock(inode->i_sb);
+		/* all items of file are deleted, so we can remove "save" link */
+		remove_save_link(inode, 0 /* not truncate */ );	/* we can't do anything
+								 * about an error here */
+	} else {
+		/* no object items are in the tree */
+		;
+	}
+      out:
+	clear_inode(inode);	/* note this must go after the journal_end to prevent deadlock */
+	inode->i_blocks = 0;
+	reiserfs_write_unlock(inode->i_sb);
 }
 
-static void _make_cpu_key (struct cpu_key * key, int version, __u32 dirid, __u32 objectid, 
-	       loff_t offset, int type, int length )
+static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid,
+			  __u32 objectid, loff_t offset, int type, int length)
 {
-    key->version = version;
+	key->version = version;
 
-    key->on_disk_key.k_dir_id = dirid;
-    key->on_disk_key.k_objectid = objectid;
-    set_cpu_key_k_offset (key, offset);
-    set_cpu_key_k_type (key, type);  
-    key->key_length = length;
+	key->on_disk_key.k_dir_id = dirid;
+	key->on_disk_key.k_objectid = objectid;
+	set_cpu_key_k_offset(key, offset);
+	set_cpu_key_k_type(key, type);
+	key->key_length = length;
 }
 
-
 /* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set
    offset and type of key */
-void make_cpu_key (struct cpu_key * key, struct inode * inode, loff_t offset,
-	      int type, int length )
+void make_cpu_key(struct cpu_key *key, struct inode *inode, loff_t offset,
+		  int type, int length)
 {
-  _make_cpu_key (key, get_inode_item_key_version (inode), le32_to_cpu (INODE_PKEY (inode)->k_dir_id),
-		 le32_to_cpu (INODE_PKEY (inode)->k_objectid), 
-		 offset, type, length);
+	_make_cpu_key(key, get_inode_item_key_version(inode),
+		      le32_to_cpu(INODE_PKEY(inode)->k_dir_id),
+		      le32_to_cpu(INODE_PKEY(inode)->k_objectid), offset, type,
+		      length);
 }
 
-
 //
 // when key is 0, do not set version and short key
 //
-inline void make_le_item_head (struct item_head * ih, const struct cpu_key * key,
-			       int version,
-			       loff_t offset, int type, int length, 
-			       int entry_count/*or ih_free_space*/)
+inline void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
+			      int version,
+			      loff_t offset, int type, int length,
+			      int entry_count /*or ih_free_space */ )
 {
-    if (key) {
-	ih->ih_key.k_dir_id = cpu_to_le32 (key->on_disk_key.k_dir_id);
-	ih->ih_key.k_objectid = cpu_to_le32 (key->on_disk_key.k_objectid);
-    }
-    put_ih_version( ih, version );
-    set_le_ih_k_offset (ih, offset);
-    set_le_ih_k_type (ih, type);
-    put_ih_item_len( ih, length );
-    /*    set_ih_free_space (ih, 0);*/
-    // for directory items it is entry count, for directs and stat
-    // datas - 0xffff, for indirects - 0
-    put_ih_entry_count( ih, entry_count );
+	if (key) {
+		ih->ih_key.k_dir_id = cpu_to_le32(key->on_disk_key.k_dir_id);
+		ih->ih_key.k_objectid =
+		    cpu_to_le32(key->on_disk_key.k_objectid);
+	}
+	put_ih_version(ih, version);
+	set_le_ih_k_offset(ih, offset);
+	set_le_ih_k_type(ih, type);
+	put_ih_item_len(ih, length);
+	/*    set_ih_free_space (ih, 0); */
+	// for directory items it is entry count, for directs and stat
+	// datas - 0xffff, for indirects - 0
+	put_ih_entry_count(ih, entry_count);
 }
 
 //
@@ -153,84 +155,84 @@ inline void make_le_item_head (struct item_head * ih, const struct cpu_key * key
 ** to be unmapped, so that block_prepare_write will correctly call
 ** reiserfs_get_block to convert the tail into an unformatted node
 */
-static inline void fix_tail_page_for_writing(struct page *page) {
-    struct buffer_head *head, *next, *bh ;
-
-    if (page && page_has_buffers(page)) {
-	head = page_buffers(page) ;
-	bh = head ;
-	do {
-	    next = bh->b_this_page ;
-	    if (buffer_mapped(bh) && bh->b_blocknr == 0) {
-	        reiserfs_unmap_buffer(bh) ;
-	    }
-	    bh = next ;
-	} while (bh != head) ;
-    }
+static inline void fix_tail_page_for_writing(struct page *page)
+{
+	struct buffer_head *head, *next, *bh;
+
+	if (page && page_has_buffers(page)) {
+		head = page_buffers(page);
+		bh = head;
+		do {
+			next = bh->b_this_page;
+			if (buffer_mapped(bh) && bh->b_blocknr == 0) {
+				reiserfs_unmap_buffer(bh);
+			}
+			bh = next;
+		} while (bh != head);
+	}
 }
 
 /* reiserfs_get_block does not need to allocate a block only if it has been
    done already or non-hole position has been found in the indirect item */
-static inline int allocation_needed (int retval, b_blocknr_t allocated, 
-				     struct item_head * ih,
-				     __le32 * item, int pos_in_item)
+static inline int allocation_needed(int retval, b_blocknr_t allocated,
+				    struct item_head *ih,
+				    __le32 * item, int pos_in_item)
 {
-  if (allocated)
-	 return 0;
-  if (retval == POSITION_FOUND && is_indirect_le_ih (ih) && 
-      get_block_num(item, pos_in_item))
-	 return 0;
-  return 1;
+	if (allocated)
+		return 0;
+	if (retval == POSITION_FOUND && is_indirect_le_ih(ih) &&
+	    get_block_num(item, pos_in_item))
+		return 0;
+	return 1;
 }
 
-static inline int indirect_item_found (int retval, struct item_head * ih)
+static inline int indirect_item_found(int retval, struct item_head *ih)
 {
-  return (retval == POSITION_FOUND) && is_indirect_le_ih (ih);
+	return (retval == POSITION_FOUND) && is_indirect_le_ih(ih);
 }
 
-
-static inline void set_block_dev_mapped (struct buffer_head * bh, 
-					 b_blocknr_t block, struct inode * inode)
+static inline void set_block_dev_mapped(struct buffer_head *bh,
+					b_blocknr_t block, struct inode *inode)
 {
 	map_bh(bh, inode->i_sb, block);
 }
 
-
 //
 // files which were created in the earlier version can not be longer,
 // than 2 gb
 //
-static int file_capable (struct inode * inode, long block)
+static int file_capable(struct inode *inode, long block)
 {
-    if (get_inode_item_key_version (inode) != KEY_FORMAT_3_5 || // it is new file.
-	block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old file, but 'block' is inside of 2gb
-	return 1;
+	if (get_inode_item_key_version(inode) != KEY_FORMAT_3_5 ||	// it is new file.
+	    block < (1 << (31 - inode->i_sb->s_blocksize_bits)))	// old file, but 'block' is inside of 2gb
+		return 1;
 
-    return 0;
+	return 0;
 }
 
 /*static*/ int restart_transaction(struct reiserfs_transaction_handle *th,
-				struct inode *inode, struct path *path) {
-  struct super_block *s = th->t_super ;
-  int len = th->t_blocks_allocated ;
-  int err;
-
-  BUG_ON (!th->t_trans_id);
-  BUG_ON (!th->t_refcount);
-
-  /* we cannot restart while nested */
-  if (th->t_refcount > 1) {
-      return 0  ;
-  }
-  pathrelse(path) ;
-  reiserfs_update_sd(th, inode) ;
-  err = journal_end(th, s, len) ;
-  if (!err) {
-      err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6) ;
-      if (!err)
-        reiserfs_update_inode_transaction(inode) ;
-  }
-  return err;
+				   struct inode *inode, struct path *path)
+{
+	struct super_block *s = th->t_super;
+	int len = th->t_blocks_allocated;
+	int err;
+
+	BUG_ON(!th->t_trans_id);
+	BUG_ON(!th->t_refcount);
+
+	/* we cannot restart while nested */
+	if (th->t_refcount > 1) {
+		return 0;
+	}
+	pathrelse(path);
+	reiserfs_update_sd(th, inode);
+	err = journal_end(th, s, len);
+	if (!err) {
+		err = journal_begin(th, s, JOURNAL_PER_BALANCE_CNT * 6);
+		if (!err)
+			reiserfs_update_inode_transaction(inode);
+	}
+	return err;
 }
 
 // it is called by get_block when create == 0. Returns block number
@@ -241,190 +243,192 @@ static int file_capable (struct inode * inode, long block)
 // Please improve the english/clarity in the comment above, as it is
 // hard to understand.
 
-static int _get_block_create_0 (struct inode * inode, long block,
-				 struct buffer_head * bh_result,
-				 int args)
+static int _get_block_create_0(struct inode *inode, long block,
+			       struct buffer_head *bh_result, int args)
 {
-    INITIALIZE_PATH (path);
-    struct cpu_key key;
-    struct buffer_head * bh;
-    struct item_head * ih, tmp_ih;
-    int fs_gen ;
-    int blocknr;
-    char * p = NULL;
-    int chars;
-    int ret ;
-    int result ;
-    int done = 0 ;
-    unsigned long offset ;
-
-    // prepare the key to look for the 'block'-th block of file
-    make_cpu_key (&key, inode,
-		  (loff_t)block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 3);
-
-research:
-    result = search_for_position_by_key (inode->i_sb, &key, &path) ;
-    if (result != POSITION_FOUND) {
-	pathrelse (&path);
-        if (p)
-            kunmap(bh_result->b_page) ;
-	if (result == IO_ERROR)
-	    return -EIO;
-	// We do not return -ENOENT if there is a hole but page is uptodate, because it means
-	// That there is some MMAPED data associated with it that is yet to be written to disk.
-	if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) {
-	    return -ENOENT ;
-	}
-        return 0 ;
-    }
-    
-    //
-    bh = get_last_bh (&path);
-    ih = get_ih (&path);
-    if (is_indirect_le_ih (ih)) {
-	__le32 * ind_item = (__le32 *)B_I_PITEM (bh, ih);
-	
-	/* FIXME: here we could cache indirect item or part of it in
-	   the inode to avoid search_by_key in case of subsequent
-	   access to file */
-	blocknr = get_block_num(ind_item, path.pos_in_item) ;
-	ret = 0 ;
-	if (blocknr) {
-	    map_bh(bh_result, inode->i_sb, blocknr);
-	    if (path.pos_in_item == ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
-		set_buffer_boundary(bh_result);
-	    }
-	} else 
-	    // We do not return -ENOENT if there is a hole but page is uptodate, because it means
-	    // That there is some MMAPED data associated with it that is yet to  be written to disk.
-	    if ((args & GET_BLOCK_NO_HOLE) && !PageUptodate(bh_result->b_page) ) {
-	    ret = -ENOENT ;
-	    }
-
-	pathrelse (&path);
-        if (p)
-            kunmap(bh_result->b_page) ;
-	return ret ;
-    }
-
-    // requested data are in direct item(s)
-    if (!(args & GET_BLOCK_READ_DIRECT)) {
-	// we are called by bmap. FIXME: we can not map block of file
-	// when it is stored in direct item(s)
-	pathrelse (&path);	
-        if (p)
-            kunmap(bh_result->b_page) ;
-	return -ENOENT;
-    }
-
-    /* if we've got a direct item, and the buffer or page was uptodate,
-    ** we don't want to pull data off disk again.  skip to the
-    ** end, where we map the buffer and return
-    */
-    if (buffer_uptodate(bh_result)) {
-        goto finished ;
-    } else 
-	/*
-	** grab_tail_page can trigger calls to reiserfs_get_block on up to date
-	** pages without any buffers.  If the page is up to date, we don't want
-	** read old data off disk.  Set the up to date bit on the buffer instead
-	** and jump to the end
-	*/
-	    if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
+	INITIALIZE_PATH(path);
+	struct cpu_key key;
+	struct buffer_head *bh;
+	struct item_head *ih, tmp_ih;
+	int fs_gen;
+	int blocknr;
+	char *p = NULL;
+	int chars;
+	int ret;
+	int result;
+	int done = 0;
+	unsigned long offset;
+
+	// prepare the key to look for the 'block'-th block of file
+	make_cpu_key(&key, inode,
+		     (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
+		     3);
+
+      research:
+	result = search_for_position_by_key(inode->i_sb, &key, &path);
+	if (result != POSITION_FOUND) {
+		pathrelse(&path);
+		if (p)
+			kunmap(bh_result->b_page);
+		if (result == IO_ERROR)
+			return -EIO;
+		// We do not return -ENOENT if there is a hole but page is uptodate, because it means
+		// That there is some MMAPED data associated with it that is yet to be written to disk.
+		if ((args & GET_BLOCK_NO_HOLE)
+		    && !PageUptodate(bh_result->b_page)) {
+			return -ENOENT;
+		}
+		return 0;
+	}
+	//
+	bh = get_last_bh(&path);
+	ih = get_ih(&path);
+	if (is_indirect_le_ih(ih)) {
+		__le32 *ind_item = (__le32 *) B_I_PITEM(bh, ih);
+
+		/* FIXME: here we could cache indirect item or part of it in
+		   the inode to avoid search_by_key in case of subsequent
+		   access to file */
+		blocknr = get_block_num(ind_item, path.pos_in_item);
+		ret = 0;
+		if (blocknr) {
+			map_bh(bh_result, inode->i_sb, blocknr);
+			if (path.pos_in_item ==
+			    ((ih_item_len(ih) / UNFM_P_SIZE) - 1)) {
+				set_buffer_boundary(bh_result);
+			}
+		} else
+			// We do not return -ENOENT if there is a hole but page is uptodate, because it means
+			// That there is some MMAPED data associated with it that is yet to  be written to disk.
+		if ((args & GET_BLOCK_NO_HOLE)
+			    && !PageUptodate(bh_result->b_page)) {
+			ret = -ENOENT;
+		}
+
+		pathrelse(&path);
+		if (p)
+			kunmap(bh_result->b_page);
+		return ret;
+	}
+	// requested data are in direct item(s)
+	if (!(args & GET_BLOCK_READ_DIRECT)) {
+		// we are called by bmap. FIXME: we can not map block of file
+		// when it is stored in direct item(s)
+		pathrelse(&path);
+		if (p)
+			kunmap(bh_result->b_page);
+		return -ENOENT;
+	}
+
+	/* if we've got a direct item, and the buffer or page was uptodate,
+	 ** we don't want to pull data off disk again.  skip to the
+	 ** end, where we map the buffer and return
+	 */
+	if (buffer_uptodate(bh_result)) {
+		goto finished;
+	} else
+		/*
+		 ** grab_tail_page can trigger calls to reiserfs_get_block on up to date
+		 ** pages without any buffers.  If the page is up to date, we don't want
+		 ** read old data off disk.  Set the up to date bit on the buffer instead
+		 ** and jump to the end
+		 */
+	if (!bh_result->b_page || PageUptodate(bh_result->b_page)) {
 		set_buffer_uptodate(bh_result);
-		goto finished ;
-    }
-
-    // read file tail into part of page
-    offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1) ;
-    fs_gen = get_generation(inode->i_sb) ;
-    copy_item_head (&tmp_ih, ih);
-
-    /* we only want to kmap if we are reading the tail into the page.
-    ** this is not the common case, so we don't kmap until we are
-    ** sure we need to.  But, this means the item might move if
-    ** kmap schedules
-    */
-    if (!p) {
-	p = (char *)kmap(bh_result->b_page) ;
-	if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
-	    goto research;
-	}
-    }
-    p += offset ;
-    memset (p, 0, inode->i_sb->s_blocksize);
-    do {
-	if (!is_direct_le_ih (ih)) {
-	    BUG ();
-        }
-	/* make sure we don't read more bytes than actually exist in
-	** the file.  This can happen in odd cases where i_size isn't
-	** correct, and when direct item padding results in a few 
-	** extra bytes at the end of the direct item
-	*/
-        if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
-	    break ;
-	if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
-	    chars = inode->i_size - (le_ih_k_offset(ih) - 1) - path.pos_in_item;
-	    done = 1 ;
-	} else {
-	    chars = ih_item_len(ih) - path.pos_in_item;
-	}
-	memcpy (p, B_I_PITEM (bh, ih) + path.pos_in_item, chars);
-
-	if (done) 
-	    break ;
-
-	p += chars;
-
-	if (PATH_LAST_POSITION (&path) != (B_NR_ITEMS (bh) - 1))
-	    // we done, if read direct item is not the last item of
-	    // node FIXME: we could try to check right delimiting key
-	    // to see whether direct item continues in the right
-	    // neighbor or rely on i_size
-	    break;
-
-	// update key to look for the next piece
-	set_cpu_key_k_offset (&key, cpu_key_k_offset (&key) + chars);
-	result = search_for_position_by_key (inode->i_sb, &key, &path);
-	if (result != POSITION_FOUND)
-	    // i/o error most likely
-	    break;
-	bh = get_last_bh (&path);
-	ih = get_ih (&path);
-    } while (1);
-
-    flush_dcache_page(bh_result->b_page) ;
-    kunmap(bh_result->b_page) ;
-
-finished:
-    pathrelse (&path);
-
-    if (result == IO_ERROR)
-	return -EIO;
-
-    /* this buffer has valid data, but isn't valid for io.  mapping it to
-     * block #0 tells the rest of reiserfs it just has a tail in it
-     */
-    map_bh(bh_result, inode->i_sb, 0);
-    set_buffer_uptodate (bh_result);
-    return 0;
-}
+		goto finished;
+	}
+	// read file tail into part of page
+	offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
+	fs_gen = get_generation(inode->i_sb);
+	copy_item_head(&tmp_ih, ih);
+
+	/* we only want to kmap if we are reading the tail into the page.
+	 ** this is not the common case, so we don't kmap until we are
+	 ** sure we need to.  But, this means the item might move if
+	 ** kmap schedules
+	 */
+	if (!p) {
+		p = (char *)kmap(bh_result->b_page);
+		if (fs_changed(fs_gen, inode->i_sb)
+		    && item_moved(&tmp_ih, &path)) {
+			goto research;
+		}
+	}
+	p += offset;
+	memset(p, 0, inode->i_sb->s_blocksize);
+	do {
+		if (!is_direct_le_ih(ih)) {
+			BUG();
+		}
+		/* make sure we don't read more bytes than actually exist in
+		 ** the file.  This can happen in odd cases where i_size isn't
+		 ** correct, and when direct item padding results in a few 
+		 ** extra bytes at the end of the direct item
+		 */
+		if ((le_ih_k_offset(ih) + path.pos_in_item) > inode->i_size)
+			break;
+		if ((le_ih_k_offset(ih) - 1 + ih_item_len(ih)) > inode->i_size) {
+			chars =
+			    inode->i_size - (le_ih_k_offset(ih) - 1) -
+			    path.pos_in_item;
+			done = 1;
+		} else {
+			chars = ih_item_len(ih) - path.pos_in_item;
+		}
+		memcpy(p, B_I_PITEM(bh, ih) + path.pos_in_item, chars);
+
+		if (done)
+			break;
+
+		p += chars;
+
+		if (PATH_LAST_POSITION(&path) != (B_NR_ITEMS(bh) - 1))
+			// we done, if read direct item is not the last item of
+			// node FIXME: we could try to check right delimiting key
+			// to see whether direct item continues in the right
+			// neighbor or rely on i_size
+			break;
+
+		// update key to look for the next piece
+		set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + chars);
+		result = search_for_position_by_key(inode->i_sb, &key, &path);
+		if (result != POSITION_FOUND)
+			// i/o error most likely
+			break;
+		bh = get_last_bh(&path);
+		ih = get_ih(&path);
+	} while (1);
+
+	flush_dcache_page(bh_result->b_page);
+	kunmap(bh_result->b_page);
+
+      finished:
+	pathrelse(&path);
+
+	if (result == IO_ERROR)
+		return -EIO;
 
+	/* this buffer has valid data, but isn't valid for io.  mapping it to
+	 * block #0 tells the rest of reiserfs it just has a tail in it
+	 */
+	map_bh(bh_result, inode->i_sb, 0);
+	set_buffer_uptodate(bh_result);
+	return 0;
+}
 
 // this is called to create file map. So, _get_block_create_0 will not
 // read direct item
-static int reiserfs_bmap (struct inode * inode, sector_t block,
-			  struct buffer_head * bh_result, int create)
+static int reiserfs_bmap(struct inode *inode, sector_t block,
+			 struct buffer_head *bh_result, int create)
 {
-    if (!file_capable (inode, block))
-	return -EFBIG;
-
-    reiserfs_write_lock(inode->i_sb);
-    /* do not read the direct item */
-    _get_block_create_0 (inode, block, bh_result, 0) ;
-    reiserfs_write_unlock(inode->i_sb);
-    return 0;
+	if (!file_capable(inode, block))
+		return -EFBIG;
+
+	reiserfs_write_lock(inode->i_sb);
+	/* do not read the direct item */
+	_get_block_create_0(inode, block, bh_result, 0);
+	reiserfs_write_unlock(inode->i_sb);
+	return 0;
 }
 
 /* special version of get_block that is only used by grab_tail_page right
@@ -444,9 +448,11 @@ static int reiserfs_bmap (struct inode * inode, sector_t block,
 ** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, 
 ** don't use this function.
 */
-static int reiserfs_get_block_create_0 (struct inode * inode, sector_t block,
-			struct buffer_head * bh_result, int create) {
-    return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE) ;
+static int reiserfs_get_block_create_0(struct inode *inode, sector_t block,
+				       struct buffer_head *bh_result,
+				       int create)
+{
+	return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE);
 }
 
 /* This is special helper for reiserfs_get_block in case we are executing
@@ -457,43 +463,42 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
 					 struct buffer_head *bh_result,
 					 int create)
 {
-    int ret ;
-
-    bh_result->b_page = NULL;
-
-    /* We set the b_size before reiserfs_get_block call since it is
-       referenced in convert_tail_for_hole() that may be called from
-       reiserfs_get_block() */
-    bh_result->b_size = (1 << inode->i_blkbits);
-
-    ret = reiserfs_get_block(inode, iblock, bh_result,
-                             create | GET_BLOCK_NO_DANGLE) ;
-    if (ret)
-        goto out;
-
-    /* don't allow direct io onto tail pages */
-    if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
-        /* make sure future calls to the direct io funcs for this offset
-        ** in the file fail by unmapping the buffer
-        */
-        clear_buffer_mapped(bh_result);
-        ret = -EINVAL ;
-    }
-    /* Possible unpacked tail. Flush the data before pages have
-       disappeared */
-    if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
-        int err;
-        lock_kernel();
-        err = reiserfs_commit_for_inode(inode);
-        REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
-        unlock_kernel();
-        if (err < 0)
-            ret = err;
-    }
-out:
-    return ret ;
-}
+	int ret;
+
+	bh_result->b_page = NULL;
 
+	/* We set the b_size before reiserfs_get_block call since it is
+	   referenced in convert_tail_for_hole() that may be called from
+	   reiserfs_get_block() */
+	bh_result->b_size = (1 << inode->i_blkbits);
+
+	ret = reiserfs_get_block(inode, iblock, bh_result,
+				 create | GET_BLOCK_NO_DANGLE);
+	if (ret)
+		goto out;
+
+	/* don't allow direct io onto tail pages */
+	if (buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
+		/* make sure future calls to the direct io funcs for this offset
+		 ** in the file fail by unmapping the buffer
+		 */
+		clear_buffer_mapped(bh_result);
+		ret = -EINVAL;
+	}
+	/* Possible unpacked tail. Flush the data before pages have
+	   disappeared */
+	if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
+		int err;
+		lock_kernel();
+		err = reiserfs_commit_for_inode(inode);
+		REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
+		unlock_kernel();
+		if (err < 0)
+			ret = err;
+	}
+      out:
+	return ret;
+}
 
 /*
 ** helper function for when reiserfs_get_block is called for a hole
@@ -505,490 +510,547 @@ out:
 ** you should not be in a transaction, or have any paths held when you
 ** call this.
 */
-static int convert_tail_for_hole(struct inode *inode, 
-                                 struct buffer_head *bh_result,
-				 loff_t tail_offset) {
-    unsigned long index ;
-    unsigned long tail_end ; 
-    unsigned long tail_start ;
-    struct page * tail_page ;
-    struct page * hole_page = bh_result->b_page ;
-    int retval = 0 ;
-
-    if ((tail_offset & (bh_result->b_size - 1)) != 1) 
-        return -EIO ;
-
-    /* always try to read until the end of the block */
-    tail_start = tail_offset & (PAGE_CACHE_SIZE - 1) ;
-    tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ;
-
-    index = tail_offset >> PAGE_CACHE_SHIFT ;
-    /* hole_page can be zero in case of direct_io, we are sure
-       that we cannot get here if we write with O_DIRECT into
-       tail page */
-    if (!hole_page || index != hole_page->index) {
-	tail_page = grab_cache_page(inode->i_mapping, index) ;
-	retval = -ENOMEM;
-	if (!tail_page) {
-	    goto out ;
-	}
-    } else {
-        tail_page = hole_page ;
-    }
-
-    /* we don't have to make sure the conversion did not happen while
-    ** we were locking the page because anyone that could convert
-    ** must first take i_sem.
-    **
-    ** We must fix the tail page for writing because it might have buffers
-    ** that are mapped, but have a block number of 0.  This indicates tail
-    ** data that has been read directly into the page, and block_prepare_write
-    ** won't trigger a get_block in this case.
-    */
-    fix_tail_page_for_writing(tail_page) ;
-    retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
-    if (retval)
-        goto unlock ;
-
-    /* tail conversion might change the data in the page */
-    flush_dcache_page(tail_page) ;
-
-    retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end) ;
-
-unlock:
-    if (tail_page != hole_page) {
-        unlock_page(tail_page) ;
-	page_cache_release(tail_page) ;
-    }
-out:
-    return retval ;
+static int convert_tail_for_hole(struct inode *inode,
+				 struct buffer_head *bh_result,
+				 loff_t tail_offset)
+{
+	unsigned long index;
+	unsigned long tail_end;
+	unsigned long tail_start;
+	struct page *tail_page;
+	struct page *hole_page = bh_result->b_page;
+	int retval = 0;
+
+	if ((tail_offset & (bh_result->b_size - 1)) != 1)
+		return -EIO;
+
+	/* always try to read until the end of the block */
+	tail_start = tail_offset & (PAGE_CACHE_SIZE - 1);
+	tail_end = (tail_start | (bh_result->b_size - 1)) + 1;
+
+	index = tail_offset >> PAGE_CACHE_SHIFT;
+	/* hole_page can be NULL in case of direct_io; we are sure
+	   that we cannot get here if we write with O_DIRECT into
+	   a tail page */
+	if (!hole_page || index != hole_page->index) {
+		tail_page = grab_cache_page(inode->i_mapping, index);
+		retval = -ENOMEM;
+		if (!tail_page) {
+			goto out;
+		}
+	} else {
+		tail_page = hole_page;
+	}
+
+	/* we don't have to make sure the conversion did not happen while
+	 ** we were locking the page because anyone that could convert
+	 ** must first take i_sem.
+	 **
+	 ** We must fix the tail page for writing because it might have buffers
+	 ** that are mapped, but have a block number of 0.  This indicates tail
+	 ** data that has been read directly into the page, and block_prepare_write
+	 ** won't trigger a get_block in this case.
+	 */
+	fix_tail_page_for_writing(tail_page);
+	retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
+	if (retval)
+		goto unlock;
+
+	/* tail conversion might change the data in the page */
+	flush_dcache_page(tail_page);
+
+	retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end);
+
+      unlock:
+	if (tail_page != hole_page) {
+		unlock_page(tail_page);
+		page_cache_release(tail_page);
+	}
+      out:
+	return retval;
 }
 
 static inline int _allocate_block(struct reiserfs_transaction_handle *th,
-			   long block,
-                           struct inode *inode, 
-			   b_blocknr_t *allocated_block_nr, 
-			   struct path * path,
-			   int flags) {
-    BUG_ON (!th->t_trans_id);
-  
+				  long block,
+				  struct inode *inode,
+				  b_blocknr_t * allocated_block_nr,
+				  struct path *path, int flags)
+{
+	BUG_ON(!th->t_trans_id);
+
 #ifdef REISERFS_PREALLOCATE
-    if (!(flags & GET_BLOCK_NO_ISEM)) {
-	return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr, path, block);
-    }
+	if (!(flags & GET_BLOCK_NO_ISEM)) {
+		return reiserfs_new_unf_blocknrs2(th, inode, allocated_block_nr,
+						  path, block);
+	}
 #endif
-    return reiserfs_new_unf_blocknrs (th, inode, allocated_block_nr, path, block);
+	return reiserfs_new_unf_blocknrs(th, inode, allocated_block_nr, path,
+					 block);
 }
 
-int reiserfs_get_block (struct inode * inode, sector_t block,
-			struct buffer_head * bh_result, int create)
+int reiserfs_get_block(struct inode *inode, sector_t block,
+		       struct buffer_head *bh_result, int create)
 {
-    int repeat, retval = 0;
-    b_blocknr_t allocated_block_nr = 0;// b_blocknr_t is (unsigned) 32 bit int
-    INITIALIZE_PATH(path);
-    int pos_in_item;
-    struct cpu_key key;
-    struct buffer_head * bh, * unbh = NULL;
-    struct item_head * ih, tmp_ih;
-    __le32 * item;
-    int done;
-    int fs_gen;
-    struct reiserfs_transaction_handle *th = NULL;
-    /* space reserved in transaction batch: 
-        . 3 balancings in direct->indirect conversion
-        . 1 block involved into reiserfs_update_sd()
-       XXX in practically impossible worst case direct2indirect()
-       can incur (much) more than 3 balancings.
-       quota update for user, group */
-    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
-    int version;
-    int dangle = 1;
-    loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ;
-
-				/* bad.... */
-    reiserfs_write_lock(inode->i_sb);
-    version = get_inode_item_key_version (inode);
-
-    if (block < 0) {
-	reiserfs_write_unlock(inode->i_sb);
-	return -EIO;
-    }
+	int repeat, retval = 0;
+	b_blocknr_t allocated_block_nr = 0;	// b_blocknr_t is (unsigned) 32 bit int
+	INITIALIZE_PATH(path);
+	int pos_in_item;
+	struct cpu_key key;
+	struct buffer_head *bh, *unbh = NULL;
+	struct item_head *ih, tmp_ih;
+	__le32 *item;
+	int done;
+	int fs_gen;
+	struct reiserfs_transaction_handle *th = NULL;
+	/* space reserved in transaction batch:
+	   . 3 balancings in direct->indirect conversion
+	   . 1 block involved in reiserfs_update_sd()
+	   XXX in the practically impossible worst case direct2indirect()
+	   can incur (much) more than 3 balancings.
+	   . quota update for user, group */
+	int jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 3 + 1 +
+	    2 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb);
+	int version;
+	int dangle = 1;
+	loff_t new_offset =
+	    (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
+
+	/* bad.... */
+	reiserfs_write_lock(inode->i_sb);
+	version = get_inode_item_key_version(inode);
 
-    if (!file_capable (inode, block)) {
-	reiserfs_write_unlock(inode->i_sb);
-	return -EFBIG;
-    }
-
-    /* if !create, we aren't changing the FS, so we don't need to
-    ** log anything, so we don't need to start a transaction
-    */
-    if (!(create & GET_BLOCK_CREATE)) {
-	int ret ;
-	/* find number of block-th logical block of the file */
-	ret = _get_block_create_0 (inode, block, bh_result, 
-	                           create | GET_BLOCK_READ_DIRECT) ;
-	reiserfs_write_unlock(inode->i_sb);
-	return ret;
-    }
-    /*
-     * if we're already in a transaction, make sure to close
-     * any new transactions we start in this func
-     */
-    if ((create & GET_BLOCK_NO_DANGLE) ||
-        reiserfs_transaction_running(inode->i_sb))
-        dangle = 0;
-
-    /* If file is of such a size, that it might have a tail and tails are enabled
-    ** we should mark it as possibly needing tail packing on close
-    */
-    if ( (have_large_tails (inode->i_sb) && inode->i_size < i_block_size (inode)*4) ||
-	 (have_small_tails (inode->i_sb) && inode->i_size < i_block_size(inode)) )
-	REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
-
-    /* set the key of the first byte in the 'block'-th block of file */
-    make_cpu_key (&key, inode, new_offset,
-		  TYPE_ANY, 3/*key length*/);
-    if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
-start_trans:
-	th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
-	if (!th) {
-	    retval = -ENOMEM;
-	    goto failure;
-	}
-	reiserfs_update_inode_transaction(inode) ;
-    }
- research:
-
-    retval = search_for_position_by_key (inode->i_sb, &key, &path);
-    if (retval == IO_ERROR) {
-	retval = -EIO;
-	goto failure;
-    }
-	
-    bh = get_last_bh (&path);
-    ih = get_ih (&path);
-    item = get_item (&path);
-    pos_in_item = path.pos_in_item;
-
-    fs_gen = get_generation (inode->i_sb);
-    copy_item_head (&tmp_ih, ih);
-
-    if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) {
-	/* we have to allocate block for the unformatted node */
-	if (!th) {
-	    pathrelse(&path) ;
-	    goto start_trans;
-	}
-
-	repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create);
-
-	if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
-	    /* restart the transaction to give the journal a chance to free
-	    ** some blocks.  releases the path, so we have to go back to
-	    ** research if we succeed on the second try
-	    */
-	    SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
-	    retval = restart_transaction(th, inode, &path) ;
-            if (retval)
-                goto failure;
-	    repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create);
-
-	    if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
-		goto research ;
-	    }
-	    if (repeat == QUOTA_EXCEEDED)
-		retval = -EDQUOT;
-	    else
-		retval = -ENOSPC;
-	    goto failure;
-	}
-
-	if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
-	    goto research;
-	}
-    }
-
-    if (indirect_item_found (retval, ih)) {
-        b_blocknr_t unfm_ptr;
-	/* 'block'-th block is in the file already (there is
-	   corresponding cell in some indirect item). But it may be
-	   zero unformatted node pointer (hole) */
-        unfm_ptr = get_block_num (item, pos_in_item);
-	if (unfm_ptr == 0) {
-	    /* use allocated block to plug the hole */
-	    reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
-	    if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
-		reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
-		goto research;
-	    }
-	    set_buffer_new(bh_result);
-	    if (buffer_dirty(bh_result) && reiserfs_data_ordered(inode->i_sb))
-	    	reiserfs_add_ordered_list(inode, bh_result);
-	    put_block_num(item, pos_in_item, allocated_block_nr) ;
-            unfm_ptr = allocated_block_nr;
-	    journal_mark_dirty (th, inode->i_sb, bh);
-	    reiserfs_update_sd(th, inode) ;
-	}
-	set_block_dev_mapped(bh_result, unfm_ptr, inode);
-	pathrelse (&path);
-        retval = 0;
-	if (!dangle && th)
-	    retval = reiserfs_end_persistent_transaction(th);
+	if (block < 0) {
+		reiserfs_write_unlock(inode->i_sb);
+		return -EIO;
+	}
 
-	reiserfs_write_unlock(inode->i_sb);
-	 
-	/* the item was found, so new blocks were not added to the file
-	** there is no need to make sure the inode is updated with this 
-	** transaction
-	*/
-	return retval;
-    }
-
-    if (!th) {
-	pathrelse(&path) ;
-	goto start_trans;
-    }
-
-    /* desired position is not found or is in the direct item. We have
-       to append file with holes up to 'block'-th block converting
-       direct items to indirect one if necessary */
-    done = 0;
-    do {
-	if (is_statdata_le_ih (ih)) {
-	    __le32 unp = 0;
-	    struct cpu_key tmp_key;
-
-	    /* indirect item has to be inserted */
-	    make_le_item_head (&tmp_ih, &key, version, 1, TYPE_INDIRECT, 
-			       UNFM_P_SIZE, 0/* free_space */);
-
-	    if (cpu_key_k_offset (&key) == 1) {
-		/* we are going to add 'block'-th block to the file. Use
-		   allocated block for that */
-		unp = cpu_to_le32 (allocated_block_nr);
-		set_block_dev_mapped (bh_result, allocated_block_nr, inode);
-		set_buffer_new(bh_result);
-		done = 1;
-	    }
-	    tmp_key = key; // ;)
-	    set_cpu_key_k_offset (&tmp_key, 1);
-	    PATH_LAST_POSITION(&path) ++;
-
-	    retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, inode, (char *)&unp);
-	    if (retval) {
-		reiserfs_free_block (th, inode, allocated_block_nr, 1);
-		goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
-	    }
-	    //mark_tail_converted (inode);
-	} else if (is_direct_le_ih (ih)) {
-	    /* direct item has to be converted */
-	    loff_t tail_offset;
-
-	    tail_offset = ((le_ih_k_offset (ih) - 1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
-	    if (tail_offset == cpu_key_k_offset (&key)) {
-		/* direct item we just found fits into block we have
-                   to map. Convert it into unformatted node: use
-                   bh_result for the conversion */
-		set_block_dev_mapped (bh_result, allocated_block_nr, inode);
-		unbh = bh_result;
-		done = 1;
-	    } else {
-		/* we have to padd file tail stored in direct item(s)
-		   up to block size and convert it to unformatted
-		   node. FIXME: this should also get into page cache */
-
-		pathrelse(&path) ;
-		/*
-		 * ugly, but we can only end the transaction if
-		 * we aren't nested
-		 */
-		BUG_ON (!th->t_refcount);
-		if (th->t_refcount == 1) {
-		    retval = reiserfs_end_persistent_transaction(th);
-		    th = NULL;
-		    if (retval)
+	if (!file_capable(inode, block)) {
+		reiserfs_write_unlock(inode->i_sb);
+		return -EFBIG;
+	}
+
+	/* if !create, we aren't changing the FS, so we don't need to
+	 ** log anything, so we don't need to start a transaction
+	 */
+	if (!(create & GET_BLOCK_CREATE)) {
+		int ret;
+		/* find number of block-th logical block of the file */
+		ret = _get_block_create_0(inode, block, bh_result,
+					  create | GET_BLOCK_READ_DIRECT);
+		reiserfs_write_unlock(inode->i_sb);
+		return ret;
+	}
+	/*
+	 * if we're already in a transaction, make sure to close
+	 * any new transactions we start in this func
+	 */
+	if ((create & GET_BLOCK_NO_DANGLE) ||
+	    reiserfs_transaction_running(inode->i_sb))
+		dangle = 0;
+
+	/* If the file is small enough that it might have a tail, and tails are
+	 ** enabled, mark it as possibly needing tail packing on close
+	 */
+	if ((have_large_tails(inode->i_sb)
+	     && inode->i_size < i_block_size(inode) * 4)
+	    || (have_small_tails(inode->i_sb)
+		&& inode->i_size < i_block_size(inode)))
+		REISERFS_I(inode)->i_flags |= i_pack_on_close_mask;
+
+	/* set the key of the first byte in the 'block'-th block of file */
+	make_cpu_key(&key, inode, new_offset, TYPE_ANY, 3 /*key length */ );
+	if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
+	      start_trans:
+		th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
+		if (!th) {
+			retval = -ENOMEM;
 			goto failure;
 		}
+		reiserfs_update_inode_transaction(inode);
+	}
+      research:
 
-		retval = convert_tail_for_hole(inode, bh_result, tail_offset) ;
-		if (retval) {
-		    if ( retval != -ENOSPC )
-			reiserfs_warning (inode->i_sb, "clm-6004: convert tail failed inode %lu, error %d", inode->i_ino, retval) ;
-		    if (allocated_block_nr) {
-			/* the bitmap, the super, and the stat data == 3 */
-			if (!th)
-			    th = reiserfs_persistent_transaction(inode->i_sb,3);
-			if (th)
-			    reiserfs_free_block (th,inode,allocated_block_nr,1);
-		    }
-		    goto failure ;
-		}
-		goto research ;
-	    }
-	    retval = direct2indirect (th, inode, &path, unbh, tail_offset);
-	    if (retval) {
-		reiserfs_unmap_buffer(unbh);
-		reiserfs_free_block (th, inode, allocated_block_nr, 1);
-		goto failure;
-	    }
-	    /* it is important the set_buffer_uptodate is done after
-	    ** the direct2indirect.  The buffer might contain valid
-	    ** data newer than the data on disk (read by readpage, changed,
-	    ** and then sent here by writepage).  direct2indirect needs
-	    ** to know if unbh was already up to date, so it can decide
-	    ** if the data in unbh needs to be replaced with data from
-	    ** the disk
-	    */
-	    set_buffer_uptodate (unbh);
-
-	    /* unbh->b_page == NULL in case of DIRECT_IO request, this means
-	       buffer will disappear shortly, so it should not be added to
-	     */
-	    if ( unbh->b_page ) {
-		/* we've converted the tail, so we must
-		** flush unbh before the transaction commits
-		*/
-		reiserfs_add_tail_list(inode, unbh) ;
-
-		/* mark it dirty now to prevent commit_write from adding
-		** this buffer to the inode's dirty buffer list
-		*/
-		/*
-		 * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
-		 * It's still atomic, but it sets the page dirty too,
-		 * which makes it eligible for writeback at any time by the
-		 * VM (which was also the case with __mark_buffer_dirty())
-		 */
-		mark_buffer_dirty(unbh) ;
-	    }
-	} else {
-	    /* append indirect item with holes if needed, when appending
-	       pointer to 'block'-th block use block, which is already
-	       allocated */
-	    struct cpu_key tmp_key;
-	    unp_t unf_single=0; // We use this in case we need to allocate only
-				// one block which is a fastpath
-	    unp_t *un;
-	    __u64 max_to_insert=MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE;
-	    __u64 blocks_needed;
-
-	    RFALSE( pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
-		    "vs-804: invalid position for append");
-	    /* indirect item has to be appended, set up key of that position */
-	    make_cpu_key (&tmp_key, inode,
-			  le_key_k_offset (version, &(ih->ih_key)) + op_bytes_number (ih, inode->i_sb->s_blocksize),
-			  //pos_in_item * inode->i_sb->s_blocksize,
-			  TYPE_INDIRECT, 3);// key type is unimportant
-
-	    blocks_needed = 1 + ((cpu_key_k_offset (&key) - cpu_key_k_offset (&tmp_key)) >> inode->i_sb->s_blocksize_bits);
-	    RFALSE( blocks_needed < 0, "green-805: invalid offset");
-
-	    if ( blocks_needed == 1 ) {
-		un = &unf_single;
-	    } else {
-		un=kmalloc( min(blocks_needed,max_to_insert)*UNFM_P_SIZE,
-			    GFP_ATOMIC); // We need to avoid scheduling.
-		if ( !un) {
-		    un = &unf_single;
-		    blocks_needed = 1;
-		    max_to_insert = 0;
-		} else
-		    memset(un, 0, UNFM_P_SIZE * min(blocks_needed,max_to_insert));
-	    }
-	    if ( blocks_needed <= max_to_insert) {
-		/* we are going to add target block to the file. Use allocated
-		   block for that */
-		un[blocks_needed-1] = cpu_to_le32 (allocated_block_nr);
-		set_block_dev_mapped (bh_result, allocated_block_nr, inode);
-		set_buffer_new(bh_result);
-		done = 1;
-	    } else {
-		/* paste hole to the indirect item */
-		/* If kmalloc failed, max_to_insert becomes zero and it means we
-		   only have space for one block */
-		blocks_needed=max_to_insert?max_to_insert:1;
-	    }
-	    retval = reiserfs_paste_into_item (th, &path, &tmp_key, inode, (char *)un, UNFM_P_SIZE * blocks_needed);
-
-	    if (blocks_needed != 1)
-		kfree(un);
-
-	    if (retval) {
-		reiserfs_free_block (th, inode, allocated_block_nr, 1);
-		goto failure;
-	    }
-	    if (!done) {
-		/* We need to mark new file size in case this function will be
-		   interrupted/aborted later on. And we may do this only for
-		   holes. */
-		inode->i_size += inode->i_sb->s_blocksize * blocks_needed;
-	    }
-	}
-
-	if (done == 1)
-	    break;
-
-	/* this loop could log more blocks than we had originally asked
-	** for.  So, we have to allow the transaction to end if it is
-	** too big or too full.  Update the inode so things are 
-	** consistent if we crash before the function returns
-	**
-	** release the path so that anybody waiting on the path before
-	** ending their transaction will be able to continue.
-	*/
-	if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
-	  retval = restart_transaction(th, inode, &path) ;
-	  if (retval)
-	    goto failure;
-	}
-	/* inserting indirect pointers for a hole can take a 
-	** long time.  reschedule if needed
-	*/
-	cond_resched();
-
-	retval = search_for_position_by_key (inode->i_sb, &key, &path);
+	retval = search_for_position_by_key(inode->i_sb, &key, &path);
 	if (retval == IO_ERROR) {
-	    retval = -EIO;
-	    goto failure;
-	}
-	if (retval == POSITION_FOUND) {
-	    reiserfs_warning (inode->i_sb, "vs-825: reiserfs_get_block: "
-			      "%K should not be found", &key);
-	    retval = -EEXIST;
-	    if (allocated_block_nr)
-	        reiserfs_free_block (th, inode, allocated_block_nr, 1);
-	    pathrelse(&path) ;
-	    goto failure;
-	}
-	bh = get_last_bh (&path);
-	ih = get_ih (&path);
-	item = get_item (&path);
+		retval = -EIO;
+		goto failure;
+	}
+
+	bh = get_last_bh(&path);
+	ih = get_ih(&path);
+	item = get_item(&path);
 	pos_in_item = path.pos_in_item;
-    } while (1);
 
+	fs_gen = get_generation(inode->i_sb);
+	copy_item_head(&tmp_ih, ih);
+
+	if (allocation_needed
+	    (retval, allocated_block_nr, ih, item, pos_in_item)) {
+		/* we have to allocate block for the unformatted node */
+		if (!th) {
+			pathrelse(&path);
+			goto start_trans;
+		}
+
+		repeat =
+		    _allocate_block(th, block, inode, &allocated_block_nr,
+				    &path, create);
+
+		if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) {
+			/* restart the transaction to give the journal a chance to free
+			 ** some blocks.  releases the path, so we have to go back to
+			 ** research if we succeed on the second try
+			 */
+			SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
+			retval = restart_transaction(th, inode, &path);
+			if (retval)
+				goto failure;
+			repeat =
+			    _allocate_block(th, block, inode,
+					    &allocated_block_nr, NULL, create);
+
+			if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) {
+				goto research;
+			}
+			if (repeat == QUOTA_EXCEEDED)
+				retval = -EDQUOT;
+			else
+				retval = -ENOSPC;
+			goto failure;
+		}
+
+		if (fs_changed(fs_gen, inode->i_sb)
+		    && item_moved(&tmp_ih, &path)) {
+			goto research;
+		}
+	}
+
+	if (indirect_item_found(retval, ih)) {
+		b_blocknr_t unfm_ptr;
+		/* 'block'-th block is in the file already (there is
+		   corresponding cell in some indirect item). But it may be
+		   zero unformatted node pointer (hole) */
+		unfm_ptr = get_block_num(item, pos_in_item);
+		if (unfm_ptr == 0) {
+			/* use allocated block to plug the hole */
+			reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
+			if (fs_changed(fs_gen, inode->i_sb)
+			    && item_moved(&tmp_ih, &path)) {
+				reiserfs_restore_prepared_buffer(inode->i_sb,
+								 bh);
+				goto research;
+			}
+			set_buffer_new(bh_result);
+			if (buffer_dirty(bh_result)
+			    && reiserfs_data_ordered(inode->i_sb))
+				reiserfs_add_ordered_list(inode, bh_result);
+			put_block_num(item, pos_in_item, allocated_block_nr);
+			unfm_ptr = allocated_block_nr;
+			journal_mark_dirty(th, inode->i_sb, bh);
+			reiserfs_update_sd(th, inode);
+		}
+		set_block_dev_mapped(bh_result, unfm_ptr, inode);
+		pathrelse(&path);
+		retval = 0;
+		if (!dangle && th)
+			retval = reiserfs_end_persistent_transaction(th);
+
+		reiserfs_write_unlock(inode->i_sb);
+
+		/* the item was found, so new blocks were not added to the file;
+		 ** there is no need to make sure the inode is updated with this
+		 ** transaction
+		 */
+		return retval;
+	}
+
+	if (!th) {
+		pathrelse(&path);
+		goto start_trans;
+	}
+
+	/* desired position is not found or is in the direct item. We have
+	   to append file with holes up to 'block'-th block converting
+	   direct items to indirect one if necessary */
+	done = 0;
+	do {
+		if (is_statdata_le_ih(ih)) {
+			__le32 unp = 0;
+			struct cpu_key tmp_key;
+
+			/* indirect item has to be inserted */
+			make_le_item_head(&tmp_ih, &key, version, 1,
+					  TYPE_INDIRECT, UNFM_P_SIZE,
+					  0 /* free_space */ );
+
+			if (cpu_key_k_offset(&key) == 1) {
+				/* we are going to add 'block'-th block to the file. Use
+				   allocated block for that */
+				unp = cpu_to_le32(allocated_block_nr);
+				set_block_dev_mapped(bh_result,
+						     allocated_block_nr, inode);
+				set_buffer_new(bh_result);
+				done = 1;
+			}
+			tmp_key = key;	// ;)
+			set_cpu_key_k_offset(&tmp_key, 1);
+			PATH_LAST_POSITION(&path)++;
+
+			retval =
+			    reiserfs_insert_item(th, &path, &tmp_key, &tmp_ih,
+						 inode, (char *)&unp);
+			if (retval) {
+				reiserfs_free_block(th, inode,
+						    allocated_block_nr, 1);
+				goto failure;	// retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST
+			}
+			//mark_tail_converted (inode);
+		} else if (is_direct_le_ih(ih)) {
+			/* direct item has to be converted */
+			loff_t tail_offset;
+
+			tail_offset =
+			    ((le_ih_k_offset(ih) -
+			      1) & ~(inode->i_sb->s_blocksize - 1)) + 1;
+			if (tail_offset == cpu_key_k_offset(&key)) {
+				/* direct item we just found fits into block we have
+				   to map. Convert it into unformatted node: use
+				   bh_result for the conversion */
+				set_block_dev_mapped(bh_result,
+						     allocated_block_nr, inode);
+				unbh = bh_result;
+				done = 1;
+			} else {
+				/* we have to pad file tail stored in direct item(s)
+				   up to block size and convert it to unformatted
+				   node. FIXME: this should also get into page cache */
+
+				pathrelse(&path);
+				/*
+				 * ugly, but we can only end the transaction if
+				 * we aren't nested
+				 */
+				BUG_ON(!th->t_refcount);
+				if (th->t_refcount == 1) {
+					retval =
+					    reiserfs_end_persistent_transaction
+					    (th);
+					th = NULL;
+					if (retval)
+						goto failure;
+				}
+
+				retval =
+				    convert_tail_for_hole(inode, bh_result,
+							  tail_offset);
+				if (retval) {
+					if (retval != -ENOSPC)
+						reiserfs_warning(inode->i_sb,
+								 "clm-6004: convert tail failed inode %lu, error %d",
+								 inode->i_ino,
+								 retval);
+					if (allocated_block_nr) {
+						/* the bitmap, the super, and the stat data == 3 */
+						if (!th)
+							th = reiserfs_persistent_transaction(inode->i_sb, 3);
+						if (th)
+							reiserfs_free_block(th,
+									    inode,
+									    allocated_block_nr,
+									    1);
+					}
+					goto failure;
+				}
+				goto research;
+			}
+			retval =
+			    direct2indirect(th, inode, &path, unbh,
+					    tail_offset);
+			if (retval) {
+				reiserfs_unmap_buffer(unbh);
+				reiserfs_free_block(th, inode,
+						    allocated_block_nr, 1);
+				goto failure;
+			}
+			/* it is important the set_buffer_uptodate is done after
+			 ** the direct2indirect.  The buffer might contain valid
+			 ** data newer than the data on disk (read by readpage, changed,
+			 ** and then sent here by writepage).  direct2indirect needs
+			 ** to know if unbh was already up to date, so it can decide
+			 ** if the data in unbh needs to be replaced with data from
+			 ** the disk
+			 */
+			set_buffer_uptodate(unbh);
+
+			/* unbh->b_page == NULL in case of DIRECT_IO request, this means
+			   buffer will disappear shortly, so it should not be added to
+			   the tail list or marked dirty below */
+			if (unbh->b_page) {
+				/* we've converted the tail, so we must
+				 ** flush unbh before the transaction commits
+				 */
+				reiserfs_add_tail_list(inode, unbh);
+
+				/* mark it dirty now to prevent commit_write from adding
+				 ** this buffer to the inode's dirty buffer list
+				 */
+				/*
+				 * AKPM: changed __mark_buffer_dirty to mark_buffer_dirty().
+				 * It's still atomic, but it sets the page dirty too,
+				 * which makes it eligible for writeback at any time by the
+				 * VM (which was also the case with __mark_buffer_dirty())
+				 */
+				mark_buffer_dirty(unbh);
+			}
+		} else {
+			/* append indirect item with holes if needed, when appending
+			   pointer to 'block'-th block use block, which is already
+			   allocated */
+			struct cpu_key tmp_key;
+			unp_t unf_single = 0;	// We use this in case we need to allocate only
+			// one block which is a fastpath
+			unp_t *un;
+			__u64 max_to_insert =
+			    MAX_ITEM_LEN(inode->i_sb->s_blocksize) /
+			    UNFM_P_SIZE;
+			__u64 blocks_needed;
+
+			RFALSE(pos_in_item != ih_item_len(ih) / UNFM_P_SIZE,
+			       "vs-804: invalid position for append");
+			/* indirect item has to be appended, set up key of that position */
+			make_cpu_key(&tmp_key, inode,
+				     le_key_k_offset(version,
+						     &(ih->ih_key)) +
+				     op_bytes_number(ih,
+						     inode->i_sb->s_blocksize),
+				     //pos_in_item * inode->i_sb->s_blocksize,
+				     TYPE_INDIRECT, 3);	// key type is unimportant
+
+			blocks_needed =
+			    1 +
+			    ((cpu_key_k_offset(&key) -
+			      cpu_key_k_offset(&tmp_key)) >> inode->i_sb->
+			     s_blocksize_bits);
+			RFALSE(blocks_needed < 0, "green-805: invalid offset");
+
+			if (blocks_needed == 1) {
+				un = &unf_single;
+			} else {
+				un = kmalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC);	// We need to avoid scheduling.
+				if (!un) {
+					un = &unf_single;
+					blocks_needed = 1;
+					max_to_insert = 0;
+				} else
+					memset(un, 0,
+					       UNFM_P_SIZE * min(blocks_needed,
+								 max_to_insert));
+			}
+			if (blocks_needed <= max_to_insert) {
+				/* we are going to add target block to the file. Use allocated
+				   block for that */
+				un[blocks_needed - 1] =
+				    cpu_to_le32(allocated_block_nr);
+				set_block_dev_mapped(bh_result,
+						     allocated_block_nr, inode);
+				set_buffer_new(bh_result);
+				done = 1;
+			} else {
+				/* paste hole to the indirect item */
+				/* If kmalloc failed, max_to_insert becomes zero and it means we
+				   only have space for one block */
+				blocks_needed =
+				    max_to_insert ? max_to_insert : 1;
+			}
+			retval =
+			    reiserfs_paste_into_item(th, &path, &tmp_key, inode,
+						     (char *)un,
+						     UNFM_P_SIZE *
+						     blocks_needed);
+
+			if (blocks_needed != 1)
+				kfree(un);
+
+			if (retval) {
+				reiserfs_free_block(th, inode,
+						    allocated_block_nr, 1);
+				goto failure;
+			}
+			if (!done) {
+				/* We need to mark new file size in case this function will be
+				   interrupted/aborted later on. And we may do this only for
+				   holes. */
+				inode->i_size +=
+				    inode->i_sb->s_blocksize * blocks_needed;
+			}
+		}
 
-    retval = 0;
+		if (done == 1)
+			break;
 
- failure:
-    if (th && (!dangle || (retval && !th->t_trans_id))) {
-        int err;
-        if (th->t_trans_id)
-            reiserfs_update_sd(th, inode);
-        err = reiserfs_end_persistent_transaction(th);
-        if (err)
-            retval = err;
-    }
+		/* this loop could log more blocks than we had originally asked
+		 ** for.  So, we have to allow the transaction to end if it is
+		 ** too big or too full.  Update the inode so things are 
+		 ** consistent if we crash before the function returns
+		 **
+		 ** release the path so that anybody waiting on the path before
+		 ** ending their transaction will be able to continue.
+		 */
+		if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
+			retval = restart_transaction(th, inode, &path);
+			if (retval)
+				goto failure;
+		}
+		/* inserting indirect pointers for a hole can take a 
+		 ** long time.  reschedule if needed
+		 */
+		cond_resched();
 
-    reiserfs_write_unlock(inode->i_sb);
-    reiserfs_check_path(&path) ;
-    return retval;
+		retval = search_for_position_by_key(inode->i_sb, &key, &path);
+		if (retval == IO_ERROR) {
+			retval = -EIO;
+			goto failure;
+		}
+		if (retval == POSITION_FOUND) {
+			reiserfs_warning(inode->i_sb,
+					 "vs-825: reiserfs_get_block: "
+					 "%K should not be found", &key);
+			retval = -EEXIST;
+			if (allocated_block_nr)
+				reiserfs_free_block(th, inode,
+						    allocated_block_nr, 1);
+			pathrelse(&path);
+			goto failure;
+		}
+		bh = get_last_bh(&path);
+		ih = get_ih(&path);
+		item = get_item(&path);
+		pos_in_item = path.pos_in_item;
+	} while (1);
+
+	retval = 0;
+
+      failure:
+	if (th && (!dangle || (retval && !th->t_trans_id))) {
+		int err;
+		if (th->t_trans_id)
+			reiserfs_update_sd(th, inode);
+		err = reiserfs_end_persistent_transaction(th);
+		if (err)
+			retval = err;
+	}
+
+	reiserfs_write_unlock(inode->i_sb);
+	reiserfs_check_path(&path);
+	return retval;
 }
 
 static int
 reiserfs_readpages(struct file *file, struct address_space *mapping,
-		struct list_head *pages, unsigned nr_pages)
+		   struct list_head *pages, unsigned nr_pages)
 {
-    return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
+	return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block);
 }
 
 /* Compute real number of used bytes by file
@@ -996,51 +1058,56 @@ reiserfs_readpages(struct file *file, struct address_space *mapping,
  */
 static int real_space_diff(struct inode *inode, int sd_size)
 {
-    int bytes;
-    loff_t blocksize = inode->i_sb->s_blocksize ;
-
-    if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
-        return sd_size ;
-
-    /* End of file is also in full block with indirect reference, so round
-    ** up to the next block.
-    **
-    ** there is just no way to know if the tail is actually packed
-    ** on the file, so we have to assume it isn't.  When we pack the
-    ** tail, we add 4 bytes to pretend there really is an unformatted
-    ** node pointer
-    */
-    bytes = ((inode->i_size + (blocksize-1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + sd_size;
-    return bytes ;
+	int bytes;
+	loff_t blocksize = inode->i_sb->s_blocksize;
+
+	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode))
+		return sd_size;
+
+	/* End of file is also in full block with indirect reference, so round
+	 ** up to the next block.
+	 **
+	 ** there is just no way to know if the tail is actually packed
+	 ** on the file, so we have to assume it isn't.  When we pack the
+	 ** tail, we add 4 bytes to pretend there really is an unformatted
+	 ** node pointer
+	 */
+	bytes =
+	    ((inode->i_size +
+	      (blocksize - 1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE +
+	    sd_size;
+	return bytes;
 }
 
 static inline loff_t to_real_used_space(struct inode *inode, ulong blocks,
-                                        int sd_size)
+					int sd_size)
 {
-    if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
-        return inode->i_size + (loff_t)(real_space_diff(inode, sd_size)) ;
-    }
-    return ((loff_t)real_space_diff(inode, sd_size)) + (((loff_t)blocks) << 9);
+	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
+		return inode->i_size +
+		    (loff_t) (real_space_diff(inode, sd_size));
+	}
+	return ((loff_t) real_space_diff(inode, sd_size)) +
+	    (((loff_t) blocks) << 9);
 }
 
 /* Compute number of blocks used by file in ReiserFS counting */
 static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
 {
-    loff_t bytes = inode_get_bytes(inode) ;
-    loff_t real_space = real_space_diff(inode, sd_size) ;
-
-    /* keeps fsck and non-quota versions of reiserfs happy */
-    if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
-        bytes += (loff_t)511 ;
-    }
-
-    /* files from before the quota patch might i_blocks such that
-    ** bytes < real_space.  Deal with that here to prevent it from
-    ** going negative.
-    */
-    if (bytes < real_space)
-        return 0 ;
-    return (bytes - real_space) >> 9;
+	loff_t bytes = inode_get_bytes(inode);
+	loff_t real_space = real_space_diff(inode, sd_size);
+
+	/* keeps fsck and non-quota versions of reiserfs happy */
+	if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) {
+		bytes += (loff_t) 511;
+	}
+
+	/* files from before the quota patch might have i_blocks such that
+	 ** bytes < real_space.  Deal with that here to prevent it from
+	 ** going negative.
+	 */
+	if (bytes < real_space)
+		return 0;
+	return (bytes - real_space) >> 9;
 }
 
 //
@@ -1051,263 +1118,269 @@ static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size)
 //
 
 // called by read_locked_inode
-static void init_inode (struct inode * inode, struct path * path)
+static void init_inode(struct inode *inode, struct path *path)
 {
-    struct buffer_head * bh;
-    struct item_head * ih;
-    __u32 rdev;
-    //int version = ITEM_VERSION_1;
-
-    bh = PATH_PLAST_BUFFER (path);
-    ih = PATH_PITEM_HEAD (path);
-
-
-    copy_key (INODE_PKEY (inode), &(ih->ih_key));
-    inode->i_blksize = reiserfs_default_io_size;
-
-    INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list ));
-    REISERFS_I(inode)->i_flags = 0;
-    REISERFS_I(inode)->i_prealloc_block = 0;
-    REISERFS_I(inode)->i_prealloc_count = 0;
-    REISERFS_I(inode)->i_trans_id = 0;
-    REISERFS_I(inode)->i_jl = NULL;
-    REISERFS_I(inode)->i_acl_access = NULL;
-    REISERFS_I(inode)->i_acl_default = NULL;
-    init_rwsem (&REISERFS_I(inode)->xattr_sem);
-
-    if (stat_data_v1 (ih)) {
-	struct stat_data_v1 * sd = (struct stat_data_v1 *)B_I_PITEM (bh, ih);
-	unsigned long blocks;
-
-	set_inode_item_key_version (inode, KEY_FORMAT_3_5);
-        set_inode_sd_version (inode, STAT_DATA_V1);
-	inode->i_mode  = sd_v1_mode(sd);
-	inode->i_nlink = sd_v1_nlink(sd);
-	inode->i_uid   = sd_v1_uid(sd);
-	inode->i_gid   = sd_v1_gid(sd);
-	inode->i_size  = sd_v1_size(sd);
-	inode->i_atime.tv_sec = sd_v1_atime(sd);
-	inode->i_mtime.tv_sec = sd_v1_mtime(sd);
-	inode->i_ctime.tv_sec = sd_v1_ctime(sd);
-	inode->i_atime.tv_nsec = 0;
-	inode->i_ctime.tv_nsec = 0;
-	inode->i_mtime.tv_nsec = 0;
-
-	inode->i_blocks = sd_v1_blocks(sd);
-	inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id);
-	blocks = (inode->i_size + 511) >> 9;
-	blocks = _ROUND_UP (blocks, inode->i_sb->s_blocksize >> 9);
-	if (inode->i_blocks > blocks) {
-	    // there was a bug in <=3.5.23 when i_blocks could take negative
-	    // values. Starting from 3.5.17 this value could even be stored in
-	    // stat data. For such files we set i_blocks based on file
-	    // size. Just 2 notes: this can be wrong for sparce files. On-disk value will be
-	    // only updated if file's inode will ever change
-	    inode->i_blocks = blocks;
-	}
-
-        rdev = sd_v1_rdev(sd);
-	REISERFS_I(inode)->i_first_direct_byte = sd_v1_first_direct_byte(sd);
-	/* an early bug in the quota code can give us an odd number for the
-	** block count.  This is incorrect, fix it here.
-	*/
-	if (inode->i_blocks & 1) {
-	    inode->i_blocks++ ;
-	}
-	inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
-	                                          SD_V1_SIZE));
-	/* nopack is initially zero for v1 objects. For v2 objects,
-	   nopack is initialised from sd_attrs */
-	REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
-    } else {
-	// new stat data found, but object may have old items
-	// (directories and symlinks)
-	struct stat_data * sd = (struct stat_data *)B_I_PITEM (bh, ih);
-
-	inode->i_mode   = sd_v2_mode(sd);
-	inode->i_nlink  = sd_v2_nlink(sd);
-	inode->i_uid    = sd_v2_uid(sd);
-	inode->i_size   = sd_v2_size(sd);
-	inode->i_gid    = sd_v2_gid(sd);
-	inode->i_mtime.tv_sec  = sd_v2_mtime(sd);
-	inode->i_atime.tv_sec = sd_v2_atime(sd);
-	inode->i_ctime.tv_sec  = sd_v2_ctime(sd);
-	inode->i_ctime.tv_nsec = 0;
-	inode->i_mtime.tv_nsec = 0;
-	inode->i_atime.tv_nsec = 0;
-	inode->i_blocks = sd_v2_blocks(sd);
-        rdev            = sd_v2_rdev(sd);
-	if( S_ISCHR( inode -> i_mode ) || S_ISBLK( inode -> i_mode ) )
-	    inode->i_generation = le32_to_cpu (INODE_PKEY (inode)->k_dir_id);
-	else
-            inode->i_generation = sd_v2_generation(sd);
+	struct buffer_head *bh;
+	struct item_head *ih;
+	__u32 rdev;
+	//int version = ITEM_VERSION_1;
+
+	bh = PATH_PLAST_BUFFER(path);
+	ih = PATH_PITEM_HEAD(path);
+
+	copy_key(INODE_PKEY(inode), &(ih->ih_key));
+	inode->i_blksize = reiserfs_default_io_size;
+
+	INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
+	REISERFS_I(inode)->i_flags = 0;
+	REISERFS_I(inode)->i_prealloc_block = 0;
+	REISERFS_I(inode)->i_prealloc_count = 0;
+	REISERFS_I(inode)->i_trans_id = 0;
+	REISERFS_I(inode)->i_jl = NULL;
+	REISERFS_I(inode)->i_acl_access = NULL;
+	REISERFS_I(inode)->i_acl_default = NULL;
+	init_rwsem(&REISERFS_I(inode)->xattr_sem);
+
+	if (stat_data_v1(ih)) {
+		struct stat_data_v1 *sd =
+		    (struct stat_data_v1 *)B_I_PITEM(bh, ih);
+		unsigned long blocks;
+
+		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
+		set_inode_sd_version(inode, STAT_DATA_V1);
+		inode->i_mode = sd_v1_mode(sd);
+		inode->i_nlink = sd_v1_nlink(sd);
+		inode->i_uid = sd_v1_uid(sd);
+		inode->i_gid = sd_v1_gid(sd);
+		inode->i_size = sd_v1_size(sd);
+		inode->i_atime.tv_sec = sd_v1_atime(sd);
+		inode->i_mtime.tv_sec = sd_v1_mtime(sd);
+		inode->i_ctime.tv_sec = sd_v1_ctime(sd);
+		inode->i_atime.tv_nsec = 0;
+		inode->i_ctime.tv_nsec = 0;
+		inode->i_mtime.tv_nsec = 0;
+
+		inode->i_blocks = sd_v1_blocks(sd);
+		inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
+		blocks = (inode->i_size + 511) >> 9;
+		blocks = _ROUND_UP(blocks, inode->i_sb->s_blocksize >> 9);
+		if (inode->i_blocks > blocks) {
+			// there was a bug in <=3.5.23 when i_blocks could take negative
+			// values. Starting from 3.5.17 this value could even be stored in
+			// stat data. For such files we set i_blocks based on file
+			// size. Just 2 notes: this can be wrong for sparse files. On-disk value will be
+			// only updated if file's inode will ever change
+			inode->i_blocks = blocks;
+		}
 
-	if (S_ISDIR (inode->i_mode) || S_ISLNK (inode->i_mode))
-	    set_inode_item_key_version (inode, KEY_FORMAT_3_5);
-	else
-	    set_inode_item_key_version (inode, KEY_FORMAT_3_6);
-	REISERFS_I(inode)->i_first_direct_byte = 0;
-	set_inode_sd_version (inode, STAT_DATA_V2);
-	inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks,
-	                                          SD_V2_SIZE));
-	/* read persistent inode attributes from sd and initalise
-	   generic inode flags from them */
-	REISERFS_I(inode)->i_attrs = sd_v2_attrs( sd );
-	sd_attrs_to_i_attrs( sd_v2_attrs( sd ), inode );
-    }
-
-    pathrelse (path);
-    if (S_ISREG (inode->i_mode)) {
-	inode->i_op = &reiserfs_file_inode_operations;
-	inode->i_fop = &reiserfs_file_operations;
-	inode->i_mapping->a_ops = &reiserfs_address_space_operations ;
-    } else if (S_ISDIR (inode->i_mode)) {
-	inode->i_op = &reiserfs_dir_inode_operations;
-	inode->i_fop = &reiserfs_dir_operations;
-    } else if (S_ISLNK (inode->i_mode)) {
-	inode->i_op = &reiserfs_symlink_inode_operations;
-	inode->i_mapping->a_ops = &reiserfs_address_space_operations;
-    } else {
-	inode->i_blocks = 0;
-	inode->i_op = &reiserfs_special_inode_operations;
-	init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
-    }
-}
+		rdev = sd_v1_rdev(sd);
+		REISERFS_I(inode)->i_first_direct_byte =
+		    sd_v1_first_direct_byte(sd);
+		/* an early bug in the quota code can give us an odd number for the
+		 ** block count.  This is incorrect, fix it here.
+		 */
+		if (inode->i_blocks & 1) {
+			inode->i_blocks++;
+		}
+		inode_set_bytes(inode,
+				to_real_used_space(inode, inode->i_blocks,
+						   SD_V1_SIZE));
+		/* nopack is initially zero for v1 objects. For v2 objects,
+		   nopack is initialised from sd_attrs */
+		REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
+	} else {
+		// new stat data found, but object may have old items
+		// (directories and symlinks)
+		struct stat_data *sd = (struct stat_data *)B_I_PITEM(bh, ih);
+
+		inode->i_mode = sd_v2_mode(sd);
+		inode->i_nlink = sd_v2_nlink(sd);
+		inode->i_uid = sd_v2_uid(sd);
+		inode->i_size = sd_v2_size(sd);
+		inode->i_gid = sd_v2_gid(sd);
+		inode->i_mtime.tv_sec = sd_v2_mtime(sd);
+		inode->i_atime.tv_sec = sd_v2_atime(sd);
+		inode->i_ctime.tv_sec = sd_v2_ctime(sd);
+		inode->i_ctime.tv_nsec = 0;
+		inode->i_mtime.tv_nsec = 0;
+		inode->i_atime.tv_nsec = 0;
+		inode->i_blocks = sd_v2_blocks(sd);
+		rdev = sd_v2_rdev(sd);
+		if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+			inode->i_generation =
+			    le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
+		else
+			inode->i_generation = sd_v2_generation(sd);
 
+		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+			set_inode_item_key_version(inode, KEY_FORMAT_3_5);
+		else
+			set_inode_item_key_version(inode, KEY_FORMAT_3_6);
+		REISERFS_I(inode)->i_first_direct_byte = 0;
+		set_inode_sd_version(inode, STAT_DATA_V2);
+		inode_set_bytes(inode,
+				to_real_used_space(inode, inode->i_blocks,
+						   SD_V2_SIZE));
+		/* read persistent inode attributes from sd and initialise
+		   generic inode flags from them */
+		REISERFS_I(inode)->i_attrs = sd_v2_attrs(sd);
+		sd_attrs_to_i_attrs(sd_v2_attrs(sd), inode);
+	}
+
+	pathrelse(path);
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &reiserfs_file_inode_operations;
+		inode->i_fop = &reiserfs_file_operations;
+		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &reiserfs_dir_inode_operations;
+		inode->i_fop = &reiserfs_dir_operations;
+	} else if (S_ISLNK(inode->i_mode)) {
+		inode->i_op = &reiserfs_symlink_inode_operations;
+		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
+	} else {
+		inode->i_blocks = 0;
+		inode->i_op = &reiserfs_special_inode_operations;
+		init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+	}
+}
 
 // update new stat data with inode fields
-static void inode2sd (void * sd, struct inode * inode, loff_t size)
+static void inode2sd(void *sd, struct inode *inode, loff_t size)
 {
-    struct stat_data * sd_v2 = (struct stat_data *)sd;
-    __u16 flags;
-
-    set_sd_v2_mode(sd_v2, inode->i_mode );
-    set_sd_v2_nlink(sd_v2, inode->i_nlink );
-    set_sd_v2_uid(sd_v2, inode->i_uid );
-    set_sd_v2_size(sd_v2, size );
-    set_sd_v2_gid(sd_v2, inode->i_gid );
-    set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec );
-    set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec );
-    set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec );
-    set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
-    if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-	set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
-    else
-	set_sd_v2_generation(sd_v2, inode->i_generation);
-    flags = REISERFS_I(inode)->i_attrs;
-    i_attrs_to_sd_attrs( inode, &flags );
-    set_sd_v2_attrs( sd_v2, flags );
+	struct stat_data *sd_v2 = (struct stat_data *)sd;
+	__u16 flags;
+
+	set_sd_v2_mode(sd_v2, inode->i_mode);
+	set_sd_v2_nlink(sd_v2, inode->i_nlink);
+	set_sd_v2_uid(sd_v2, inode->i_uid);
+	set_sd_v2_size(sd_v2, size);
+	set_sd_v2_gid(sd_v2, inode->i_gid);
+	set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec);
+	set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec);
+	set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec);
+	set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE));
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev));
+	else
+		set_sd_v2_generation(sd_v2, inode->i_generation);
+	flags = REISERFS_I(inode)->i_attrs;
+	i_attrs_to_sd_attrs(inode, &flags);
+	set_sd_v2_attrs(sd_v2, flags);
 }
 
-
 // used to copy inode's fields to old stat data
-static void inode2sd_v1 (void * sd, struct inode * inode, loff_t size)
+static void inode2sd_v1(void *sd, struct inode *inode, loff_t size)
 {
-    struct stat_data_v1 * sd_v1 = (struct stat_data_v1 *)sd;
-
-    set_sd_v1_mode(sd_v1, inode->i_mode );
-    set_sd_v1_uid(sd_v1, inode->i_uid );
-    set_sd_v1_gid(sd_v1, inode->i_gid );
-    set_sd_v1_nlink(sd_v1, inode->i_nlink );
-    set_sd_v1_size(sd_v1, size );
-    set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec );
-    set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec );
-    set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec );
-
-    if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-        set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
-    else
-        set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
-
-    // Sigh. i_first_direct_byte is back
-    set_sd_v1_first_direct_byte(sd_v1, REISERFS_I(inode)->i_first_direct_byte);
-}
+	struct stat_data_v1 *sd_v1 = (struct stat_data_v1 *)sd;
+
+	set_sd_v1_mode(sd_v1, inode->i_mode);
+	set_sd_v1_uid(sd_v1, inode->i_uid);
+	set_sd_v1_gid(sd_v1, inode->i_gid);
+	set_sd_v1_nlink(sd_v1, inode->i_nlink);
+	set_sd_v1_size(sd_v1, size);
+	set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec);
+	set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec);
+	set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec);
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev));
+	else
+		set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE));
 
+	// Sigh. i_first_direct_byte is back
+	set_sd_v1_first_direct_byte(sd_v1,
+				    REISERFS_I(inode)->i_first_direct_byte);
+}
 
 /* NOTE, you must prepare the buffer head before sending it here,
 ** and then log it after the call
 */
-static void update_stat_data (struct path * path, struct inode * inode,
-                              loff_t size)
+static void update_stat_data(struct path *path, struct inode *inode,
+			     loff_t size)
 {
-    struct buffer_head * bh;
-    struct item_head * ih;
-  
-    bh = PATH_PLAST_BUFFER (path);
-    ih = PATH_PITEM_HEAD (path);
-
-    if (!is_statdata_le_ih (ih))
-	reiserfs_panic (inode->i_sb, "vs-13065: update_stat_data: key %k, found item %h",
-			INODE_PKEY (inode), ih);
-  
-    if (stat_data_v1 (ih)) {
-	// path points to old stat data
-	inode2sd_v1 (B_I_PITEM (bh, ih), inode, size);
-    } else {
-	inode2sd (B_I_PITEM (bh, ih), inode, size);
-    }
-
-    return;
-}
+	struct buffer_head *bh;
+	struct item_head *ih;
+
+	bh = PATH_PLAST_BUFFER(path);
+	ih = PATH_PITEM_HEAD(path);
+
+	if (!is_statdata_le_ih(ih))
+		reiserfs_panic(inode->i_sb,
+			       "vs-13065: update_stat_data: key %k, found item %h",
+			       INODE_PKEY(inode), ih);
+
+	if (stat_data_v1(ih)) {
+		// path points to old stat data
+		inode2sd_v1(B_I_PITEM(bh, ih), inode, size);
+	} else {
+		inode2sd(B_I_PITEM(bh, ih), inode, size);
+	}
 
+	return;
+}
 
-void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th,
-			      struct inode * inode, loff_t size)
+void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
+			     struct inode *inode, loff_t size)
 {
-    struct cpu_key key;
-    INITIALIZE_PATH(path);
-    struct buffer_head *bh ;
-    int fs_gen ;
-    struct item_head *ih, tmp_ih ;
-    int retval;
-
-    BUG_ON (!th->t_trans_id);
-
-    make_cpu_key (&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);//key type is unimportant
-    
-    for(;;) {
-	int pos;
-	/* look for the object's stat data */
-	retval = search_item (inode->i_sb, &key, &path);
-	if (retval == IO_ERROR) {
-	    reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: "
-			      "i/o failure occurred trying to update %K stat data",
-			      &key);
-	    return;
-	}
-	if (retval == ITEM_NOT_FOUND) {
-	    pos = PATH_LAST_POSITION (&path);
-	    pathrelse(&path) ;
-	    if (inode->i_nlink == 0) {
-		/*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found");*/
-		return;
-	    }
-	    reiserfs_warning (inode->i_sb, "vs-13060: reiserfs_update_sd: "
-			      "stat data of object %k (nlink == %d) not found (pos %d)",
-			      INODE_PKEY (inode), inode->i_nlink, pos);
-	    reiserfs_check_path(&path) ;
-	    return;
-	}
-	
-	/* sigh, prepare_for_journal might schedule.  When it schedules the
-	** FS might change.  We have to detect that, and loop back to the
-	** search if the stat data item has moved
-	*/
-	bh = get_last_bh(&path) ;
-	ih = get_ih(&path) ;
-	copy_item_head (&tmp_ih, ih);
-	fs_gen = get_generation (inode->i_sb);
-	reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
-	if (fs_changed (fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) {
-	    reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
-	    continue ;	/* Stat_data item has been moved after scheduling. */
-	}
-	break;
-    }
-    update_stat_data (&path, inode, size);
-    journal_mark_dirty(th, th->t_super, bh) ; 
-    pathrelse (&path);
-    return;
+	struct cpu_key key;
+	INITIALIZE_PATH(path);
+	struct buffer_head *bh;
+	int fs_gen;
+	struct item_head *ih, tmp_ih;
+	int retval;
+
+	BUG_ON(!th->t_trans_id);
+
+	make_cpu_key(&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);	//key type is unimportant
+
+	for (;;) {
+		int pos;
+		/* look for the object's stat data */
+		retval = search_item(inode->i_sb, &key, &path);
+		if (retval == IO_ERROR) {
+			reiserfs_warning(inode->i_sb,
+					 "vs-13050: reiserfs_update_sd: "
+					 "i/o failure occurred trying to update %K stat data",
+					 &key);
+			return;
+		}
+		if (retval == ITEM_NOT_FOUND) {
+			pos = PATH_LAST_POSITION(&path);
+			pathrelse(&path);
+			if (inode->i_nlink == 0) {
+				/*reiserfs_warning (inode->i_sb, "vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found"); */
+				return;
+			}
+			reiserfs_warning(inode->i_sb,
+					 "vs-13060: reiserfs_update_sd: "
+					 "stat data of object %k (nlink == %d) not found (pos %d)",
+					 INODE_PKEY(inode), inode->i_nlink,
+					 pos);
+			reiserfs_check_path(&path);
+			return;
+		}
+
+		/* sigh, prepare_for_journal might schedule.  When it schedules the
+		 ** FS might change.  We have to detect that, and loop back to the
+		 ** search if the stat data item has moved
+		 */
+		bh = get_last_bh(&path);
+		ih = get_ih(&path);
+		copy_item_head(&tmp_ih, ih);
+		fs_gen = get_generation(inode->i_sb);
+		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
+		if (fs_changed(fs_gen, inode->i_sb)
+		    && item_moved(&tmp_ih, &path)) {
+			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
+			continue;	/* Stat_data item has been moved after scheduling. */
+		}
+		break;
+	}
+	update_stat_data(&path, inode, size);
+	journal_mark_dirty(th, th->t_super, bh);
+	pathrelse(&path);
+	return;
 }
 
 /* reiserfs_read_locked_inode is called to read the inode off disk, and it
@@ -1316,9 +1389,10 @@ void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th,
 ** corresponding iput might try to delete whatever object the inode last
 ** represented.
 */
-static void reiserfs_make_bad_inode(struct inode *inode) {
-    memset(INODE_PKEY(inode), 0, KEY_SIZE);
-    make_bad_inode(inode);
+static void reiserfs_make_bad_inode(struct inode *inode)
+{
+	memset(INODE_PKEY(inode), 0, KEY_SIZE);
+	make_bad_inode(inode);
 }
 
 //
@@ -1326,77 +1400,79 @@ static void reiserfs_make_bad_inode(struct inode *inode) {
 // evolved as the prototype did
 //
 
-int reiserfs_init_locked_inode (struct inode * inode, void *p)
+int reiserfs_init_locked_inode(struct inode *inode, void *p)
 {
-    struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p ;
-    inode->i_ino = args->objectid;
-    INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
-    return 0;
+	struct reiserfs_iget_args *args = (struct reiserfs_iget_args *)p;
+	inode->i_ino = args->objectid;
+	INODE_PKEY(inode)->k_dir_id = cpu_to_le32(args->dirid);
+	return 0;
 }
 
 /* looks for stat data in the tree, and fills up the fields of in-core
    inode stat data fields */
-void reiserfs_read_locked_inode (struct inode * inode, struct reiserfs_iget_args *args)
+void reiserfs_read_locked_inode(struct inode *inode,
+				struct reiserfs_iget_args *args)
 {
-    INITIALIZE_PATH (path_to_sd);
-    struct cpu_key key;
-    unsigned long dirino;
-    int retval;
-
-    dirino = args->dirid ;
-
-    /* set version 1, version 2 could be used too, because stat data
-       key is the same in both versions */
-    key.version = KEY_FORMAT_3_5;
-    key.on_disk_key.k_dir_id = dirino;
-    key.on_disk_key.k_objectid = inode->i_ino;
-    key.on_disk_key.k_offset = 0;
-    key.on_disk_key.k_type = 0;
-
-    /* look for the object's stat data */
-    retval = search_item (inode->i_sb, &key, &path_to_sd);
-    if (retval == IO_ERROR) {
-	reiserfs_warning (inode->i_sb, "vs-13070: reiserfs_read_locked_inode: "
-			  "i/o failure occurred trying to find stat data of %K",
-			  &key);
-	reiserfs_make_bad_inode(inode) ;
-	return;
-    }
-    if (retval != ITEM_FOUND) {
-	/* a stale NFS handle can trigger this without it being an error */
-	pathrelse (&path_to_sd);
-	reiserfs_make_bad_inode(inode) ;
-	inode->i_nlink = 0;
-	return;
-    }
-
-    init_inode (inode, &path_to_sd);
-   
-    /* It is possible that knfsd is trying to access inode of a file
-       that is being removed from the disk by some other thread. As we
-       update sd on unlink all that is required is to check for nlink
-       here. This bug was first found by Sizif when debugging
-       SquidNG/Butterfly, forgotten, and found again after Philippe
-       Gramoulle <philippe.gramoulle@mmania.com> reproduced it. 
-
-       More logical fix would require changes in fs/inode.c:iput() to
-       remove inode from hash-table _after_ fs cleaned disk stuff up and
-       in iget() to return NULL if I_FREEING inode is found in
-       hash-table. */
-    /* Currently there is one place where it's ok to meet inode with
-       nlink==0: processing of open-unlinked and half-truncated files
-       during mount (fs/reiserfs/super.c:finish_unfinished()). */
-    if( ( inode -> i_nlink == 0 ) && 
-	! REISERFS_SB(inode -> i_sb) -> s_is_unlinked_ok ) {
-	    reiserfs_warning (inode->i_sb,
-			      "vs-13075: reiserfs_read_locked_inode: "
-			      "dead inode read from disk %K. "
-			      "This is likely to be race with knfsd. Ignore",
-			      &key );
-	    reiserfs_make_bad_inode( inode );
-    }
-
-    reiserfs_check_path(&path_to_sd) ; /* init inode should be relsing */
+	INITIALIZE_PATH(path_to_sd);
+	struct cpu_key key;
+	unsigned long dirino;
+	int retval;
+
+	dirino = args->dirid;
+
+	/* set version 1, version 2 could be used too, because stat data
+	   key is the same in both versions */
+	key.version = KEY_FORMAT_3_5;
+	key.on_disk_key.k_dir_id = dirino;
+	key.on_disk_key.k_objectid = inode->i_ino;
+	key.on_disk_key.k_offset = 0;
+	key.on_disk_key.k_type = 0;
+
+	/* look for the object's stat data */
+	retval = search_item(inode->i_sb, &key, &path_to_sd);
+	if (retval == IO_ERROR) {
+		reiserfs_warning(inode->i_sb,
+				 "vs-13070: reiserfs_read_locked_inode: "
+				 "i/o failure occurred trying to find stat data of %K",
+				 &key);
+		reiserfs_make_bad_inode(inode);
+		return;
+	}
+	if (retval != ITEM_FOUND) {
+		/* a stale NFS handle can trigger this without it being an error */
+		pathrelse(&path_to_sd);
+		reiserfs_make_bad_inode(inode);
+		inode->i_nlink = 0;
+		return;
+	}
+
+	init_inode(inode, &path_to_sd);
+
+	/* It is possible that knfsd is trying to access inode of a file
+	   that is being removed from the disk by some other thread. As we
+	   update sd on unlink all that is required is to check for nlink
+	   here. This bug was first found by Sizif when debugging
+	   SquidNG/Butterfly, forgotten, and found again after Philippe
+	   Gramoulle <philippe.gramoulle@mmania.com> reproduced it. 
+
+	   More logical fix would require changes in fs/inode.c:iput() to
+	   remove inode from hash-table _after_ fs cleaned disk stuff up and
+	   in iget() to return NULL if I_FREEING inode is found in
+	   hash-table. */
+	/* Currently there is one place where it's ok to meet inode with
+	   nlink==0: processing of open-unlinked and half-truncated files
+	   during mount (fs/reiserfs/super.c:finish_unfinished()). */
+	if ((inode->i_nlink == 0) &&
+	    !REISERFS_SB(inode->i_sb)->s_is_unlinked_ok) {
+		reiserfs_warning(inode->i_sb,
+				 "vs-13075: reiserfs_read_locked_inode: "
+				 "dead inode read from disk %K. "
+				 "This is likely to be race with knfsd. Ignore",
+				 &key);
+		reiserfs_make_bad_inode(inode);
+	}
+
+	reiserfs_check_path(&path_to_sd);	/* init inode should be relsing */
 
 }
 
@@ -1412,140 +1488,148 @@ void reiserfs_read_locked_inode (struct inode * inode, struct reiserfs_iget_args
  * inode numbers (objectids) are distinguished by parent directory ids.
  *
  */
-int reiserfs_find_actor( struct inode *inode, void *opaque )
+int reiserfs_find_actor(struct inode *inode, void *opaque)
 {
-    struct reiserfs_iget_args *args;
+	struct reiserfs_iget_args *args;
 
-    args = opaque;
-    /* args is already in CPU order */
-    return (inode->i_ino == args->objectid) &&
-	(le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
+	args = opaque;
+	/* args is already in CPU order */
+	return (inode->i_ino == args->objectid) &&
+	    (le32_to_cpu(INODE_PKEY(inode)->k_dir_id) == args->dirid);
 }
 
-struct inode * reiserfs_iget (struct super_block * s, const struct cpu_key * key)
+struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key)
 {
-    struct inode * inode;
-    struct reiserfs_iget_args args ;
-
-    args.objectid = key->on_disk_key.k_objectid ;
-    args.dirid = key->on_disk_key.k_dir_id ;
-    inode = iget5_locked (s, key->on_disk_key.k_objectid, 
-		   reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args));
-    if (!inode) 
-	return ERR_PTR(-ENOMEM) ;
-
-    if (inode->i_state & I_NEW) {
-	reiserfs_read_locked_inode(inode, &args);
-	unlock_new_inode(inode);
-    }
-
-    if (comp_short_keys (INODE_PKEY (inode), key) || is_bad_inode (inode)) {
-	/* either due to i/o error or a stale NFS handle */
-	iput (inode);
-	inode = NULL;
-    }
-    return inode;
+	struct inode *inode;
+	struct reiserfs_iget_args args;
+
+	args.objectid = key->on_disk_key.k_objectid;
+	args.dirid = key->on_disk_key.k_dir_id;
+	inode = iget5_locked(s, key->on_disk_key.k_objectid,
+			     reiserfs_find_actor, reiserfs_init_locked_inode,
+			     (void *)(&args));
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	if (inode->i_state & I_NEW) {
+		reiserfs_read_locked_inode(inode, &args);
+		unlock_new_inode(inode);
+	}
+
+	if (comp_short_keys(INODE_PKEY(inode), key) || is_bad_inode(inode)) {
+		/* either due to i/o error or a stale NFS handle */
+		iput(inode);
+		inode = NULL;
+	}
+	return inode;
 }
 
 struct dentry *reiserfs_get_dentry(struct super_block *sb, void *vobjp)
 {
-    __u32 *data = vobjp;
-    struct cpu_key key ;
-    struct dentry *result;
-    struct inode *inode;
-    
-    key.on_disk_key.k_objectid = data[0] ;
-    key.on_disk_key.k_dir_id = data[1] ;
-    reiserfs_write_lock(sb);
-    inode = reiserfs_iget(sb, &key) ;
-    if (inode && !IS_ERR(inode) && data[2] != 0 &&
-	data[2] != inode->i_generation) {
-	    iput(inode) ;
-	    inode = NULL ;
-    }
-    reiserfs_write_unlock(sb);
-    if (!inode)
-	    inode = ERR_PTR(-ESTALE);
-    if (IS_ERR(inode))
-	    return ERR_PTR(PTR_ERR(inode));
-    result = d_alloc_anon(inode);
-    if (!result) {
-	    iput(inode);
-	    return ERR_PTR(-ENOMEM);
-    }
-    return result;
+	__u32 *data = vobjp;
+	struct cpu_key key;
+	struct dentry *result;
+	struct inode *inode;
+
+	key.on_disk_key.k_objectid = data[0];
+	key.on_disk_key.k_dir_id = data[1];
+	reiserfs_write_lock(sb);
+	inode = reiserfs_iget(sb, &key);
+	if (inode && !IS_ERR(inode) && data[2] != 0 &&
+	    data[2] != inode->i_generation) {
+		iput(inode);
+		inode = NULL;
+	}
+	reiserfs_write_unlock(sb);
+	if (!inode)
+		inode = ERR_PTR(-ESTALE);
+	if (IS_ERR(inode))
+		return ERR_PTR(PTR_ERR(inode));
+	result = d_alloc_anon(inode);
+	if (!result) {
+		iput(inode);
+		return ERR_PTR(-ENOMEM);
+	}
+	return result;
 }
 
-struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 *data,
-                                     int len, int fhtype,
-				  int (*acceptable)(void *contect, struct dentry *de),
-				  void *context) {
-    __u32 obj[3], parent[3];
-
-    /* fhtype happens to reflect the number of u32s encoded.
-     * due to a bug in earlier code, fhtype might indicate there
-     * are more u32s then actually fitted.
-     * so if fhtype seems to be more than len, reduce fhtype.
-     * Valid types are:
-     *   2 - objectid + dir_id - legacy support
-     *   3 - objectid + dir_id + generation
-     *   4 - objectid + dir_id + objectid and dirid of parent - legacy
-     *   5 - objectid + dir_id + generation + objectid and dirid of parent
-     *   6 - as above plus generation of directory
-     * 6 does not fit in NFSv2 handles
-     */
-    if (fhtype > len) {
-	    if (fhtype != 6 || len != 5)
-		    reiserfs_warning (sb, "nfsd/reiserfs, fhtype=%d, len=%d - odd",
-			   fhtype, len);
-	    fhtype = 5;
-    }
-
-    obj[0] = data[0];
-    obj[1] = data[1];
-    if (fhtype == 3 || fhtype >= 5)
-	    obj[2] = data[2];
-    else    obj[2] = 0; /* generation number */
-
-    if (fhtype >= 4) {
-	    parent[0] = data[fhtype>=5?3:2] ;
-	    parent[1] = data[fhtype>=5?4:3] ;
-	    if (fhtype == 6)
-		    parent[2] = data[5];
-	    else    parent[2] = 0;
-    }
-    return sb->s_export_op->find_exported_dentry(sb, obj, fhtype < 4 ? NULL : parent,
-			       acceptable, context);
-}
+struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 * data,
+				  int len, int fhtype,
+				  int (*acceptable) (void *contect,
+						     struct dentry * de),
+				  void *context)
+{
+	__u32 obj[3], parent[3];
+
+	/* fhtype happens to reflect the number of u32s encoded.
+	 * due to a bug in earlier code, fhtype might indicate there
+	 * are more u32s than actually fitted.
+	 * so if fhtype seems to be more than len, reduce fhtype.
+	 * Valid types are:
+	 *   2 - objectid + dir_id - legacy support
+	 *   3 - objectid + dir_id + generation
+	 *   4 - objectid + dir_id + objectid and dirid of parent - legacy
+	 *   5 - objectid + dir_id + generation + objectid and dirid of parent
+	 *   6 - as above plus generation of directory
+	 * 6 does not fit in NFSv2 handles
+	 */
+	if (fhtype > len) {
+		if (fhtype != 6 || len != 5)
+			reiserfs_warning(sb,
+					 "nfsd/reiserfs, fhtype=%d, len=%d - odd",
+					 fhtype, len);
+		fhtype = 5;
+	}
+
+	obj[0] = data[0];
+	obj[1] = data[1];
+	if (fhtype == 3 || fhtype >= 5)
+		obj[2] = data[2];
+	else
+		obj[2] = 0;	/* generation number */
 
-int reiserfs_encode_fh(struct dentry *dentry, __u32 *data, int *lenp, int need_parent) {
-    struct inode *inode = dentry->d_inode ;
-    int maxlen = *lenp;
-    
-    if (maxlen < 3)
-        return 255 ;
-
-    data[0] = inode->i_ino ;
-    data[1] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ;
-    data[2] = inode->i_generation ;
-    *lenp = 3 ;
-    /* no room for directory info? return what we've stored so far */
-    if (maxlen < 5 || ! need_parent)
-        return 3 ;
-
-    spin_lock(&dentry->d_lock);
-    inode = dentry->d_parent->d_inode ;
-    data[3] = inode->i_ino ;
-    data[4] = le32_to_cpu(INODE_PKEY (inode)->k_dir_id) ;
-    *lenp = 5 ;
-    if (maxlen >= 6) {
-	    data[5] = inode->i_generation ;
-	    *lenp = 6 ;
-    }
-    spin_unlock(&dentry->d_lock);
-    return *lenp ;
+	if (fhtype >= 4) {
+		parent[0] = data[fhtype >= 5 ? 3 : 2];
+		parent[1] = data[fhtype >= 5 ? 4 : 3];
+		if (fhtype == 6)
+			parent[2] = data[5];
+		else
+			parent[2] = 0;
+	}
+	return sb->s_export_op->find_exported_dentry(sb, obj,
+						     fhtype < 4 ? NULL : parent,
+						     acceptable, context);
 }
 
+int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
+		       int need_parent)
+{
+	struct inode *inode = dentry->d_inode;
+	int maxlen = *lenp;
+
+	if (maxlen < 3)
+		return 255;
+
+	data[0] = inode->i_ino;
+	data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
+	data[2] = inode->i_generation;
+	*lenp = 3;
+	/* no room for directory info? return what we've stored so far */
+	if (maxlen < 5 || !need_parent)
+		return 3;
+
+	spin_lock(&dentry->d_lock);
+	inode = dentry->d_parent->d_inode;
+	data[3] = inode->i_ino;
+	data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
+	*lenp = 5;
+	if (maxlen >= 6) {
+		data[5] = inode->i_generation;
+		*lenp = 6;
+	}
+	spin_unlock(&dentry->d_lock);
+	return *lenp;
+}
 
 /* looks for stat data, then copies fields to it, marks the buffer
    containing stat data as dirty */
@@ -1554,120 +1638,127 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 *data, int *lenp, int need_p
 ** to properly mark inodes for datasync and such, but only actually
 ** does something when called for a synchronous update.
 */
-int reiserfs_write_inode (struct inode * inode, int do_sync) {
-    struct reiserfs_transaction_handle th ;
-    int jbegin_count = 1 ;
-
-    if (inode->i_sb->s_flags & MS_RDONLY)
-        return -EROFS;
-    /* memory pressure can sometimes initiate write_inode calls with sync == 1,
-    ** these cases are just when the system needs ram, not when the 
-    ** inode needs to reach disk for safety, and they can safely be
-    ** ignored because the altered inode has already been logged.
-    */
-    if (do_sync && !(current->flags & PF_MEMALLOC)) {
-	reiserfs_write_lock(inode->i_sb);
-	if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
-            reiserfs_update_sd (&th, inode);
-            journal_end_sync(&th, inode->i_sb, jbegin_count) ;
-        }
-	reiserfs_write_unlock(inode->i_sb);
-    }
-    return 0;
+int reiserfs_write_inode(struct inode *inode, int do_sync)
+{
+	struct reiserfs_transaction_handle th;
+	int jbegin_count = 1;
+
+	if (inode->i_sb->s_flags & MS_RDONLY)
+		return -EROFS;
+	/* memory pressure can sometimes initiate write_inode calls with sync == 1,
+	 ** these cases are just when the system needs ram, not when the 
+	 ** inode needs to reach disk for safety, and they can safely be
+	 ** ignored because the altered inode has already been logged.
+	 */
+	if (do_sync && !(current->flags & PF_MEMALLOC)) {
+		reiserfs_write_lock(inode->i_sb);
+		if (!journal_begin(&th, inode->i_sb, jbegin_count)) {
+			reiserfs_update_sd(&th, inode);
+			journal_end_sync(&th, inode->i_sb, jbegin_count);
+		}
+		reiserfs_write_unlock(inode->i_sb);
+	}
+	return 0;
 }
 
 /* stat data of new object is inserted already, this inserts the item
    containing "." and ".." entries */
-static int reiserfs_new_directory (struct reiserfs_transaction_handle *th, 
-				   struct inode *inode,
-				   struct item_head * ih, struct path * path,
-				   struct inode * dir)
+static int reiserfs_new_directory(struct reiserfs_transaction_handle *th,
+				  struct inode *inode,
+				  struct item_head *ih, struct path *path,
+				  struct inode *dir)
 {
-    struct super_block * sb = th->t_super;
-    char empty_dir [EMPTY_DIR_SIZE];
-    char * body = empty_dir;
-    struct cpu_key key;
-    int retval;
-
-    BUG_ON (!th->t_trans_id);
-    
-    _make_cpu_key (&key, KEY_FORMAT_3_5, le32_to_cpu (ih->ih_key.k_dir_id),
-		   le32_to_cpu (ih->ih_key.k_objectid), DOT_OFFSET, TYPE_DIRENTRY, 3/*key length*/);
-    
-    /* compose item head for new item. Directories consist of items of
-       old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
-       is done by reiserfs_new_inode */
-    if (old_format_only (sb)) {
-	make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
-	
-	make_empty_dir_item_v1 (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid,
-				INODE_PKEY (dir)->k_dir_id, 
-				INODE_PKEY (dir)->k_objectid );
-    } else {
-	make_le_item_head (ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
-	
-	make_empty_dir_item (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid,
-		   		INODE_PKEY (dir)->k_dir_id, 
-		   		INODE_PKEY (dir)->k_objectid );
-    }
-    
-    /* look for place in the tree for new item */
-    retval = search_item (sb, &key, path);
-    if (retval == IO_ERROR) {
-	reiserfs_warning (sb, "vs-13080: reiserfs_new_directory: "
-			  "i/o failure occurred creating new directory");
-	return -EIO;
-    }
-    if (retval == ITEM_FOUND) {
-	pathrelse (path);
-	reiserfs_warning (sb, "vs-13070: reiserfs_new_directory: "
-			  "object with this key exists (%k)", &(ih->ih_key));
-	return -EEXIST;
-    }
-
-    /* insert item, that is empty directory item */
-    return reiserfs_insert_item (th, path, &key, ih, inode, body);
-}
+	struct super_block *sb = th->t_super;
+	char empty_dir[EMPTY_DIR_SIZE];
+	char *body = empty_dir;
+	struct cpu_key key;
+	int retval;
+
+	BUG_ON(!th->t_trans_id);
+
+	_make_cpu_key(&key, KEY_FORMAT_3_5, le32_to_cpu(ih->ih_key.k_dir_id),
+		      le32_to_cpu(ih->ih_key.k_objectid), DOT_OFFSET,
+		      TYPE_DIRENTRY, 3 /*key length */ );
+
+	/* compose item head for new item. Directories consist of items of
+	   old type (ITEM_VERSION_1). Do not set key (second arg is 0), it
+	   is done by reiserfs_new_inode */
+	if (old_format_only(sb)) {
+		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
+				  TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2);
+
+		make_empty_dir_item_v1(body, ih->ih_key.k_dir_id,
+				       ih->ih_key.k_objectid,
+				       INODE_PKEY(dir)->k_dir_id,
+				       INODE_PKEY(dir)->k_objectid);
+	} else {
+		make_le_item_head(ih, NULL, KEY_FORMAT_3_5, DOT_OFFSET,
+				  TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2);
+
+		make_empty_dir_item(body, ih->ih_key.k_dir_id,
+				    ih->ih_key.k_objectid,
+				    INODE_PKEY(dir)->k_dir_id,
+				    INODE_PKEY(dir)->k_objectid);
+	}
+
+	/* look for place in the tree for new item */
+	retval = search_item(sb, &key, path);
+	if (retval == IO_ERROR) {
+		reiserfs_warning(sb, "vs-13080: reiserfs_new_directory: "
+				 "i/o failure occurred creating new directory");
+		return -EIO;
+	}
+	if (retval == ITEM_FOUND) {
+		pathrelse(path);
+		reiserfs_warning(sb, "vs-13070: reiserfs_new_directory: "
+				 "object with this key exists (%k)",
+				 &(ih->ih_key));
+		return -EEXIST;
+	}
 
+	/* insert item, that is empty directory item */
+	return reiserfs_insert_item(th, path, &key, ih, inode, body);
+}
 
 /* stat data of object has been inserted, this inserts the item
    containing the body of symlink */
-static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th, 
-				 struct inode *inode,	/* Inode of symlink */
-				 struct item_head * ih,
-				 struct path * path, const char * symname, int item_len)
+static int reiserfs_new_symlink(struct reiserfs_transaction_handle *th, struct inode *inode,	/* Inode of symlink */
+				struct item_head *ih,
+				struct path *path, const char *symname,
+				int item_len)
 {
-    struct super_block * sb = th->t_super;
-    struct cpu_key key;
-    int retval;
-
-    BUG_ON (!th->t_trans_id);
-
-    _make_cpu_key (&key, KEY_FORMAT_3_5, 
-		   le32_to_cpu (ih->ih_key.k_dir_id), 
-		   le32_to_cpu (ih->ih_key.k_objectid),
-		   1, TYPE_DIRECT, 3/*key length*/);
-
-    make_le_item_head (ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len, 0/*free_space*/);
-
-    /* look for place in the tree for new item */
-    retval = search_item (sb, &key, path);
-    if (retval == IO_ERROR) {
-	reiserfs_warning (sb, "vs-13080: reiserfs_new_symlinik: "
-			  "i/o failure occurred creating new symlink");
-	return -EIO;
-    }
-    if (retval == ITEM_FOUND) {
-	pathrelse (path);
-	reiserfs_warning (sb, "vs-13080: reiserfs_new_symlink: "
-			  "object with this key exists (%k)", &(ih->ih_key));
-	return -EEXIST;
-    }
-
-    /* insert item, that is body of symlink */
-    return reiserfs_insert_item (th, path, &key, ih, inode, symname);
-}
+	struct super_block *sb = th->t_super;
+	struct cpu_key key;
+	int retval;
+
+	BUG_ON(!th->t_trans_id);
+
+	_make_cpu_key(&key, KEY_FORMAT_3_5,
+		      le32_to_cpu(ih->ih_key.k_dir_id),
+		      le32_to_cpu(ih->ih_key.k_objectid),
+		      1, TYPE_DIRECT, 3 /*key length */ );
+
+	make_le_item_head(ih, NULL, KEY_FORMAT_3_5, 1, TYPE_DIRECT, item_len,
+			  0 /*free_space */ );
+
+	/* look for place in the tree for new item */
+	retval = search_item(sb, &key, path);
+	if (retval == IO_ERROR) {
+		reiserfs_warning(sb, "vs-13080: reiserfs_new_symlinik: "
+				 "i/o failure occurred creating new symlink");
+		return -EIO;
+	}
+	if (retval == ITEM_FOUND) {
+		pathrelse(path);
+		reiserfs_warning(sb, "vs-13080: reiserfs_new_symlink: "
+				 "object with this key exists (%k)",
+				 &(ih->ih_key));
+		return -EEXIST;
+	}
 
+	/* insert item, that is body of symlink */
+	return reiserfs_insert_item(th, path, &key, ih, inode, symname);
+}
 
 /* inserts the stat data into the tree, and then calls
    reiserfs_new_directory (to insert ".", ".." item if new object is
@@ -1678,213 +1769,219 @@ static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th,
    non-zero due to an error, we have to drop the quota previously allocated
    for the fresh inode.  This can only be done outside a transaction, so
    if we return non-zero, we also end the transaction.  */
-int reiserfs_new_inode (struct reiserfs_transaction_handle *th,
-			struct inode * dir, int mode, 
-			const char * symname, 
-                        /* 0 for regular, EMTRY_DIR_SIZE for dirs, 
-			   strlen (symname) for symlinks)*/
-		         loff_t i_size, struct dentry *dentry, 
-			 struct inode *inode)
+int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
+		       struct inode *dir, int mode, const char *symname,
+		       /* 0 for regular, EMTRY_DIR_SIZE for dirs, 
+		          strlen (symname) for symlinks) */
+		       loff_t i_size, struct dentry *dentry,
+		       struct inode *inode)
 {
-    struct super_block * sb;
-    INITIALIZE_PATH (path_to_key);
-    struct cpu_key key;
-    struct item_head ih;
-    struct stat_data sd;
-    int retval;
-    int err;
-
-    BUG_ON (!th->t_trans_id);
-  
-    if (DQUOT_ALLOC_INODE(inode)) {
-	err = -EDQUOT;
-	goto out_end_trans;
-    }
-    if (!dir || !dir->i_nlink) {
-	err = -EPERM;
-	goto out_bad_inode;
-    }
-
-    sb = dir->i_sb;
-
-    /* item head of new item */
-    ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
-    ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid (th));
-    if (!ih.ih_key.k_objectid) {
-	err = -ENOMEM;
-	goto out_bad_inode ;
-    }
-    if (old_format_only (sb))
-	/* not a perfect generation count, as object ids can be reused, but 
-	** this is as good as reiserfs can do right now.
-	** note that the private part of inode isn't filled in yet, we have
-	** to use the directory.
-	*/
-	inode->i_generation = le32_to_cpu (INODE_PKEY (dir)->k_objectid);
-    else
+	struct super_block *sb;
+	INITIALIZE_PATH(path_to_key);
+	struct cpu_key key;
+	struct item_head ih;
+	struct stat_data sd;
+	int retval;
+	int err;
+
+	BUG_ON(!th->t_trans_id);
+
+	if (DQUOT_ALLOC_INODE(inode)) {
+		err = -EDQUOT;
+		goto out_end_trans;
+	}
+	if (!dir || !dir->i_nlink) {
+		err = -EPERM;
+		goto out_bad_inode;
+	}
+
+	sb = dir->i_sb;
+
+	/* item head of new item */
+	ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
+	ih.ih_key.k_objectid = cpu_to_le32(reiserfs_get_unused_objectid(th));
+	if (!ih.ih_key.k_objectid) {
+		err = -ENOMEM;
+		goto out_bad_inode;
+	}
+	if (old_format_only(sb))
+		/* not a perfect generation count, as object ids can be reused, but 
+		 ** this is as good as reiserfs can do right now.
+		 ** note that the private part of inode isn't filled in yet, we have
+		 ** to use the directory.
+		 */
+		inode->i_generation = le32_to_cpu(INODE_PKEY(dir)->k_objectid);
+	else
 #if defined( USE_INODE_GENERATION_COUNTER )
-	inode->i_generation = le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
+		inode->i_generation =
+		    le32_to_cpu(REISERFS_SB(sb)->s_rs->s_inode_generation);
 #else
-	inode->i_generation = ++event;
+		inode->i_generation = ++event;
 #endif
 
-    /* fill stat data */
-    inode->i_nlink = (S_ISDIR (mode) ? 2 : 1);
-
-    /* uid and gid must already be set by the caller for quota init */
-
-    /* symlink cannot be immutable or append only, right? */
-    if( S_ISLNK( inode -> i_mode ) )
-	    inode -> i_flags &= ~ ( S_IMMUTABLE | S_APPEND );
-
-    inode->i_mtime = inode->i_atime = inode->i_ctime =
-	    CURRENT_TIME_SEC;
-    inode->i_size = i_size;
-    inode->i_blocks = 0;
-    inode->i_bytes = 0;
-    REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 : 
-      U32_MAX/*NO_BYTES_IN_DIRECT_ITEM*/;
-
-    INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list ));
-    REISERFS_I(inode)->i_flags = 0;
-    REISERFS_I(inode)->i_prealloc_block = 0;
-    REISERFS_I(inode)->i_prealloc_count = 0;
-    REISERFS_I(inode)->i_trans_id = 0;
-    REISERFS_I(inode)->i_jl = NULL;
-    REISERFS_I(inode)->i_attrs =
-	REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
-    sd_attrs_to_i_attrs( REISERFS_I(inode) -> i_attrs, inode );
-    REISERFS_I(inode)->i_acl_access = NULL;
-    REISERFS_I(inode)->i_acl_default = NULL;
-    init_rwsem (&REISERFS_I(inode)->xattr_sem);
-
-    if (old_format_only (sb))
-	make_le_item_head (&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
-    else
-	make_le_item_head (&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
-
-    /* key to search for correct place for new stat data */
-    _make_cpu_key (&key, KEY_FORMAT_3_6, le32_to_cpu (ih.ih_key.k_dir_id),
-		   le32_to_cpu (ih.ih_key.k_objectid), SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/);
-
-    /* find proper place for inserting of stat data */
-    retval = search_item (sb, &key, &path_to_key);
-    if (retval == IO_ERROR) {
-	err = -EIO;
-	goto out_bad_inode;
-    }
-    if (retval == ITEM_FOUND) {
-	pathrelse (&path_to_key);
-	err = -EEXIST;
-	goto out_bad_inode;
-    }
-    if (old_format_only (sb)) {
-	if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
-	    pathrelse (&path_to_key);
-	    /* i_uid or i_gid is too big to be stored in stat data v3.5 */
-	    err = -EINVAL;
-	    goto out_bad_inode;
-	}
-	inode2sd_v1 (&sd, inode, inode->i_size);
-    } else {
-	inode2sd (&sd, inode, inode->i_size);
-    }
-    // these do not go to on-disk stat data
-    inode->i_ino = le32_to_cpu (ih.ih_key.k_objectid);
-    inode->i_blksize = reiserfs_default_io_size;
-  
-    // store in in-core inode the key of stat data and version all
-    // object items will have (directory items will have old offset
-    // format, other new objects will consist of new items)
-    memcpy (INODE_PKEY (inode), &(ih.ih_key), KEY_SIZE);
-    if (old_format_only (sb) || S_ISDIR(mode) || S_ISLNK(mode))
-        set_inode_item_key_version (inode, KEY_FORMAT_3_5);
-    else
-        set_inode_item_key_version (inode, KEY_FORMAT_3_6);
-    if (old_format_only (sb))
-	set_inode_sd_version (inode, STAT_DATA_V1);
-    else
-	set_inode_sd_version (inode, STAT_DATA_V2);
-    
-    /* insert the stat data into the tree */
+	/* fill stat data */
+	inode->i_nlink = (S_ISDIR(mode) ? 2 : 1);
+
+	/* uid and gid must already be set by the caller for quota init */
+
+	/* symlink cannot be immutable or append only, right? */
+	if (S_ISLNK(inode->i_mode))
+		inode->i_flags &= ~(S_IMMUTABLE | S_APPEND);
+
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+	inode->i_size = i_size;
+	inode->i_blocks = 0;
+	inode->i_bytes = 0;
+	REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 1 :
+	    U32_MAX /*NO_BYTES_IN_DIRECT_ITEM */ ;
+
+	INIT_LIST_HEAD(&(REISERFS_I(inode)->i_prealloc_list));
+	REISERFS_I(inode)->i_flags = 0;
+	REISERFS_I(inode)->i_prealloc_block = 0;
+	REISERFS_I(inode)->i_prealloc_count = 0;
+	REISERFS_I(inode)->i_trans_id = 0;
+	REISERFS_I(inode)->i_jl = NULL;
+	REISERFS_I(inode)->i_attrs =
+	    REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
+	sd_attrs_to_i_attrs(REISERFS_I(inode)->i_attrs, inode);
+	REISERFS_I(inode)->i_acl_access = NULL;
+	REISERFS_I(inode)->i_acl_default = NULL;
+	init_rwsem(&REISERFS_I(inode)->xattr_sem);
+
+	if (old_format_only(sb))
+		make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
+				  TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
+	else
+		make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
+				  TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
+
+	/* key to search for correct place for new stat data */
+	_make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
+		      le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
+		      TYPE_STAT_DATA, 3 /*key length */ );
+
+	/* find proper place for inserting of stat data */
+	retval = search_item(sb, &key, &path_to_key);
+	if (retval == IO_ERROR) {
+		err = -EIO;
+		goto out_bad_inode;
+	}
+	if (retval == ITEM_FOUND) {
+		pathrelse(&path_to_key);
+		err = -EEXIST;
+		goto out_bad_inode;
+	}
+	if (old_format_only(sb)) {
+		if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
+			pathrelse(&path_to_key);
+			/* i_uid or i_gid is too big to be stored in stat data v3.5 */
+			err = -EINVAL;
+			goto out_bad_inode;
+		}
+		inode2sd_v1(&sd, inode, inode->i_size);
+	} else {
+		inode2sd(&sd, inode, inode->i_size);
+	}
+	// these do not go to on-disk stat data
+	inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
+	inode->i_blksize = reiserfs_default_io_size;
+
+	// store in in-core inode the key of stat data and version all
+	// object items will have (directory items will have old offset
+	// format, other new objects will consist of new items)
+	memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
+	if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
+		set_inode_item_key_version(inode, KEY_FORMAT_3_5);
+	else
+		set_inode_item_key_version(inode, KEY_FORMAT_3_6);
+	if (old_format_only(sb))
+		set_inode_sd_version(inode, STAT_DATA_V1);
+	else
+		set_inode_sd_version(inode, STAT_DATA_V2);
+
+	/* insert the stat data into the tree */
 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
-    if (REISERFS_I(dir)->new_packing_locality)
-	th->displace_new_blocks = 1;
+	if (REISERFS_I(dir)->new_packing_locality)
+		th->displace_new_blocks = 1;
 #endif
-    retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, inode, (char *)(&sd));
-    if (retval) {
-	err = retval;
-	reiserfs_check_path(&path_to_key) ;
-	goto out_bad_inode;
-    }
-
+	retval =
+	    reiserfs_insert_item(th, &path_to_key, &key, &ih, inode,
+				 (char *)(&sd));
+	if (retval) {
+		err = retval;
+		reiserfs_check_path(&path_to_key);
+		goto out_bad_inode;
+	}
 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
-    if (!th->displace_new_blocks)
-	REISERFS_I(dir)->new_packing_locality = 0;
+	if (!th->displace_new_blocks)
+		REISERFS_I(dir)->new_packing_locality = 0;
 #endif
-    if (S_ISDIR(mode)) {
-	/* insert item with "." and ".." */
-	retval = reiserfs_new_directory (th, inode, &ih, &path_to_key, dir);
-    }
-
-    if (S_ISLNK(mode)) {
-	/* insert body of symlink */
-	if (!old_format_only (sb))
-	    i_size = ROUND_UP(i_size);
-	retval = reiserfs_new_symlink (th, inode, &ih, &path_to_key, symname, i_size);
-    }
-    if (retval) {
-	err = retval;
-	reiserfs_check_path(&path_to_key) ;
-	journal_end(th, th->t_super, th->t_blocks_allocated);
-	goto out_inserted_sd;
-    }
-
-    /* XXX CHECK THIS */
-    if (reiserfs_posixacl (inode->i_sb)) {
-        retval = reiserfs_inherit_default_acl (dir, dentry, inode);
-        if (retval) {
-            err = retval;
-            reiserfs_check_path(&path_to_key) ;
-            journal_end(th, th->t_super, th->t_blocks_allocated);
-            goto out_inserted_sd;
-        }
-    } else if (inode->i_sb->s_flags & MS_POSIXACL) {
-	reiserfs_warning (inode->i_sb, "ACLs aren't enabled in the fs, "
-			  "but vfs thinks they are!");
-    } else if (is_reiserfs_priv_object (dir)) {
-	reiserfs_mark_inode_private (inode);
-    }
-
-    insert_inode_hash (inode);
-    reiserfs_update_sd(th, inode);
-    reiserfs_check_path(&path_to_key) ;
-
-    return 0;
+	if (S_ISDIR(mode)) {
+		/* insert item with "." and ".." */
+		retval =
+		    reiserfs_new_directory(th, inode, &ih, &path_to_key, dir);
+	}
+
+	if (S_ISLNK(mode)) {
+		/* insert body of symlink */
+		if (!old_format_only(sb))
+			i_size = ROUND_UP(i_size);
+		retval =
+		    reiserfs_new_symlink(th, inode, &ih, &path_to_key, symname,
+					 i_size);
+	}
+	if (retval) {
+		err = retval;
+		reiserfs_check_path(&path_to_key);
+		journal_end(th, th->t_super, th->t_blocks_allocated);
+		goto out_inserted_sd;
+	}
+
+	/* XXX CHECK THIS */
+	if (reiserfs_posixacl(inode->i_sb)) {
+		retval = reiserfs_inherit_default_acl(dir, dentry, inode);
+		if (retval) {
+			err = retval;
+			reiserfs_check_path(&path_to_key);
+			journal_end(th, th->t_super, th->t_blocks_allocated);
+			goto out_inserted_sd;
+		}
+	} else if (inode->i_sb->s_flags & MS_POSIXACL) {
+		reiserfs_warning(inode->i_sb, "ACLs aren't enabled in the fs, "
+				 "but vfs thinks they are!");
+	} else if (is_reiserfs_priv_object(dir)) {
+		reiserfs_mark_inode_private(inode);
+	}
+
+	insert_inode_hash(inode);
+	reiserfs_update_sd(th, inode);
+	reiserfs_check_path(&path_to_key);
+
+	return 0;
 
 /* it looks like you can easily compress these two goto targets into
  * one.  Keeping it like this doesn't actually hurt anything, and they
  * are place holders for what the quota code actually needs.
  */
-out_bad_inode:
-    /* Invalidate the object, nothing was inserted yet */
-    INODE_PKEY(inode)->k_objectid = 0;
-
-    /* Quota change must be inside a transaction for journaling */
-    DQUOT_FREE_INODE(inode);
-
-out_end_trans:
-    journal_end(th, th->t_super, th->t_blocks_allocated) ;
-    /* Drop can be outside and it needs more credits so it's better to have it outside */
-    DQUOT_DROP(inode);
-    inode->i_flags |= S_NOQUOTA;
-    make_bad_inode(inode);
-
-out_inserted_sd:
-    inode->i_nlink = 0;
-    th->t_trans_id = 0; /* so the caller can't use this handle later */
-    iput(inode);
-    return err;
+      out_bad_inode:
+	/* Invalidate the object, nothing was inserted yet */
+	INODE_PKEY(inode)->k_objectid = 0;
+
+	/* Quota change must be inside a transaction for journaling */
+	DQUOT_FREE_INODE(inode);
+
+      out_end_trans:
+	journal_end(th, th->t_super, th->t_blocks_allocated);
+	/* Drop can be outside and it needs more credits so it's better to have it outside */
+	DQUOT_DROP(inode);
+	inode->i_flags |= S_NOQUOTA;
+	make_bad_inode(inode);
+
+      out_inserted_sd:
+	inode->i_nlink = 0;
+	th->t_trans_id = 0;	/* so the caller can't use this handle later */
+	iput(inode);
+	return err;
 }
 
 /*
@@ -1900,77 +1997,78 @@ out_inserted_sd:
 **
 ** on failure, nonzero is returned, page_result and bh_result are untouched.
 */
-static int grab_tail_page(struct inode *p_s_inode, 
-			  struct page **page_result, 
-			  struct buffer_head **bh_result) {
-
-    /* we want the page with the last byte in the file,
-    ** not the page that will hold the next byte for appending
-    */
-    unsigned long index = (p_s_inode->i_size-1) >> PAGE_CACHE_SHIFT ;
-    unsigned long pos = 0 ;
-    unsigned long start = 0 ;
-    unsigned long blocksize = p_s_inode->i_sb->s_blocksize ;
-    unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1) ;
-    struct buffer_head *bh ;
-    struct buffer_head *head ;
-    struct page * page ;
-    int error ;
-    
-    /* we know that we are only called with inode->i_size > 0.
-    ** we also know that a file tail can never be as big as a block
-    ** If i_size % blocksize == 0, our file is currently block aligned
-    ** and it won't need converting or zeroing after a truncate.
-    */
-    if ((offset & (blocksize - 1)) == 0) {
-        return -ENOENT ;
-    }
-    page = grab_cache_page(p_s_inode->i_mapping, index) ;
-    error = -ENOMEM ;
-    if (!page) {
-        goto out ;
-    }
-    /* start within the page of the last block in the file */
-    start = (offset / blocksize) * blocksize ;
-
-    error = block_prepare_write(page, start, offset, 
-				reiserfs_get_block_create_0) ;
-    if (error)
-	goto unlock ;
-
-    head = page_buffers(page) ;      
-    bh = head;
-    do {
-	if (pos >= start) {
-	    break ;
-	}
-	bh = bh->b_this_page ;
-	pos += blocksize ;
-    } while(bh != head) ;
-
-    if (!buffer_uptodate(bh)) {
-	/* note, this should never happen, prepare_write should
-	** be taking care of this for us.  If the buffer isn't up to date,
-	** I've screwed up the code to find the buffer, or the code to
-	** call prepare_write
-	*/
-	reiserfs_warning (p_s_inode->i_sb,
-			  "clm-6000: error reading block %lu on dev %s",
-			  bh->b_blocknr,
-			  reiserfs_bdevname (p_s_inode->i_sb)) ;
-	error = -EIO ;
-	goto unlock ;
-    }
-    *bh_result = bh ;
-    *page_result = page ;
-
-out:
-    return error ;
-
-unlock:
-    unlock_page(page) ;
-    page_cache_release(page) ;
-    return error ;
+static int grab_tail_page(struct inode *p_s_inode,
+			  struct page **page_result,
+			  struct buffer_head **bh_result)
+{
+
+	/* we want the page with the last byte in the file,
+	 ** not the page that will hold the next byte for appending
+	 */
+	unsigned long index = (p_s_inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+	unsigned long pos = 0;
+	unsigned long start = 0;
+	unsigned long blocksize = p_s_inode->i_sb->s_blocksize;
+	unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1);
+	struct buffer_head *bh;
+	struct buffer_head *head;
+	struct page *page;
+	int error;
+
+	/* we know that we are only called with inode->i_size > 0.
+	 ** we also know that a file tail can never be as big as a block
+	 ** If i_size % blocksize == 0, our file is currently block aligned
+	 ** and it won't need converting or zeroing after a truncate.
+	 */
+	if ((offset & (blocksize - 1)) == 0) {
+		return -ENOENT;
+	}
+	page = grab_cache_page(p_s_inode->i_mapping, index);
+	error = -ENOMEM;
+	if (!page) {
+		goto out;
+	}
+	/* start within the page of the last block in the file */
+	start = (offset / blocksize) * blocksize;
+
+	error = block_prepare_write(page, start, offset,
+				    reiserfs_get_block_create_0);
+	if (error)
+		goto unlock;
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		if (pos >= start) {
+			break;
+		}
+		bh = bh->b_this_page;
+		pos += blocksize;
+	} while (bh != head);
+
+	if (!buffer_uptodate(bh)) {
+		/* note, this should never happen, prepare_write should
+		 ** be taking care of this for us.  If the buffer isn't up to date,
+		 ** I've screwed up the code to find the buffer, or the code to
+		 ** call prepare_write
+		 */
+		reiserfs_warning(p_s_inode->i_sb,
+				 "clm-6000: error reading block %lu on dev %s",
+				 bh->b_blocknr,
+				 reiserfs_bdevname(p_s_inode->i_sb));
+		error = -EIO;
+		goto unlock;
+	}
+	*bh_result = bh;
+	*page_result = page;
+
+      out:
+	return error;
+
+      unlock:
+	unlock_page(page);
+	page_cache_release(page);
+	return error;
 }
 
 /*
@@ -1979,235 +2077,247 @@ unlock:
 **
 ** some code taken from block_truncate_page
 */
-int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) {
-    struct reiserfs_transaction_handle th ;
-    /* we want the offset for the first byte after the end of the file */
-    unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1) ;
-    unsigned blocksize = p_s_inode->i_sb->s_blocksize ;
-    unsigned length ;
-    struct page *page = NULL ;
-    int error ;
-    struct buffer_head *bh = NULL ;
-
-    reiserfs_write_lock(p_s_inode->i_sb);
-
-    if (p_s_inode->i_size > 0) {
-        if ((error = grab_tail_page(p_s_inode, &page, &bh))) {
-	    // -ENOENT means we truncated past the end of the file, 
-	    // and get_block_create_0 could not find a block to read in,
-	    // which is ok.
-	    if (error != -ENOENT)
-	        reiserfs_warning (p_s_inode->i_sb,
-				  "clm-6001: grab_tail_page failed %d",
-				  error);
-	    page = NULL ;
-	    bh = NULL ;
-	}
-    }
-
-    /* so, if page != NULL, we have a buffer head for the offset at 
-    ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0, 
-    ** then we have an unformatted node.  Otherwise, we have a direct item, 
-    ** and no zeroing is required on disk.  We zero after the truncate, 
-    ** because the truncate might pack the item anyway 
-    ** (it will unmap bh if it packs).
-    */
-    /* it is enough to reserve space in transaction for 2 balancings:
-       one for "save" link adding and another for the first
-       cut_from_item. 1 is for update_sd */
-    error = journal_begin (&th, p_s_inode->i_sb,
-                           JOURNAL_PER_BALANCE_CNT * 2 + 1);
-    if (error)
-        goto out;
-    reiserfs_update_inode_transaction(p_s_inode) ;
-    if (update_timestamps)
-	    /* we are doing real truncate: if the system crashes before the last
-	       transaction of truncating gets committed - on reboot the file
-	       either appears truncated properly or not truncated at all */
-	add_save_link (&th, p_s_inode, 1);
-    error = reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ;
-    if (error)
-        goto out;
-    error = journal_end (&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
-    if (error)
-        goto out;
-
-    if (update_timestamps) {
-	error = remove_save_link (p_s_inode, 1/* truncate */);
-        if (error)
-            goto out;
-    }
-
-    if (page) {
-        length = offset & (blocksize - 1) ;
-	/* if we are not on a block boundary */
-	if (length) {
-	    char *kaddr;
-
-	    length = blocksize - length ;
-	    kaddr = kmap_atomic(page, KM_USER0) ;
-	    memset(kaddr + offset, 0, length) ;   
-	    flush_dcache_page(page) ;
-	    kunmap_atomic(kaddr, KM_USER0) ;
-	    if (buffer_mapped(bh) && bh->b_blocknr != 0) {
-	        mark_buffer_dirty(bh) ;
-	    }
-	}
-	unlock_page(page) ;
-	page_cache_release(page) ;
-    }
-
-    reiserfs_write_unlock(p_s_inode->i_sb);
-    return 0;
-out:
-    if (page) {
-        unlock_page (page);
-        page_cache_release (page);
-    }
-    reiserfs_write_unlock(p_s_inode->i_sb);
-    return error;
-}
+int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps)
+{
+	struct reiserfs_transaction_handle th;
+	/* we want the offset for the first byte after the end of the file */
+	unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1);
+	unsigned blocksize = p_s_inode->i_sb->s_blocksize;
+	unsigned length;
+	struct page *page = NULL;
+	int error;
+	struct buffer_head *bh = NULL;
+
+	reiserfs_write_lock(p_s_inode->i_sb);
+
+	if (p_s_inode->i_size > 0) {
+		if ((error = grab_tail_page(p_s_inode, &page, &bh))) {
+			// -ENOENT means we truncated past the end of the file, 
+			// and get_block_create_0 could not find a block to read in,
+			// which is ok.
+			if (error != -ENOENT)
+				reiserfs_warning(p_s_inode->i_sb,
+						 "clm-6001: grab_tail_page failed %d",
+						 error);
+			page = NULL;
+			bh = NULL;
+		}
+	}
 
-static int map_block_for_writepage(struct inode *inode, 
-			       struct buffer_head *bh_result, 
-                               unsigned long block) {
-    struct reiserfs_transaction_handle th ;
-    int fs_gen ;
-    struct item_head tmp_ih ;
-    struct item_head *ih ;
-    struct buffer_head *bh ;
-    __le32 *item ;
-    struct cpu_key key ;
-    INITIALIZE_PATH(path) ;
-    int pos_in_item ;
-    int jbegin_count = JOURNAL_PER_BALANCE_CNT ;
-    loff_t byte_offset = (block << inode->i_sb->s_blocksize_bits) + 1 ;
-    int retval ;
-    int use_get_block = 0 ;
-    int bytes_copied = 0 ;
-    int copy_size ;
-    int trans_running = 0;
-
-    /* catch places below that try to log something without starting a trans */
-    th.t_trans_id = 0;
-
-    if (!buffer_uptodate(bh_result)) {
-	return -EIO;
-    }
-
-    kmap(bh_result->b_page) ;
-start_over:
-    reiserfs_write_lock(inode->i_sb);
-    make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3) ;
-
-research:
-    retval = search_for_position_by_key(inode->i_sb, &key, &path) ;
-    if (retval != POSITION_FOUND) {
-        use_get_block = 1;
-	goto out ;
-    } 
-
-    bh = get_last_bh(&path) ;
-    ih = get_ih(&path) ;
-    item = get_item(&path) ;
-    pos_in_item = path.pos_in_item ;
-
-    /* we've found an unformatted node */
-    if (indirect_item_found(retval, ih)) {
-	if (bytes_copied > 0) {
-	    reiserfs_warning (inode->i_sb, "clm-6002: bytes_copied %d",
-			      bytes_copied) ;
-	}
-        if (!get_block_num(item, pos_in_item)) {
-	    /* crap, we are writing to a hole */
-	    use_get_block = 1;
-	    goto out ;
-	}
-	set_block_dev_mapped(bh_result, get_block_num(item,pos_in_item),inode);
-    } else if (is_direct_le_ih(ih)) {
-        char *p ; 
-        p = page_address(bh_result->b_page) ;
-        p += (byte_offset -1) & (PAGE_CACHE_SIZE - 1) ;
-        copy_size = ih_item_len(ih) - pos_in_item;
-
-	fs_gen = get_generation(inode->i_sb) ;
-	copy_item_head(&tmp_ih, ih) ;
-
-	if (!trans_running) {
-	    /* vs-3050 is gone, no need to drop the path */
-	    retval = journal_begin(&th, inode->i_sb, jbegin_count) ;
-            if (retval)
-                goto out;
-	    reiserfs_update_inode_transaction(inode) ;
-	    trans_running = 1;
-	    if (fs_changed(fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) {
-		reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
-		goto research;
-	    }
-	}
-
-	reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ;
-
-	if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) {
-	    reiserfs_restore_prepared_buffer(inode->i_sb, bh) ;
-	    goto research;
-	}
-
-	memcpy( B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, copy_size) ;
-
-	journal_mark_dirty(&th, inode->i_sb, bh) ;
-	bytes_copied += copy_size ;
-	set_block_dev_mapped(bh_result, 0, inode);
-
-	/* are there still bytes left? */
-        if (bytes_copied < bh_result->b_size && 
-	    (byte_offset + bytes_copied) < inode->i_size) {
-	    set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + copy_size) ;
-	    goto research ;
-	}
-    } else {
-        reiserfs_warning (inode->i_sb,
-			  "clm-6003: bad item inode %lu, device %s",
-			  inode->i_ino, reiserfs_bdevname (inode->i_sb)) ;
-        retval = -EIO ;
-	goto out ;
-    }
-    retval = 0 ;
-    
-out:
-    pathrelse(&path) ;
-    if (trans_running) {
-        int err = journal_end(&th, inode->i_sb, jbegin_count) ;
-        if (err)
-            retval = err;
-	trans_running = 0;
-    }
-    reiserfs_write_unlock(inode->i_sb);
-
-    /* this is where we fill in holes in the file. */
-    if (use_get_block) {
-	retval = reiserfs_get_block(inode, block, bh_result, 
-	                            GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM |
-				    GET_BLOCK_NO_DANGLE);
-	if (!retval) {
-	    if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) {
-	        /* get_block failed to find a mapped unformatted node. */
-		use_get_block = 0 ;
-		goto start_over ;
-	    }
-	}
-    }
-    kunmap(bh_result->b_page) ;
-
-    if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
-	/* we've copied data from the page into the direct item, so the
-	 * buffer in the page is now clean, mark it to reflect that.
+	/* so, if page != NULL, we have a buffer head for the offset at 
+	 ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0, 
+	 ** then we have an unformatted node.  Otherwise, we have a direct item, 
+	 ** and no zeroing is required on disk.  We zero after the truncate, 
+	 ** because the truncate might pack the item anyway 
+	 ** (it will unmap bh if it packs).
 	 */
-        lock_buffer(bh_result);
-	clear_buffer_dirty(bh_result);
-	unlock_buffer(bh_result);
-    }
-    return retval ;
+	/* it is enough to reserve space in transaction for 2 balancings:
+	   one for "save" link adding and another for the first
+	   cut_from_item. 1 is for update_sd */
+	error = journal_begin(&th, p_s_inode->i_sb,
+			      JOURNAL_PER_BALANCE_CNT * 2 + 1);
+	if (error)
+		goto out;
+	reiserfs_update_inode_transaction(p_s_inode);
+	if (update_timestamps)
+		/* we are doing real truncate: if the system crashes before the last
+		   transaction of truncating gets committed - on reboot the file
+		   either appears truncated properly or not truncated at all */
+		add_save_link(&th, p_s_inode, 1);
+	error = reiserfs_do_truncate(&th, p_s_inode, page, update_timestamps);
+	if (error)
+		goto out;
+	error =
+	    journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1);
+	if (error)
+		goto out;
+
+	if (update_timestamps) {
+		error = remove_save_link(p_s_inode, 1 /* truncate */ );
+		if (error)
+			goto out;
+	}
+
+	if (page) {
+		length = offset & (blocksize - 1);
+		/* if we are not on a block boundary */
+		if (length) {
+			char *kaddr;
+
+			length = blocksize - length;
+			kaddr = kmap_atomic(page, KM_USER0);
+			memset(kaddr + offset, 0, length);
+			flush_dcache_page(page);
+			kunmap_atomic(kaddr, KM_USER0);
+			if (buffer_mapped(bh) && bh->b_blocknr != 0) {
+				mark_buffer_dirty(bh);
+			}
+		}
+		unlock_page(page);
+		page_cache_release(page);
+	}
+
+	reiserfs_write_unlock(p_s_inode->i_sb);
+	return 0;
+      out:
+	if (page) {
+		unlock_page(page);
+		page_cache_release(page);
+	}
+	reiserfs_write_unlock(p_s_inode->i_sb);
+	return error;
+}
+
+/* Map bh_result (file block "block") for writepage.  An indirect item maps it
+ * to its unformatted node; a direct item gets the page data copied into it
+ * under a transaction; holes go through reiserfs_get_block().  0 or -errno.
+ */
+static int map_block_for_writepage(struct inode *inode,
+				   struct buffer_head *bh_result,
+				   unsigned long block)
+{
+	struct reiserfs_transaction_handle th;
+	int fs_gen;
+	struct item_head tmp_ih;
+	struct item_head *ih;
+	struct buffer_head *bh;
+	__le32 *item;
+	struct cpu_key key;
+	INITIALIZE_PATH(path);
+	int pos_in_item;
+	int jbegin_count = JOURNAL_PER_BALANCE_CNT;
+	/* widen first: shifting an unsigned long block wraps on 32-bit */
+	loff_t byte_offset = ((loff_t) block << inode->i_sb->s_blocksize_bits) + 1;
+	int retval;
+	int use_get_block = 0;
+	int bytes_copied = 0;
+	int copy_size;
+	int trans_running = 0;
+
+	/* catch places below that try to log something without starting a trans */
+	th.t_trans_id = 0;
+
+	if (!buffer_uptodate(bh_result))
+		return -EIO;
+
+	kmap(bh_result->b_page);
+      start_over:
+	reiserfs_write_lock(inode->i_sb);
+	make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3);
+
+      research:
+	retval = search_for_position_by_key(inode->i_sb, &key, &path);
+	if (retval != POSITION_FOUND) {
+		use_get_block = 1;
+		goto out;
+	}
+
+	bh = get_last_bh(&path);
+	ih = get_ih(&path);
+	item = get_item(&path);
+	pos_in_item = path.pos_in_item;
+
+	/* we've found an unformatted node */
+	if (indirect_item_found(retval, ih)) {
+		if (bytes_copied > 0)
+			reiserfs_warning(inode->i_sb,
+					 "clm-6002: bytes_copied %d",
+					 bytes_copied);
+		if (!get_block_num(item, pos_in_item)) {
+			/* crap, we are writing to a hole */
+			use_get_block = 1;
+			goto out;
+		}
+		set_block_dev_mapped(bh_result,
+				     get_block_num(item, pos_in_item), inode);
+	} else if (is_direct_le_ih(ih)) {
+		char *p;
+		p = page_address(bh_result->b_page);
+		p += (byte_offset - 1) & (PAGE_CACHE_SIZE - 1);
+		copy_size = ih_item_len(ih) - pos_in_item;
+
+		fs_gen = get_generation(inode->i_sb);
+		copy_item_head(&tmp_ih, ih);
+
+		if (!trans_running) {
+			/* vs-3050 is gone, no need to drop the path */
+			retval = journal_begin(&th, inode->i_sb, jbegin_count);
+			if (retval)
+				goto out;
+			reiserfs_update_inode_transaction(inode);
+			trans_running = 1;
+			if (fs_changed(fs_gen, inode->i_sb)
+			    && item_moved(&tmp_ih, &path)) {
+				reiserfs_restore_prepared_buffer(inode->i_sb,
+								 bh);
+				goto research;
+			}
+		}
+
+		reiserfs_prepare_for_journal(inode->i_sb, bh, 1);
+
+		if (fs_changed(fs_gen, inode->i_sb)
+		    && item_moved(&tmp_ih, &path)) {
+			reiserfs_restore_prepared_buffer(inode->i_sb, bh);
+			goto research;
+		}
+
+		memcpy(B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied,
+		       copy_size);
+
+		journal_mark_dirty(&th, inode->i_sb, bh);
+		bytes_copied += copy_size;
+		set_block_dev_mapped(bh_result, 0, inode);
+
+		/* are there still bytes left? */
+		if (bytes_copied < bh_result->b_size &&
+		    (byte_offset + bytes_copied) < inode->i_size) {
+			set_cpu_key_k_offset(&key,
+					     cpu_key_k_offset(&key) + copy_size);
+			goto research;
+		}
+	} else {
+		reiserfs_warning(inode->i_sb,
+				 "clm-6003: bad item inode %lu, device %s",
+				 inode->i_ino, reiserfs_bdevname(inode->i_sb));
+		retval = -EIO;
+		goto out;
+	}
+	retval = 0;
+
+      out:
+	pathrelse(&path);
+	if (trans_running) {
+		int err = journal_end(&th, inode->i_sb, jbegin_count);
+		if (err)
+			retval = err;
+		trans_running = 0;
+	}
+	reiserfs_write_unlock(inode->i_sb);
+
+	/* this is where we fill in holes in the file. */
+	if (use_get_block) {
+		retval = reiserfs_get_block(inode, block, bh_result,
+					    GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM
+					    | GET_BLOCK_NO_DANGLE);
+		if (!retval && (!buffer_mapped(bh_result)
+				|| bh_result->b_blocknr == 0)) {
+			/* get_block failed to find a mapped unformatted node. */
+			use_get_block = 0;
+			goto start_over;
+		}
+	}
+	kunmap(bh_result->b_page);
+
+	if (!retval && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
+		/* we've copied data from the page into the direct item, so the
+		 * buffer in the page is now clean, mark it to reflect that.
+		 */
+		lock_buffer(bh_result);
+		clear_buffer_dirty(bh_result);
+		unlock_buffer(bh_result);
+	}
+	return retval;
 }
 
 /* 
@@ -2215,383 +2325,390 @@ out:
  * start/recovery path as __block_write_full_page, along with special
  * code to handle reiserfs tails.
  */
-static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) {
-    struct inode *inode = page->mapping->host ;
-    unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ;
-    int error = 0;
-    unsigned long block ;
-    struct buffer_head *head, *bh;
-    int partial = 0 ;
-    int nr = 0;
-    int checked = PageChecked(page);
-    struct reiserfs_transaction_handle th;
-    struct super_block *s = inode->i_sb;
-    int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
-    th.t_trans_id = 0;
-
-    /* The page dirty bit is cleared before writepage is called, which
-     * means we have to tell create_empty_buffers to make dirty buffers
-     * The page really should be up to date at this point, so tossing
-     * in the BH_Uptodate is just a sanity check.
-     */
-    if (!page_has_buffers(page)) {
-	create_empty_buffers(page, s->s_blocksize,
-	                    (1 << BH_Dirty) | (1 << BH_Uptodate));
-    }
-    head = page_buffers(page) ;
-
-    /* last page in the file, zero out any contents past the
-    ** last byte in the file
-    */
-    if (page->index >= end_index) {
-	char *kaddr;
-	unsigned last_offset;
-
-        last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1) ;
-	/* no file contents in this page */
-	if (page->index >= end_index + 1 || !last_offset) {
-    	    unlock_page(page);
-	    return 0;
-	}
-	kaddr = kmap_atomic(page, KM_USER0);
-	memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE-last_offset) ;
-	flush_dcache_page(page) ;
-	kunmap_atomic(kaddr, KM_USER0) ;
-    }
-    bh = head ;
-    block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits) ;
-    /* first map all the buffers, logging any direct items we find */
-    do {
-	if ((checked || buffer_dirty(bh)) && (!buffer_mapped(bh) ||
-	   (buffer_mapped(bh) && bh->b_blocknr == 0))) {
-	    /* not mapped yet, or it points to a direct item, search
-	     * the btree for the mapping info, and log any direct
-	     * items found
-	     */
-	    if ((error = map_block_for_writepage(inode, bh, block))) {
-		goto fail ;
-	    }
-	}
-        bh = bh->b_this_page;
-	block++;
-    } while(bh != head) ;
-
-    /*
-     * we start the transaction after map_block_for_writepage,
-     * because it can create holes in the file (an unbounded operation).
-     * starting it here, we can make a reliable estimate for how many
-     * blocks we're going to log
-     */
-    if (checked) {
-	ClearPageChecked(page);
-	reiserfs_write_lock(s);
-	error = journal_begin(&th, s, bh_per_page + 1);
-	if (error) {
-	    reiserfs_write_unlock(s);
-	    goto fail;
-	}
-	reiserfs_update_inode_transaction(inode);
-    }
-    /* now go through and lock any dirty buffers on the page */
-    do {
-	get_bh(bh);
-	if (!buffer_mapped(bh))
-	    continue;
-	if (buffer_mapped(bh) && bh->b_blocknr == 0)
-	    continue;
+static int reiserfs_write_full_page(struct page *page,
+				    struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
+	int error = 0;
+	unsigned long block;
+	struct buffer_head *head, *bh;
+	int partial = 0;
+	int nr = 0;
+	int checked = PageChecked(page);
+	struct reiserfs_transaction_handle th;
+	struct super_block *s = inode->i_sb;
+	int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize;
+	th.t_trans_id = 0;
+
+	/* The page dirty bit is cleared before writepage is called, which
+	 * means we have to tell create_empty_buffers to make dirty buffers
+	 * The page really should be up to date at this point, so tossing
+	 * in the BH_Uptodate is just a sanity check.
+	 */
+	if (!page_has_buffers(page)) {
+		create_empty_buffers(page, s->s_blocksize,
+				     (1 << BH_Dirty) | (1 << BH_Uptodate));
+	}
+	head = page_buffers(page);
 
-	if (checked) {
-	    reiserfs_prepare_for_journal(s, bh, 1);
-	    journal_mark_dirty(&th, s, bh);
-	    continue;
+	/* last page in the file, zero out any contents past the
+	 ** last byte in the file
+	 */
+	if (page->index >= end_index) {
+		char *kaddr;
+		unsigned last_offset;
+
+		last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1);
+		/* no file contents in this page */
+		if (page->index >= end_index + 1 || !last_offset) {
+			unlock_page(page);
+			return 0;
+		}
+		kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr + last_offset, 0, PAGE_CACHE_SIZE - last_offset);
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr, KM_USER0);
 	}
-	/* from this point on, we know the buffer is mapped to a
-	 * real block and not a direct item
+	bh = head;
+	block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits);
+	/* first map all the buffers, logging any direct items we find */
+	do {
+		if ((checked || buffer_dirty(bh)) && (!buffer_mapped(bh) ||
+						      (buffer_mapped(bh)
+						       && bh->b_blocknr ==
+						       0))) {
+			/* not mapped yet, or it points to a direct item, search
+			 * the btree for the mapping info, and log any direct
+			 * items found
+			 */
+			if ((error = map_block_for_writepage(inode, bh, block))) {
+				goto fail;
+			}
+		}
+		bh = bh->b_this_page;
+		block++;
+	} while (bh != head);
+
+	/*
+	 * we start the transaction after map_block_for_writepage,
+	 * because it can create holes in the file (an unbounded operation).
+	 * starting it here, we can make a reliable estimate for how many
+	 * blocks we're going to log
 	 */
-	if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
-	    lock_buffer(bh);
-	} else {
-	    if (test_set_buffer_locked(bh)) {
-		redirty_page_for_writepage(wbc, page);
-		continue;
-	    }
+	if (checked) {
+		ClearPageChecked(page);
+		reiserfs_write_lock(s);
+		error = journal_begin(&th, s, bh_per_page + 1);
+		if (error) {
+			reiserfs_write_unlock(s);
+			goto fail;
+		}
+		reiserfs_update_inode_transaction(inode);
 	}
-	if (test_clear_buffer_dirty(bh)) {
-	    mark_buffer_async_write(bh);
-	} else {
-	    unlock_buffer(bh);
+	/* now go through and lock any dirty buffers on the page */
+	do {
+		get_bh(bh);
+		if (!buffer_mapped(bh))
+			continue;
+		if (buffer_mapped(bh) && bh->b_blocknr == 0)
+			continue;
+
+		if (checked) {
+			reiserfs_prepare_for_journal(s, bh, 1);
+			journal_mark_dirty(&th, s, bh);
+			continue;
+		}
+		/* from this point on, we know the buffer is mapped to a
+		 * real block and not a direct item
+		 */
+		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+			lock_buffer(bh);
+		} else {
+			if (test_set_buffer_locked(bh)) {
+				redirty_page_for_writepage(wbc, page);
+				continue;
+			}
+		}
+		if (test_clear_buffer_dirty(bh)) {
+			mark_buffer_async_write(bh);
+		} else {
+			unlock_buffer(bh);
+		}
+	} while ((bh = bh->b_this_page) != head);
+
+	if (checked) {
+		error = journal_end(&th, s, bh_per_page + 1);
+		reiserfs_write_unlock(s);
+		if (error)
+			goto fail;
 	}
-    } while((bh = bh->b_this_page) != head);
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+	unlock_page(page);
 
-    if (checked) {
-	error = journal_end(&th, s, bh_per_page + 1);
-	reiserfs_write_unlock(s);
-	if (error)
-	    goto fail;
-    }
-    BUG_ON(PageWriteback(page));
-    set_page_writeback(page);
-    unlock_page(page);
-
-    /*
-     * since any buffer might be the only dirty buffer on the page, 
-     * the first submit_bh can bring the page out of writeback.
-     * be careful with the buffers.
-     */
-    do {
-        struct buffer_head *next = bh->b_this_page;
-	if (buffer_async_write(bh)) {
-	    submit_bh(WRITE, bh);
-	    nr++;
-	}
-	put_bh(bh);
-	bh = next;
-    } while(bh != head);
-
-    error = 0;
-done:
-    if (nr == 0) {
-        /*
-         * if this page only had a direct item, it is very possible for
-         * no io to be required without there being an error.  Or, 
-	 * someone else could have locked them and sent them down the 
-	 * pipe without locking the page
+	/*
+	 * since any buffer might be the only dirty buffer on the page, 
+	 * the first submit_bh can bring the page out of writeback.
+	 * be careful with the buffers.
 	 */
-	bh = head ;
 	do {
-	    if (!buffer_uptodate(bh)) {
-	        partial = 1;
-		break;
-	    }
-	    bh = bh->b_this_page;
-	} while(bh != head);
-	if (!partial)
-	    SetPageUptodate(page);
-	end_page_writeback(page);
-    }
-    return error;
-
-fail:
-    /* catches various errors, we need to make sure any valid dirty blocks
-     * get to the media.  The page is currently locked and not marked for 
-     * writeback
-     */
-    ClearPageUptodate(page);
-    bh = head;
-    do {
-	get_bh(bh);
-	if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
-	    lock_buffer(bh);
-	    mark_buffer_async_write(bh);
-	} else {
-	    /*
-	     * clear any dirty bits that might have come from getting
-	     * attached to a dirty page
-	     */
-	     clear_buffer_dirty(bh);
-	}
-        bh = bh->b_this_page;
-    } while(bh != head);
-    SetPageError(page);
-    BUG_ON(PageWriteback(page));
-    set_page_writeback(page);
-    unlock_page(page);
-    do {
-        struct buffer_head *next = bh->b_this_page;
-	if (buffer_async_write(bh)) {
-	    clear_buffer_dirty(bh);
-	    submit_bh(WRITE, bh);
-	    nr++;
-	}
-	put_bh(bh);
-	bh = next;
-    } while(bh != head);
-    goto done;
-}
+		struct buffer_head *next = bh->b_this_page;
+		if (buffer_async_write(bh)) {
+			submit_bh(WRITE, bh);
+			nr++;
+		}
+		put_bh(bh);
+		bh = next;
+	} while (bh != head);
 
+	error = 0;
+      done:
+	if (nr == 0) {
+		/*
+		 * if this page only had a direct item, it is very possible for
+		 * no io to be required without there being an error.  Or, 
+		 * someone else could have locked them and sent them down the 
+		 * pipe without locking the page
+		 */
+		bh = head;
+		do {
+			if (!buffer_uptodate(bh)) {
+				partial = 1;
+				break;
+			}
+			bh = bh->b_this_page;
+		} while (bh != head);
+		if (!partial)
+			SetPageUptodate(page);
+		end_page_writeback(page);
+	}
+	return error;
 
-static int reiserfs_readpage (struct file *f, struct page * page)
-{
-    return block_read_full_page (page, reiserfs_get_block);
+      fail:
+	/* catches various errors, we need to make sure any valid dirty blocks
+	 * get to the media.  The page is currently locked and not marked for 
+	 * writeback
+	 */
+	ClearPageUptodate(page);
+	bh = head;
+	do {
+		get_bh(bh);
+		if (buffer_mapped(bh) && buffer_dirty(bh) && bh->b_blocknr) {
+			lock_buffer(bh);
+			mark_buffer_async_write(bh);
+		} else {
+			/*
+			 * clear any dirty bits that might have come from getting
+			 * attached to a dirty page
+			 */
+			clear_buffer_dirty(bh);
+		}
+		bh = bh->b_this_page;
+	} while (bh != head);
+	SetPageError(page);
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+	unlock_page(page);
+	do {
+		struct buffer_head *next = bh->b_this_page;
+		if (buffer_async_write(bh)) {
+			clear_buffer_dirty(bh);
+			submit_bh(WRITE, bh);
+			nr++;
+		}
+		put_bh(bh);
+		bh = next;
+	} while (bh != head);
+	goto done;
 }
 
+static int reiserfs_readpage(struct file *f, struct page *page)
+{
+	return block_read_full_page(page, reiserfs_get_block);
+}
 
-static int reiserfs_writepage (struct page * page, struct writeback_control *wbc)
+static int reiserfs_writepage(struct page *page, struct writeback_control *wbc)
 {
-    struct inode *inode = page->mapping->host ;
-    reiserfs_wait_on_write_block(inode->i_sb) ;
-    return reiserfs_write_full_page(page, wbc) ;
+	struct inode *inode = page->mapping->host;
+	reiserfs_wait_on_write_block(inode->i_sb);
+	return reiserfs_write_full_page(page, wbc);
 }
 
 static int reiserfs_prepare_write(struct file *f, struct page *page,
-			   unsigned from, unsigned to) {
-    struct inode *inode = page->mapping->host ;
-    int ret;
-    int old_ref = 0;
-
-    reiserfs_wait_on_write_block(inode->i_sb) ;
-    fix_tail_page_for_writing(page) ;
-    if (reiserfs_transaction_running(inode->i_sb)) {
-	struct reiserfs_transaction_handle *th;
-	th = (struct reiserfs_transaction_handle *)current->journal_info;
-        BUG_ON (!th->t_refcount);
-        BUG_ON (!th->t_trans_id);
-	old_ref = th->t_refcount;
-	th->t_refcount++;
-    }
-
-    ret = block_prepare_write(page, from, to, reiserfs_get_block) ;
-    if (ret && reiserfs_transaction_running(inode->i_sb)) {
-    	struct reiserfs_transaction_handle *th = current->journal_info;
-	/* this gets a little ugly.  If reiserfs_get_block returned an
-	 * error and left a transacstion running, we've got to close it,
-	 * and we've got to free handle if it was a persistent transaction.
-	 *
-	 * But, if we had nested into an existing transaction, we need
-	 * to just drop the ref count on the handle.
-	 *
-	 * If old_ref == 0, the transaction is from reiserfs_get_block,
-	 * and it was a persistent trans.  Otherwise, it was nested above.
-	 */
-	if (th->t_refcount > old_ref) {
-	    if (old_ref)
-	    	th->t_refcount--;
-	    else {
-                int err;
-		reiserfs_write_lock(inode->i_sb);
-		err = reiserfs_end_persistent_transaction(th);
-		reiserfs_write_unlock(inode->i_sb);
-                if (err)
-                    ret = err;
-	    }
+				  unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	int ret;
+	int old_ref = 0;
+
+	reiserfs_wait_on_write_block(inode->i_sb);
+	fix_tail_page_for_writing(page);
+	if (reiserfs_transaction_running(inode->i_sb)) {
+		struct reiserfs_transaction_handle *th;
+		th = (struct reiserfs_transaction_handle *)current->
+		    journal_info;
+		BUG_ON(!th->t_refcount);
+		BUG_ON(!th->t_trans_id);
+		old_ref = th->t_refcount;
+		th->t_refcount++;
 	}
-    }
-    return ret;
 
-}
+	ret = block_prepare_write(page, from, to, reiserfs_get_block);
+	if (ret && reiserfs_transaction_running(inode->i_sb)) {
+		struct reiserfs_transaction_handle *th = current->journal_info;
+		/* this gets a little ugly.  If reiserfs_get_block returned an
+		 * error and left a transaction running, we've got to close it,
+		 * and we've got to free handle if it was a persistent transaction.
+		 *
+		 * But, if we had nested into an existing transaction, we need
+		 * to just drop the ref count on the handle.
+		 *
+		 * If old_ref == 0, the transaction is from reiserfs_get_block,
+		 * and it was a persistent trans.  Otherwise, it was nested above.
+		 */
+		if (th->t_refcount > old_ref) {
+			if (old_ref)
+				th->t_refcount--;
+			else {
+				int err;
+				reiserfs_write_lock(inode->i_sb);
+				err = reiserfs_end_persistent_transaction(th);
+				reiserfs_write_unlock(inode->i_sb);
+				if (err)
+					ret = err;
+			}
+		}
+	}
+	return ret;
 
+}
 
-static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block) {
-  return generic_block_bmap(as, block, reiserfs_bmap) ;
+static sector_t reiserfs_aop_bmap(struct address_space *as, sector_t block)
+{
+	return generic_block_bmap(as, block, reiserfs_bmap);
 }
 
-static int reiserfs_commit_write(struct file *f, struct page *page, 
-                                 unsigned from, unsigned to) {
-    struct inode *inode = page->mapping->host ;
-    loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-    int ret = 0;
-    int update_sd = 0;
-    struct reiserfs_transaction_handle *th = NULL;
-    
-    reiserfs_wait_on_write_block(inode->i_sb) ;
-    if (reiserfs_transaction_running(inode->i_sb)) {
-        th = current->journal_info;
-    }
-    reiserfs_commit_page(inode, page, from, to);
- 
-    /* generic_commit_write does this for us, but does not update the
-    ** transaction tracking stuff when the size changes.  So, we have
-    ** to do the i_size updates here.
-    */
-    if (pos > inode->i_size) {
-	struct reiserfs_transaction_handle myth ;
-	reiserfs_write_lock(inode->i_sb);
-	/* If the file have grown beyond the border where it
-	   can have a tail, unmark it as needing a tail
-	   packing */
-	if ( (have_large_tails (inode->i_sb) && inode->i_size > i_block_size (inode)*4) ||
-	     (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) )
-	    REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
-
-	ret = journal_begin(&myth, inode->i_sb, 1) ;
-	if (ret) {
-	    reiserfs_write_unlock(inode->i_sb);
-	    goto journal_error;
-	}
-	reiserfs_update_inode_transaction(inode) ;
-	inode->i_size = pos ;
-	reiserfs_update_sd(&myth, inode) ;
-	update_sd = 1;
-	ret = journal_end(&myth, inode->i_sb, 1) ;
-	reiserfs_write_unlock(inode->i_sb);
-	if (ret)
-	    goto journal_error;
-    }
-    if (th) {
-	reiserfs_write_lock(inode->i_sb);
-	if (!update_sd)
-	    reiserfs_update_sd(th, inode) ;
-	ret = reiserfs_end_persistent_transaction(th);
-	reiserfs_write_unlock(inode->i_sb);
-	if (ret)
-	    goto out;
-    }
- 
-    /* we test for O_SYNC here so we can commit the transaction
-    ** for any packed tails the file might have had
-    */
-    if (f && (f->f_flags & O_SYNC)) {
-	reiserfs_write_lock(inode->i_sb);
- 	ret = reiserfs_commit_for_inode(inode) ;
-	reiserfs_write_unlock(inode->i_sb);
-    }
-out:
-    return ret ;
+static int reiserfs_commit_write(struct file *f, struct page *page,
+				 unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t pos = ((loff_t) page->index << PAGE_CACHE_SHIFT) + to;
+	int ret = 0;
+	int update_sd = 0;
+	struct reiserfs_transaction_handle *th = NULL;
+
+	reiserfs_wait_on_write_block(inode->i_sb);
+	if (reiserfs_transaction_running(inode->i_sb)) {
+		th = current->journal_info;
+	}
+	reiserfs_commit_page(inode, page, from, to);
 
-journal_error:
-    if (th) {
-	reiserfs_write_lock(inode->i_sb);
-	if (!update_sd)
-	    reiserfs_update_sd(th, inode) ;
-        ret = reiserfs_end_persistent_transaction(th);
-	reiserfs_write_unlock(inode->i_sb);
-    }
+	/* generic_commit_write does this for us, but does not update the
+	 ** transaction tracking stuff when the size changes.  So, we have
+	 ** to do the i_size updates here.
+	 */
+	if (pos > inode->i_size) {
+		struct reiserfs_transaction_handle myth;
+		reiserfs_write_lock(inode->i_sb);
+		/* If the file have grown beyond the border where it
+		   can have a tail, unmark it as needing a tail
+		   packing */
+		if ((have_large_tails(inode->i_sb)
+		     && inode->i_size > i_block_size(inode) * 4)
+		    || (have_small_tails(inode->i_sb)
+			&& inode->i_size > i_block_size(inode)))
+			REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
+
+		ret = journal_begin(&myth, inode->i_sb, 1);
+		if (ret) {
+			reiserfs_write_unlock(inode->i_sb);
+			goto journal_error;
+		}
+		reiserfs_update_inode_transaction(inode);
+		inode->i_size = pos;
+		reiserfs_update_sd(&myth, inode);
+		update_sd = 1;
+		ret = journal_end(&myth, inode->i_sb, 1);
+		reiserfs_write_unlock(inode->i_sb);
+		if (ret)
+			goto journal_error;
+	}
+	if (th) {
+		reiserfs_write_lock(inode->i_sb);
+		if (!update_sd)
+			reiserfs_update_sd(th, inode);
+		ret = reiserfs_end_persistent_transaction(th);
+		reiserfs_write_unlock(inode->i_sb);
+		if (ret)
+			goto out;
+	}
+
+	/* we test for O_SYNC here so we can commit the transaction
+	 ** for any packed tails the file might have had
+	 */
+	if (f && (f->f_flags & O_SYNC)) {
+		reiserfs_write_lock(inode->i_sb);
+		ret = reiserfs_commit_for_inode(inode);
+		reiserfs_write_unlock(inode->i_sb);
+	}
+      out:
+	return ret;
 
-    return ret;
+      journal_error:
+	if (th) {
+		reiserfs_write_lock(inode->i_sb);
+		if (!update_sd)
+			reiserfs_update_sd(th, inode);
+		ret = reiserfs_end_persistent_transaction(th);
+		reiserfs_write_unlock(inode->i_sb);
+	}
+
+	return ret;
 }
 
-void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode )
+void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode)
 {
-	if( reiserfs_attrs( inode -> i_sb ) ) {
-		if( sd_attrs & REISERFS_SYNC_FL )
-			inode -> i_flags |= S_SYNC;
+	if (reiserfs_attrs(inode->i_sb)) {
+		if (sd_attrs & REISERFS_SYNC_FL)
+			inode->i_flags |= S_SYNC;
 		else
-			inode -> i_flags &= ~S_SYNC;
-		if( sd_attrs & REISERFS_IMMUTABLE_FL )
-			inode -> i_flags |= S_IMMUTABLE;
+			inode->i_flags &= ~S_SYNC;
+		if (sd_attrs & REISERFS_IMMUTABLE_FL)
+			inode->i_flags |= S_IMMUTABLE;
 		else
-			inode -> i_flags &= ~S_IMMUTABLE;
-		if( sd_attrs & REISERFS_APPEND_FL )
-			inode -> i_flags |= S_APPEND;
+			inode->i_flags &= ~S_IMMUTABLE;
+		if (sd_attrs & REISERFS_APPEND_FL)
+			inode->i_flags |= S_APPEND;
 		else
-			inode -> i_flags &= ~S_APPEND;
-		if( sd_attrs & REISERFS_NOATIME_FL )
-			inode -> i_flags |= S_NOATIME;
+			inode->i_flags &= ~S_APPEND;
+		if (sd_attrs & REISERFS_NOATIME_FL)
+			inode->i_flags |= S_NOATIME;
 		else
-			inode -> i_flags &= ~S_NOATIME;
-		if( sd_attrs & REISERFS_NOTAIL_FL )
+			inode->i_flags &= ~S_NOATIME;
+		if (sd_attrs & REISERFS_NOTAIL_FL)
 			REISERFS_I(inode)->i_flags |= i_nopack_mask;
 		else
 			REISERFS_I(inode)->i_flags &= ~i_nopack_mask;
 	}
 }
 
-void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs )
+void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs)
 {
-	if( reiserfs_attrs( inode -> i_sb ) ) {
-		if( inode -> i_flags & S_IMMUTABLE )
+	if (reiserfs_attrs(inode->i_sb)) {
+		if (inode->i_flags & S_IMMUTABLE)
 			*sd_attrs |= REISERFS_IMMUTABLE_FL;
 		else
 			*sd_attrs &= ~REISERFS_IMMUTABLE_FL;
-		if( inode -> i_flags & S_SYNC )
+		if (inode->i_flags & S_SYNC)
 			*sd_attrs |= REISERFS_SYNC_FL;
 		else
 			*sd_attrs &= ~REISERFS_SYNC_FL;
-		if( inode -> i_flags & S_NOATIME )
+		if (inode->i_flags & S_NOATIME)
 			*sd_attrs |= REISERFS_NOATIME_FL;
 		else
 			*sd_attrs &= ~REISERFS_NOATIME_FL;
-		if( REISERFS_I(inode)->i_flags & i_nopack_mask )
+		if (REISERFS_I(inode)->i_flags & i_nopack_mask)
 			*sd_attrs |= REISERFS_NOTAIL_FL;
 		else
 			*sd_attrs &= ~REISERFS_NOTAIL_FL;
@@ -2603,106 +2720,107 @@ void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs )
 */
 static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
 {
-    int ret = 1 ;
-    struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
-
-    spin_lock(&j->j_dirty_buffers_lock) ;
-    if (!buffer_mapped(bh)) {
-        goto free_jh;
-    }
-    /* the page is locked, and the only places that log a data buffer
-     * also lock the page.
-     */
-    if (reiserfs_file_data_log(inode)) {
-	/*
-	 * very conservative, leave the buffer pinned if
-	 * anyone might need it.
-	 */
-        if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
-	    ret = 0 ;
-	}
-    } else
-    if (buffer_dirty(bh) || buffer_locked(bh)) {
-	struct reiserfs_journal_list *jl;
-	struct reiserfs_jh *jh = bh->b_private;
-
-	/* why is this safe?
-	 * reiserfs_setattr updates i_size in the on disk
-	 * stat data before allowing vmtruncate to be called.
-	 *
-	 * If buffer was put onto the ordered list for this
-	 * transaction, we know for sure either this transaction
-	 * or an older one already has updated i_size on disk,
-	 * and this ordered data won't be referenced in the file
-	 * if we crash.
-	 *
-	 * if the buffer was put onto the ordered list for an older
-	 * transaction, we need to leave it around
+	int ret = 1;
+	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
+
+	spin_lock(&j->j_dirty_buffers_lock);
+	if (!buffer_mapped(bh)) {
+		goto free_jh;
+	}
+	/* the page is locked, and the only places that log a data buffer
+	 * also lock the page.
 	 */
-	if (jh && (jl = jh->jl) && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
-	    ret = 0;
-    }
-free_jh:
-    if (ret && bh->b_private) {
-        reiserfs_free_jh(bh);
-    }
-    spin_unlock(&j->j_dirty_buffers_lock) ;
-    return ret ;
+	if (reiserfs_file_data_log(inode)) {
+		/*
+		 * very conservative, leave the buffer pinned if
+		 * anyone might need it.
+		 */
+		if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
+			ret = 0;
+		}
+	} else if (buffer_dirty(bh) || buffer_locked(bh)) {
+		struct reiserfs_journal_list *jl;
+		struct reiserfs_jh *jh = bh->b_private;
+
+		/* why is this safe?
+		 * reiserfs_setattr updates i_size in the on disk
+		 * stat data before allowing vmtruncate to be called.
+		 *
+		 * If buffer was put onto the ordered list for this
+		 * transaction, we know for sure either this transaction
+		 * or an older one already has updated i_size on disk,
+		 * and this ordered data won't be referenced in the file
+		 * if we crash.
+		 *
+		 * if the buffer was put onto the ordered list for an older
+		 * transaction, we need to leave it around
+		 */
+		if (jh && (jl = jh->jl)
+		    && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
+			ret = 0;
+	}
+      free_jh:
+	if (ret && bh->b_private) {
+		reiserfs_free_jh(bh);
+	}
+	spin_unlock(&j->j_dirty_buffers_lock);
+	return ret;
 }
 
 /* clm -- taken from fs/buffer.c:block_invalidate_page */
 static int reiserfs_invalidatepage(struct page *page, unsigned long offset)
 {
-    struct buffer_head *head, *bh, *next;
-    struct inode *inode = page->mapping->host;
-    unsigned int curr_off = 0;
-    int ret = 1;
+	struct buffer_head *head, *bh, *next;
+	struct inode *inode = page->mapping->host;
+	unsigned int curr_off = 0;
+	int ret = 1;
 
-    BUG_ON(!PageLocked(page));
+	BUG_ON(!PageLocked(page));
 
-    if (offset == 0)
-	ClearPageChecked(page);
+	if (offset == 0)
+		ClearPageChecked(page);
 
-    if (!page_has_buffers(page))
-	goto out;
+	if (!page_has_buffers(page))
+		goto out;
+
+	head = page_buffers(page);
+	bh = head;
+	do {
+		unsigned int next_off = curr_off + bh->b_size;
+		next = bh->b_this_page;
 
-    head = page_buffers(page);
-    bh = head;
-    do {
-	unsigned int next_off = curr_off + bh->b_size;
-	next = bh->b_this_page;
+		/*
+		 * is this block fully invalidated?
+		 */
+		if (offset <= curr_off) {
+			if (invalidatepage_can_drop(inode, bh))
+				reiserfs_unmap_buffer(bh);
+			else
+				ret = 0;
+		}
+		curr_off = next_off;
+		bh = next;
+	} while (bh != head);
 
 	/*
-	 * is this block fully invalidated?
+	 * We release buffers only if the entire page is being invalidated.
+	 * The get_block cached value has been unconditionally invalidated,
+	 * so real IO is not possible anymore.
 	 */
-	if (offset <= curr_off) {
-	    if (invalidatepage_can_drop(inode, bh))
-		reiserfs_unmap_buffer(bh);
-	    else
-	        ret = 0;
-	}
-	curr_off = next_off;
-	bh = next;
-    } while (bh != head);
-
-    /*
-     * We release buffers only if the entire page is being invalidated.
-     * The get_block cached value has been unconditionally invalidated,
-     * so real IO is not possible anymore.
-     */
-    if (!offset && ret)
-	ret = try_to_release_page(page, 0);
-out:
-    return ret;
+	if (!offset && ret)
+		ret = try_to_release_page(page, 0);
+      out:
+	return ret;
 }
 
-static int reiserfs_set_page_dirty(struct page *page) {
-    struct inode *inode = page->mapping->host;
-    if (reiserfs_file_data_log(inode)) {
-	SetPageChecked(page);
-	return __set_page_dirty_nobuffers(page);
-    }
-    return __set_page_dirty_buffers(page);
+static int reiserfs_set_page_dirty(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	if (reiserfs_file_data_log(inode)) {
+		SetPageChecked(page);
+		return __set_page_dirty_nobuffers(page);
+	}
+	return __set_page_dirty_buffers(page);
 }
 
 /*
@@ -2716,143 +2834,152 @@ static int reiserfs_set_page_dirty(struct page *page) {
  */
 static int reiserfs_releasepage(struct page *page, int unused_gfp_flags)
 {
-    struct inode *inode = page->mapping->host ;
-    struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
-    struct buffer_head *head ;
-    struct buffer_head *bh ;
-    int ret = 1 ;
-
-    WARN_ON(PageChecked(page));
-    spin_lock(&j->j_dirty_buffers_lock) ;
-    head = page_buffers(page) ;
-    bh = head ;
-    do {
-	if (bh->b_private) {
-	    if (!buffer_dirty(bh) && !buffer_locked(bh)) {
-		reiserfs_free_jh(bh);
-	    } else {
-		ret = 0 ;
-		break ;
-	    }
-	}
-	bh = bh->b_this_page ;
-    } while (bh != head) ;
-    if (ret)
-	ret = try_to_free_buffers(page) ;
-    spin_unlock(&j->j_dirty_buffers_lock) ;
-    return ret ;
+	struct inode *inode = page->mapping->host;
+	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
+	struct buffer_head *head;
+	struct buffer_head *bh;
+	int ret = 1;
+
+	WARN_ON(PageChecked(page));
+	spin_lock(&j->j_dirty_buffers_lock);
+	head = page_buffers(page);
+	bh = head;
+	do {
+		if (bh->b_private) {
+			if (!buffer_dirty(bh) && !buffer_locked(bh)) {
+				reiserfs_free_jh(bh);
+			} else {
+				ret = 0;
+				break;
+			}
+		}
+		bh = bh->b_this_page;
+	} while (bh != head);
+	if (ret)
+		ret = try_to_free_buffers(page);
+	spin_unlock(&j->j_dirty_buffers_lock);
+	return ret;
 }
 
 /* We thank Mingming Cao for helping us understand in great detail what
    to do in this section of the code. */
 static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb,
-		const struct iovec *iov, loff_t offset, unsigned long nr_segs)
+				  const struct iovec *iov, loff_t offset,
+				  unsigned long nr_segs)
 {
-    struct file *file = iocb->ki_filp;
-    struct inode *inode = file->f_mapping->host;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
 
-    return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
-			offset, nr_segs, reiserfs_get_blocks_direct_io, NULL);
+	return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+				  offset, nr_segs,
+				  reiserfs_get_blocks_direct_io, NULL);
 }
 
-int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) {
-    struct inode *inode = dentry->d_inode ;
-    int error ;
-    unsigned int ia_valid = attr->ia_valid;
-    reiserfs_write_lock(inode->i_sb);
-    if (attr->ia_valid & ATTR_SIZE) {
-	/* version 2 items will be caught by the s_maxbytes check
-	** done for us in vmtruncate
-	*/
-	if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
-	    attr->ia_size > MAX_NON_LFS) {
-	    error = -EFBIG ;
-	    goto out;
-	}
-	/* fill in hole pointers in the expanding truncate case. */
-        if (attr->ia_size > inode->i_size) {
-	    error = generic_cont_expand(inode, attr->ia_size) ;
-	    if (REISERFS_I(inode)->i_prealloc_count > 0) {
-		int err;
-		struct reiserfs_transaction_handle th ;
-		/* we're changing at most 2 bitmaps, inode + super */
-		err = journal_begin(&th, inode->i_sb, 4) ;
-		if (!err) {
-		    reiserfs_discard_prealloc (&th, inode);
-		    err = journal_end(&th, inode->i_sb, 4) ;
+int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+	unsigned int ia_valid = attr->ia_valid;
+	reiserfs_write_lock(inode->i_sb);
+	if (attr->ia_valid & ATTR_SIZE) {
+		/* version 2 items will be caught by the s_maxbytes check
+		 ** done for us in vmtruncate
+		 */
+		if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 &&
+		    attr->ia_size > MAX_NON_LFS) {
+			error = -EFBIG;
+			goto out;
+		}
+		/* fill in hole pointers in the expanding truncate case. */
+		if (attr->ia_size > inode->i_size) {
+			error = generic_cont_expand(inode, attr->ia_size);
+			if (REISERFS_I(inode)->i_prealloc_count > 0) {
+				int err;
+				struct reiserfs_transaction_handle th;
+				/* we're changing at most 2 bitmaps, inode + super */
+				err = journal_begin(&th, inode->i_sb, 4);
+				if (!err) {
+					reiserfs_discard_prealloc(&th, inode);
+					err = journal_end(&th, inode->i_sb, 4);
+				}
+				if (err)
+					error = err;
+			}
+			if (error)
+				goto out;
 		}
-		if (err)
-		    error = err;
-	    }
-	    if (error)
-	        goto out;
 	}
-    }
 
-    if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) ||
-	 ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) &&
-	(get_inode_sd_version (inode) == STAT_DATA_V1)) {
+	if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) ||
+	     ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) &&
+	    (get_inode_sd_version(inode) == STAT_DATA_V1)) {
 		/* stat data of format v3.5 has 16 bit uid and gid */
-	    error = -EINVAL;
-	    goto out;
-	}
-
-    error = inode_change_ok(inode, attr) ;
-    if (!error) {
-	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
-	    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
-                error = reiserfs_chown_xattrs (inode, attr);
-
-                if (!error) {
-		    struct reiserfs_transaction_handle th;
-		    int jbegin_count = 2*(REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb)+REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb))+2;
-
-		    /* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
-		    error = journal_begin(&th, inode->i_sb, jbegin_count);
- 		    if (error)
- 			goto out;
-                    error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
-		    if (error) {
-			journal_end(&th, inode->i_sb, jbegin_count);
-			goto out;
-		    }
-		    /* Update corresponding info in inode so that everything is in
-		     * one transaction */
-		    if (attr->ia_valid & ATTR_UID)
-			inode->i_uid = attr->ia_uid;
-		    if (attr->ia_valid & ATTR_GID)
-			inode->i_gid = attr->ia_gid;
-		    mark_inode_dirty(inode);
-		    error = journal_end(&th, inode->i_sb, jbegin_count);
-		}
-        }
-        if (!error)
-            error = inode_setattr(inode, attr) ;
-    }
+		error = -EINVAL;
+		goto out;
+	}
 
+	error = inode_change_ok(inode, attr);
+	if (!error) {
+		if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
+		    (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+			error = reiserfs_chown_xattrs(inode, attr);
+
+			if (!error) {
+				struct reiserfs_transaction_handle th;
+				int jbegin_count =
+				    2 *
+				    (REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb) +
+				     REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)) +
+				    2;
+
+				/* (user+group)*(old+new) structure - we count quota info and , inode write (sb, inode) */
+				error =
+				    journal_begin(&th, inode->i_sb,
+						  jbegin_count);
+				if (error)
+					goto out;
+				error =
+				    DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
+				if (error) {
+					journal_end(&th, inode->i_sb,
+						    jbegin_count);
+					goto out;
+				}
+				/* Update corresponding info in inode so that everything is in
+				 * one transaction */
+				if (attr->ia_valid & ATTR_UID)
+					inode->i_uid = attr->ia_uid;
+				if (attr->ia_valid & ATTR_GID)
+					inode->i_gid = attr->ia_gid;
+				mark_inode_dirty(inode);
+				error =
+				    journal_end(&th, inode->i_sb, jbegin_count);
+			}
+		}
+		if (!error)
+			error = inode_setattr(inode, attr);
+	}
 
-    if (!error && reiserfs_posixacl (inode->i_sb)) {
-        if (attr->ia_valid & ATTR_MODE)
-            error = reiserfs_acl_chmod (inode);
-    }
+	if (!error && reiserfs_posixacl(inode->i_sb)) {
+		if (attr->ia_valid & ATTR_MODE)
+			error = reiserfs_acl_chmod(inode);
+	}
 
-out:
-    reiserfs_write_unlock(inode->i_sb);
-    return error ;
+      out:
+	reiserfs_write_unlock(inode->i_sb);
+	return error;
 }
 
-
-
 struct address_space_operations reiserfs_address_space_operations = {
-    .writepage = reiserfs_writepage,
-    .readpage = reiserfs_readpage, 
-    .readpages = reiserfs_readpages, 
-    .releasepage = reiserfs_releasepage,
-    .invalidatepage = reiserfs_invalidatepage,
-    .sync_page = block_sync_page,
-    .prepare_write = reiserfs_prepare_write,
-    .commit_write = reiserfs_commit_write,
-    .bmap = reiserfs_aop_bmap,
-    .direct_IO = reiserfs_direct_IO,
-    .set_page_dirty = reiserfs_set_page_dirty,
-} ;
+	.writepage = reiserfs_writepage,
+	.readpage = reiserfs_readpage,
+	.readpages = reiserfs_readpages,
+	.releasepage = reiserfs_releasepage,
+	.invalidatepage = reiserfs_invalidatepage,
+	.sync_page = block_sync_page,
+	.prepare_write = reiserfs_prepare_write,
+	.commit_write = reiserfs_commit_write,
+	.bmap = reiserfs_aop_bmap,
+	.direct_IO = reiserfs_direct_IO,
+	.set_page_dirty = reiserfs_set_page_dirty,
+};
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 76caedf737f2..81fc00285f60 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -9,7 +9,7 @@
 #include <linux/pagemap.h>
 #include <linux/smp_lock.h>
 
-static int reiserfs_unpack (struct inode * inode, struct file * filp);
+static int reiserfs_unpack(struct inode *inode, struct file *filp);
 
 /*
 ** reiserfs_ioctl - handler for ioctl for inode
@@ -19,69 +19,72 @@ static int reiserfs_unpack (struct inode * inode, struct file * filp);
 **  2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
 **  3) That's all for a while ...
 */
-int reiserfs_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
-		unsigned long arg)
+int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
+		   unsigned long arg)
 {
 	unsigned int flags;
 
 	switch (cmd) {
-	    case REISERFS_IOC_UNPACK:
-		if( S_ISREG( inode -> i_mode ) ) {
-		if (arg)
-		    return reiserfs_unpack (inode, filp);
+	case REISERFS_IOC_UNPACK:
+		if (S_ISREG(inode->i_mode)) {
+			if (arg)
+				return reiserfs_unpack(inode, filp);
 			else
 				return 0;
 		} else
 			return -ENOTTY;
-	/* following two cases are taken from fs/ext2/ioctl.c by Remy
-	   Card (card@masi.ibp.fr) */
+		/* following two cases are taken from fs/ext2/ioctl.c by Remy
+		   Card (card@masi.ibp.fr) */
 	case REISERFS_IOC_GETFLAGS:
-		if (!reiserfs_attrs (inode->i_sb))
+		if (!reiserfs_attrs(inode->i_sb))
 			return -ENOTTY;
 
-		flags = REISERFS_I(inode) -> i_attrs;
-		i_attrs_to_sd_attrs( inode, ( __u16 * ) &flags );
-		return put_user(flags, (int __user *) arg);
-	case REISERFS_IOC_SETFLAGS: {
-		if (!reiserfs_attrs (inode->i_sb))
-			return -ENOTTY;
+		flags = REISERFS_I(inode)->i_attrs;
+		i_attrs_to_sd_attrs(inode, (__u16 *) & flags);
+		return put_user(flags, (int __user *)arg);
+	case REISERFS_IOC_SETFLAGS:{
+			if (!reiserfs_attrs(inode->i_sb))
+				return -ENOTTY;
 
-		if (IS_RDONLY(inode))
-			return -EROFS;
+			if (IS_RDONLY(inode))
+				return -EROFS;
 
-		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
-			return -EPERM;
+			if ((current->fsuid != inode->i_uid)
+			    && !capable(CAP_FOWNER))
+				return -EPERM;
 
-		if (get_user(flags, (int __user *) arg))
-			return -EFAULT;
+			if (get_user(flags, (int __user *)arg))
+				return -EFAULT;
 
-		if ( ( ( flags ^ REISERFS_I(inode) -> i_attrs) & ( REISERFS_IMMUTABLE_FL | REISERFS_APPEND_FL)) &&
-		     !capable( CAP_LINUX_IMMUTABLE ) )
-			return -EPERM;
-			
-		if( ( flags & REISERFS_NOTAIL_FL ) &&
-		    S_ISREG( inode -> i_mode ) ) {
+			if (((flags ^ REISERFS_I(inode)->
+			      i_attrs) & (REISERFS_IMMUTABLE_FL |
+					  REISERFS_APPEND_FL))
+			    && !capable(CAP_LINUX_IMMUTABLE))
+				return -EPERM;
+
+			if ((flags & REISERFS_NOTAIL_FL) &&
+			    S_ISREG(inode->i_mode)) {
 				int result;
 
-				result = reiserfs_unpack( inode, filp );
-				if( result )
+				result = reiserfs_unpack(inode, filp);
+				if (result)
 					return result;
+			}
+			sd_attrs_to_i_attrs(flags, inode);
+			REISERFS_I(inode)->i_attrs = flags;
+			inode->i_ctime = CURRENT_TIME_SEC;
+			mark_inode_dirty(inode);
+			return 0;
 		}
-		sd_attrs_to_i_attrs( flags, inode );
-		REISERFS_I(inode) -> i_attrs = flags;
-		inode->i_ctime = CURRENT_TIME_SEC;
-		mark_inode_dirty(inode);
-		return 0;
-	}
 	case REISERFS_IOC_GETVERSION:
-		return put_user(inode->i_generation, (int __user *) arg);
+		return put_user(inode->i_generation, (int __user *)arg);
 	case REISERFS_IOC_SETVERSION:
 		if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
 			return -EPERM;
 		if (IS_RDONLY(inode))
 			return -EROFS;
-		if (get_user(inode->i_generation, (int __user *) arg))
-			return -EFAULT;	
+		if (get_user(inode->i_generation, (int __user *)arg))
+			return -EFAULT;
 		inode->i_ctime = CURRENT_TIME_SEC;
 		mark_inode_dirty(inode);
 		return 0;
@@ -95,63 +98,65 @@ int reiserfs_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
 ** Function try to convert tail from direct item into indirect.
 ** It set up nopack attribute in the REISERFS_I(inode)->nopack
 */
-static int reiserfs_unpack (struct inode * inode, struct file * filp)
+static int reiserfs_unpack(struct inode *inode, struct file *filp)
 {
-    int retval = 0;
-    int index ;
-    struct page *page ;
-    struct address_space *mapping ;
-    unsigned long write_from ;
-    unsigned long blocksize = inode->i_sb->s_blocksize ;
-    	
-    if (inode->i_size == 0) {
-        REISERFS_I(inode)->i_flags |= i_nopack_mask;
-        return 0 ;
-    }
-    /* ioctl already done */
-    if (REISERFS_I(inode)->i_flags & i_nopack_mask) {
-        return 0 ;
-    }
-    reiserfs_write_lock(inode->i_sb);
-
-    /* we need to make sure nobody is changing the file size beneath
-    ** us
-    */
-    down(&inode->i_sem) ;
-
-    write_from = inode->i_size & (blocksize - 1) ;
-    /* if we are on a block boundary, we are already unpacked.  */
-    if ( write_from == 0) {
+	int retval = 0;
+	int index;
+	struct page *page;
+	struct address_space *mapping;
+	unsigned long write_from;
+	unsigned long blocksize = inode->i_sb->s_blocksize;
+
+	if (inode->i_size == 0) {
+		REISERFS_I(inode)->i_flags |= i_nopack_mask;
+		return 0;
+	}
+	/* ioctl already done */
+	if (REISERFS_I(inode)->i_flags & i_nopack_mask) {
+		return 0;
+	}
+	reiserfs_write_lock(inode->i_sb);
+
+	/* we need to make sure nobody is changing the file size beneath
+	 ** us
+	 */
+	down(&inode->i_sem);
+
+	write_from = inode->i_size & (blocksize - 1);
+	/* if we are on a block boundary, we are already unpacked.  */
+	if (write_from == 0) {
+		REISERFS_I(inode)->i_flags |= i_nopack_mask;
+		goto out;
+	}
+
+	/* we unpack by finding the page with the tail, and calling
+	 ** reiserfs_prepare_write on that page.  This will force a 
+	 ** reiserfs_get_block to unpack the tail for us.
+	 */
+	index = inode->i_size >> PAGE_CACHE_SHIFT;
+	mapping = inode->i_mapping;
+	page = grab_cache_page(mapping, index);
+	retval = -ENOMEM;
+	if (!page) {
+		goto out;
+	}
+	retval =
+	    mapping->a_ops->prepare_write(NULL, page, write_from, write_from);
+	if (retval)
+		goto out_unlock;
+
+	/* conversion can change page contents, must flush */
+	flush_dcache_page(page);
+	retval =
+	    mapping->a_ops->commit_write(NULL, page, write_from, write_from);
 	REISERFS_I(inode)->i_flags |= i_nopack_mask;
-	goto out ;
-    }
-
-    /* we unpack by finding the page with the tail, and calling
-    ** reiserfs_prepare_write on that page.  This will force a 
-    ** reiserfs_get_block to unpack the tail for us.
-    */
-    index = inode->i_size >> PAGE_CACHE_SHIFT ;
-    mapping = inode->i_mapping ;
-    page = grab_cache_page(mapping, index) ;
-    retval = -ENOMEM;
-    if (!page) {
-        goto out ;
-    }
-    retval = mapping->a_ops->prepare_write(NULL, page, write_from, write_from) ;
-    if (retval)
-        goto out_unlock ;
-
-    /* conversion can change page contents, must flush */
-    flush_dcache_page(page) ;
-    retval = mapping->a_ops->commit_write(NULL, page, write_from, write_from) ;
-    REISERFS_I(inode)->i_flags |= i_nopack_mask;
-
-out_unlock:
-    unlock_page(page) ;
-    page_cache_release(page) ;
-
-out:
-    up(&inode->i_sem) ;
-    reiserfs_write_unlock(inode->i_sb);
-    return retval;
+
+      out_unlock:
+	unlock_page(page);
+	page_cache_release(page);
+
+      out:
+	up(&inode->i_sem);
+	reiserfs_write_unlock(inode->i_sb);
+	return retval;
 }
diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c
index e477aeba8c92..e237cd668e5b 100644
--- a/fs/reiserfs/item_ops.c
+++ b/fs/reiserfs/item_ops.c
@@ -14,760 +14,729 @@
 //////////////////////////////////////////////////////////////////////////////
 // stat data functions
 //
-static int sd_bytes_number (struct item_head * ih, int block_size)
+static int sd_bytes_number(struct item_head *ih, int block_size)
 {
-  return 0;
+	return 0;
 }
 
-static void sd_decrement_key (struct cpu_key * key)
+static void sd_decrement_key(struct cpu_key *key)
 {
-    key->on_disk_key.k_objectid --;
-    set_cpu_key_k_type (key, TYPE_ANY);
-    set_cpu_key_k_offset(key, (loff_t)(-1));
+	key->on_disk_key.k_objectid--;
+	set_cpu_key_k_type(key, TYPE_ANY);
+	set_cpu_key_k_offset(key, (loff_t) (-1));
 }
 
-static int sd_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
+static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize)
 {
-    return 0;
+	return 0;
 }
 
-
-
-static char * print_time (time_t t)
+static char *print_time(time_t t)
 {
-    static char timebuf[256];
+	static char timebuf[256];
 
-    sprintf (timebuf, "%ld", t);
-    return timebuf;
+	sprintf(timebuf, "%ld", t);
+	return timebuf;
 }
 
-
-static void sd_print_item (struct item_head * ih, char * item)
+static void sd_print_item(struct item_head *ih, char *item)
 {
-    printk ("\tmode | size | nlinks | first direct | mtime\n");
-    if (stat_data_v1 (ih)) {
-      	struct stat_data_v1 * sd = (struct stat_data_v1 *)item;
+	printk("\tmode | size | nlinks | first direct | mtime\n");
+	if (stat_data_v1(ih)) {
+		struct stat_data_v1 *sd = (struct stat_data_v1 *)item;
 
-	printk ("\t0%-6o | %6u | %2u | %d | %s\n", sd_v1_mode(sd),
-                sd_v1_size(sd), sd_v1_nlink(sd), sd_v1_first_direct_byte(sd),
-                print_time( sd_v1_mtime(sd) ) );
-    } else {
-	struct stat_data * sd = (struct stat_data *)item;
+		printk("\t0%-6o | %6u | %2u | %d | %s\n", sd_v1_mode(sd),
+		       sd_v1_size(sd), sd_v1_nlink(sd),
+		       sd_v1_first_direct_byte(sd),
+		       print_time(sd_v1_mtime(sd)));
+	} else {
+		struct stat_data *sd = (struct stat_data *)item;
 
-	printk ("\t0%-6o | %6Lu | %2u | %d | %s\n", sd_v2_mode(sd),
-            (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd),
-            sd_v2_rdev(sd), print_time(sd_v2_mtime(sd)));
-    }
+		printk("\t0%-6o | %6Lu | %2u | %d | %s\n", sd_v2_mode(sd),
+		       (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd),
+		       sd_v2_rdev(sd), print_time(sd_v2_mtime(sd)));
+	}
 }
 
-static void sd_check_item (struct item_head * ih, char * item)
+static void sd_check_item(struct item_head *ih, char *item)
 {
-    // FIXME: type something here!
+	// FIXME: type something here!
 }
 
-
-static int sd_create_vi (struct virtual_node * vn,
-			 struct virtual_item * vi, 
-			 int is_affected, 
-			 int insert_size)
+static int sd_create_vi(struct virtual_node *vn,
+			struct virtual_item *vi,
+			int is_affected, int insert_size)
 {
-    vi->vi_index = TYPE_STAT_DATA;
-    //vi->vi_type |= VI_TYPE_STAT_DATA;// not needed?
-    return 0;
+	vi->vi_index = TYPE_STAT_DATA;
+	//vi->vi_type |= VI_TYPE_STAT_DATA;// not needed?
+	return 0;
 }
 
-
-static int sd_check_left (struct virtual_item * vi, int free, 
-			  int start_skip, int end_skip)
+static int sd_check_left(struct virtual_item *vi, int free,
+			 int start_skip, int end_skip)
 {
-    if (start_skip || end_skip)
-	BUG ();
-    return -1;
+	if (start_skip || end_skip)
+		BUG();
+	return -1;
 }
 
-
-static int sd_check_right (struct virtual_item * vi, int free)
+static int sd_check_right(struct virtual_item *vi, int free)
 {
-    return -1;
+	return -1;
 }
 
-static int sd_part_size (struct virtual_item * vi, int first, int count)
+static int sd_part_size(struct virtual_item *vi, int first, int count)
 {
-    if (count)
-	BUG ();
-    return 0;
+	if (count)
+		BUG();
+	return 0;
 }
 
-static int sd_unit_num (struct virtual_item * vi)
+static int sd_unit_num(struct virtual_item *vi)
 {
-    return vi->vi_item_len - IH_SIZE;
+	return vi->vi_item_len - IH_SIZE;
 }
 
-
-static void sd_print_vi (struct virtual_item * vi)
+static void sd_print_vi(struct virtual_item *vi)
 {
-    reiserfs_warning (NULL, "STATDATA, index %d, type 0x%x, %h",
-		      vi->vi_index, vi->vi_type, vi->vi_ih);
+	reiserfs_warning(NULL, "STATDATA, index %d, type 0x%x, %h",
+			 vi->vi_index, vi->vi_type, vi->vi_ih);
 }
 
 static struct item_operations stat_data_ops = {
-	.bytes_number		= sd_bytes_number,
-	.decrement_key		= sd_decrement_key,
-	.is_left_mergeable	= sd_is_left_mergeable,
-	.print_item		= sd_print_item,
-	.check_item		= sd_check_item,
-
-	.create_vi		= sd_create_vi,
-	.check_left		= sd_check_left,
-	.check_right		= sd_check_right,
-	.part_size		= sd_part_size,
-	.unit_num		= sd_unit_num,
-	.print_vi		= sd_print_vi
+	.bytes_number = sd_bytes_number,
+	.decrement_key = sd_decrement_key,
+	.is_left_mergeable = sd_is_left_mergeable,
+	.print_item = sd_print_item,
+	.check_item = sd_check_item,
+
+	.create_vi = sd_create_vi,
+	.check_left = sd_check_left,
+	.check_right = sd_check_right,
+	.part_size = sd_part_size,
+	.unit_num = sd_unit_num,
+	.print_vi = sd_print_vi
 };
 
-
-
 //////////////////////////////////////////////////////////////////////////////
 // direct item functions
 //
-static int direct_bytes_number (struct item_head * ih, int block_size)
+static int direct_bytes_number(struct item_head *ih, int block_size)
 {
-  return ih_item_len(ih);
+	return ih_item_len(ih);
 }
 
-
 // FIXME: this should probably switch to indirect as well
-static void direct_decrement_key (struct cpu_key * key)
+static void direct_decrement_key(struct cpu_key *key)
 {
-    cpu_key_k_offset_dec (key);
-    if (cpu_key_k_offset (key) == 0)
-	set_cpu_key_k_type (key, TYPE_STAT_DATA);	
+	cpu_key_k_offset_dec(key);
+	if (cpu_key_k_offset(key) == 0)
+		set_cpu_key_k_type(key, TYPE_STAT_DATA);
 }
 
-
-static int direct_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
+static int direct_is_left_mergeable(struct reiserfs_key *key,
+				    unsigned long bsize)
 {
-    int version = le_key_version (key);
-    return ((le_key_k_offset (version, key) & (bsize - 1)) != 1);
+	int version = le_key_version(key);
+	return ((le_key_k_offset(version, key) & (bsize - 1)) != 1);
 }
 
-
-static void direct_print_item (struct item_head * ih, char * item)
+static void direct_print_item(struct item_head *ih, char *item)
 {
-    int j = 0;
+	int j = 0;
 
 //    return;
-    printk ("\"");
-    while (j < ih_item_len(ih))
-	printk ("%c", item[j++]);
-    printk ("\"\n");
+	printk("\"");
+	while (j < ih_item_len(ih))
+		printk("%c", item[j++]);
+	printk("\"\n");
 }
 
-
-static void direct_check_item (struct item_head * ih, char * item)
+static void direct_check_item(struct item_head *ih, char *item)
 {
-    // FIXME: type something here!
+	// FIXME: type something here!
 }
 
-
-static int direct_create_vi (struct virtual_node * vn,
-			     struct virtual_item * vi, 
-			     int is_affected, 
-			     int insert_size)
+static int direct_create_vi(struct virtual_node *vn,
+			    struct virtual_item *vi,
+			    int is_affected, int insert_size)
 {
-    vi->vi_index = TYPE_DIRECT;
-    //vi->vi_type |= VI_TYPE_DIRECT;
-    return 0;
+	vi->vi_index = TYPE_DIRECT;
+	//vi->vi_type |= VI_TYPE_DIRECT;
+	return 0;
 }
 
-static int direct_check_left (struct virtual_item * vi, int free,
-			      int start_skip, int end_skip)
+static int direct_check_left(struct virtual_item *vi, int free,
+			     int start_skip, int end_skip)
 {
-    int bytes;
+	int bytes;
 
-    bytes = free - free % 8;
-    return bytes ?: -1;    
+	bytes = free - free % 8;
+	return bytes ? : -1;
 }
 
-
-static int direct_check_right (struct virtual_item * vi, int free)
+static int direct_check_right(struct virtual_item *vi, int free)
 {
-    return direct_check_left (vi, free, 0, 0);
+	return direct_check_left(vi, free, 0, 0);
 }
 
-static int direct_part_size (struct virtual_item * vi, int first, int count)
+static int direct_part_size(struct virtual_item *vi, int first, int count)
 {
-    return count;
+	return count;
 }
 
-
-static int direct_unit_num (struct virtual_item * vi)
+static int direct_unit_num(struct virtual_item *vi)
 {
-    return vi->vi_item_len - IH_SIZE;
+	return vi->vi_item_len - IH_SIZE;
 }
 
-
-static void direct_print_vi (struct virtual_item * vi)
+static void direct_print_vi(struct virtual_item *vi)
 {
-    reiserfs_warning (NULL, "DIRECT, index %d, type 0x%x, %h",
-		      vi->vi_index, vi->vi_type, vi->vi_ih);
+	reiserfs_warning(NULL, "DIRECT, index %d, type 0x%x, %h",
+			 vi->vi_index, vi->vi_type, vi->vi_ih);
 }
 
 static struct item_operations direct_ops = {
-	.bytes_number		= direct_bytes_number,
-	.decrement_key		= direct_decrement_key,
-	.is_left_mergeable	= direct_is_left_mergeable,
-	.print_item		= direct_print_item,
-	.check_item		= direct_check_item,
-
-	.create_vi		= direct_create_vi,
-	.check_left		= direct_check_left,
-	.check_right		= direct_check_right,
-	.part_size		= direct_part_size,
-	.unit_num		= direct_unit_num,
-	.print_vi		= direct_print_vi
+	.bytes_number = direct_bytes_number,
+	.decrement_key = direct_decrement_key,
+	.is_left_mergeable = direct_is_left_mergeable,
+	.print_item = direct_print_item,
+	.check_item = direct_check_item,
+
+	.create_vi = direct_create_vi,
+	.check_left = direct_check_left,
+	.check_right = direct_check_right,
+	.part_size = direct_part_size,
+	.unit_num = direct_unit_num,
+	.print_vi = direct_print_vi
 };
 
-
-
 //////////////////////////////////////////////////////////////////////////////
 // indirect item functions
 //
 
-static int indirect_bytes_number (struct item_head * ih, int block_size)
+static int indirect_bytes_number(struct item_head *ih, int block_size)
 {
-  return ih_item_len(ih) / UNFM_P_SIZE * block_size; //- get_ih_free_space (ih);
+	return ih_item_len(ih) / UNFM_P_SIZE * block_size;	//- get_ih_free_space (ih);
 }
 
-
 // decrease offset, if it becomes 0, change type to stat data
-static void indirect_decrement_key (struct cpu_key * key)
+static void indirect_decrement_key(struct cpu_key *key)
 {
-    cpu_key_k_offset_dec (key);
-    if (cpu_key_k_offset (key) == 0)
-	set_cpu_key_k_type (key, TYPE_STAT_DATA);
+	cpu_key_k_offset_dec(key);
+	if (cpu_key_k_offset(key) == 0)
+		set_cpu_key_k_type(key, TYPE_STAT_DATA);
 }
 
-
 // if it is not first item of the body, then it is mergeable
-static int indirect_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
+static int indirect_is_left_mergeable(struct reiserfs_key *key,
+				      unsigned long bsize)
 {
-    int version = le_key_version (key);
-    return (le_key_k_offset (version, key) != 1);
+	int version = le_key_version(key);
+	return (le_key_k_offset(version, key) != 1);
 }
 
-
 // printing of indirect item
-static void start_new_sequence (__u32 * start, int * len, __u32 new)
+static void start_new_sequence(__u32 * start, int *len, __u32 new)
 {
-    *start = new;
-    *len = 1;
+	*start = new;
+	*len = 1;
 }
 
-
-static int sequence_finished (__u32 start, int * len, __u32 new)
+static int sequence_finished(__u32 start, int *len, __u32 new)
 {
-    if (start == INT_MAX)
-	return 1;
+	if (start == INT_MAX)
+		return 1;
 
-    if (start == 0 && new == 0) {
-	(*len) ++;
-	return 0;
-    }
-    if (start != 0 && (start + *len) == new) {
-	(*len) ++;
-	return 0;
-    }
-    return 1;
+	if (start == 0 && new == 0) {
+		(*len)++;
+		return 0;
+	}
+	if (start != 0 && (start + *len) == new) {
+		(*len)++;
+		return 0;
+	}
+	return 1;
 }
 
-static void print_sequence (__u32 start, int len)
+static void print_sequence(__u32 start, int len)
 {
-    if (start == INT_MAX)
-	return;
+	if (start == INT_MAX)
+		return;
 
-    if (len == 1)
-	printk (" %d", start);
-    else
-	printk (" %d(%d)", start, len);
+	if (len == 1)
+		printk(" %d", start);
+	else
+		printk(" %d(%d)", start, len);
 }
 
-
-static void indirect_print_item (struct item_head * ih, char * item)
+static void indirect_print_item(struct item_head *ih, char *item)
 {
-    int j;
-    __le32 * unp;
-    __u32 prev = INT_MAX;
-    int num;
+	int j;
+	__le32 *unp;
+	__u32 prev = INT_MAX;
+	int num;
 
-    unp = (__le32 *)item;
+	unp = (__le32 *) item;
 
-    if (ih_item_len(ih) % UNFM_P_SIZE)
-	reiserfs_warning (NULL, "indirect_print_item: invalid item len");
+	if (ih_item_len(ih) % UNFM_P_SIZE)
+		reiserfs_warning(NULL, "indirect_print_item: invalid item len");
 
-    printk ("%d pointers\n[ ", (int)I_UNFM_NUM (ih));
-    for (j = 0; j < I_UNFM_NUM (ih); j ++) {
-	if (sequence_finished (prev, &num, get_block_num(unp, j))) {
-	    print_sequence (prev, num);
-	    start_new_sequence (&prev, &num, get_block_num(unp, j));
+	printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih));
+	for (j = 0; j < I_UNFM_NUM(ih); j++) {
+		if (sequence_finished(prev, &num, get_block_num(unp, j))) {
+			print_sequence(prev, num);
+			start_new_sequence(&prev, &num, get_block_num(unp, j));
+		}
 	}
-    }
-    print_sequence (prev, num);
-    printk ("]\n");
+	print_sequence(prev, num);
+	printk("]\n");
 }
 
-static void indirect_check_item (struct item_head * ih, char * item)
+static void indirect_check_item(struct item_head *ih, char *item)
 {
-    // FIXME: type something here!
+	// FIXME: type something here!
 }
 
-
-static int indirect_create_vi (struct virtual_node * vn,
-			       struct virtual_item * vi, 
-			       int is_affected, 
-			       int insert_size)
+static int indirect_create_vi(struct virtual_node *vn,
+			      struct virtual_item *vi,
+			      int is_affected, int insert_size)
 {
-    vi->vi_index = TYPE_INDIRECT;
-    //vi->vi_type |= VI_TYPE_INDIRECT;
-    return 0;
+	vi->vi_index = TYPE_INDIRECT;
+	//vi->vi_type |= VI_TYPE_INDIRECT;
+	return 0;
 }
 
-static int indirect_check_left (struct virtual_item * vi, int free,
-				int start_skip, int end_skip)
+static int indirect_check_left(struct virtual_item *vi, int free,
+			       int start_skip, int end_skip)
 {
-    int bytes;
+	int bytes;
 
-    bytes = free - free % UNFM_P_SIZE;
-    return bytes ?: -1;    
+	bytes = free - free % UNFM_P_SIZE;
+	return bytes ? : -1;
 }
 
-
-static int indirect_check_right (struct virtual_item * vi, int free)
+static int indirect_check_right(struct virtual_item *vi, int free)
 {
-    return indirect_check_left (vi, free, 0, 0);
+	return indirect_check_left(vi, free, 0, 0);
 }
 
-
-
 // return size in bytes of 'units' units. If first == 0 - calculate from the head (left), otherwise - from tail (right)
-static int indirect_part_size (struct virtual_item * vi, int first, int units)
+static int indirect_part_size(struct virtual_item *vi, int first, int units)
 {
-    // unit of indirect item is byte (yet)
-    return units;
+	// unit of indirect item is byte (yet)
+	return units;
 }
 
-static int indirect_unit_num (struct virtual_item * vi)
+static int indirect_unit_num(struct virtual_item *vi)
 {
-    // unit of indirect item is byte (yet)
-    return vi->vi_item_len - IH_SIZE;
+	// unit of indirect item is byte (yet)
+	return vi->vi_item_len - IH_SIZE;
 }
 
-static void indirect_print_vi (struct virtual_item * vi)
+static void indirect_print_vi(struct virtual_item *vi)
 {
-    reiserfs_warning (NULL, "INDIRECT, index %d, type 0x%x, %h",
-		      vi->vi_index, vi->vi_type, vi->vi_ih);
+	reiserfs_warning(NULL, "INDIRECT, index %d, type 0x%x, %h",
+			 vi->vi_index, vi->vi_type, vi->vi_ih);
 }
 
 static struct item_operations indirect_ops = {
-	.bytes_number		= indirect_bytes_number,
-	.decrement_key		= indirect_decrement_key,
-	.is_left_mergeable	= indirect_is_left_mergeable,
-	.print_item		= indirect_print_item,
-	.check_item		= indirect_check_item,
-
-	.create_vi		= indirect_create_vi,
-	.check_left		= indirect_check_left,
-	.check_right		= indirect_check_right,
-	.part_size		= indirect_part_size,
-	.unit_num		= indirect_unit_num,
-	.print_vi		= indirect_print_vi
+	.bytes_number = indirect_bytes_number,
+	.decrement_key = indirect_decrement_key,
+	.is_left_mergeable = indirect_is_left_mergeable,
+	.print_item = indirect_print_item,
+	.check_item = indirect_check_item,
+
+	.create_vi = indirect_create_vi,
+	.check_left = indirect_check_left,
+	.check_right = indirect_check_right,
+	.part_size = indirect_part_size,
+	.unit_num = indirect_unit_num,
+	.print_vi = indirect_print_vi
 };
 
-
 //////////////////////////////////////////////////////////////////////////////
 // direntry functions
 //
 
-
-static int direntry_bytes_number (struct item_head * ih, int block_size)
+static int direntry_bytes_number(struct item_head *ih, int block_size)
 {
-    reiserfs_warning (NULL, "vs-16090: direntry_bytes_number: "
-		      "bytes number is asked for direntry");
-    return 0;
-}
-
-static void direntry_decrement_key (struct cpu_key * key)
-{
-    cpu_key_k_offset_dec (key);
-    if (cpu_key_k_offset (key) == 0)
-	set_cpu_key_k_type (key, TYPE_STAT_DATA);	
+	reiserfs_warning(NULL, "vs-16090: direntry_bytes_number: "
+			 "bytes number is asked for direntry");
+	return 0;
 }
 
-
-static int direntry_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
+static void direntry_decrement_key(struct cpu_key *key)
 {
-    if (le32_to_cpu (key->u.k_offset_v1.k_offset) == DOT_OFFSET)
-	return 0;
-    return 1;
-	
+	cpu_key_k_offset_dec(key);
+	if (cpu_key_k_offset(key) == 0)
+		set_cpu_key_k_type(key, TYPE_STAT_DATA);
 }
 
-
-static void direntry_print_item (struct item_head * ih, char * item)
+static int direntry_is_left_mergeable(struct reiserfs_key *key,
+				      unsigned long bsize)
 {
-    int i;
-    int namelen;
-    struct reiserfs_de_head * deh;
-    char * name;
-    static char namebuf [80];
-
-
-    printk ("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name", "Key of pointed object", "Hash", "Gen number", "Status");
+	if (le32_to_cpu(key->u.k_offset_v1.k_offset) == DOT_OFFSET)
+		return 0;
+	return 1;
 
-    deh = (struct reiserfs_de_head *)item;
+}
 
-    for (i = 0; i < I_ENTRY_COUNT (ih); i ++, deh ++) {
-	namelen = (i ? (deh_location(deh - 1)) : ih_item_len(ih)) - deh_location(deh);
-	name = item + deh_location(deh);
-	if (name[namelen-1] == 0)
-	  namelen = strlen (name);
-	namebuf[0] = '"';
-	if (namelen > sizeof (namebuf) - 3) {
-	    strncpy (namebuf + 1, name, sizeof (namebuf) - 3);
-	    namebuf[sizeof (namebuf) - 2] = '"';
-	    namebuf[sizeof (namebuf) - 1] = 0;
-	} else {
-	    memcpy (namebuf + 1, name, namelen);
-	    namebuf[namelen + 1] = '"';
-	    namebuf[namelen + 2] = 0;
+static void direntry_print_item(struct item_head *ih, char *item)
+{
+	int i;
+	int namelen;
+	struct reiserfs_de_head *deh;
+	char *name;
+	static char namebuf[80];
+
+	printk("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name",
+	       "Key of pointed object", "Hash", "Gen number", "Status");
+
+	deh = (struct reiserfs_de_head *)item;
+
+	for (i = 0; i < I_ENTRY_COUNT(ih); i++, deh++) {
+		namelen =
+		    (i ? (deh_location(deh - 1)) : ih_item_len(ih)) -
+		    deh_location(deh);
+		name = item + deh_location(deh);
+		if (name[namelen - 1] == 0)
+			namelen = strlen(name);
+		namebuf[0] = '"';
+		if (namelen > sizeof(namebuf) - 3) {
+			strncpy(namebuf + 1, name, sizeof(namebuf) - 3);
+			namebuf[sizeof(namebuf) - 2] = '"';
+			namebuf[sizeof(namebuf) - 1] = 0;
+		} else {
+			memcpy(namebuf + 1, name, namelen);
+			namebuf[namelen + 1] = '"';
+			namebuf[namelen + 2] = 0;
+		}
+
+		printk("%d:  %-15s%-15d%-15d%-15Ld%-15Ld(%s)\n",
+		       i, namebuf,
+		       deh_dir_id(deh), deh_objectid(deh),
+		       GET_HASH_VALUE(deh_offset(deh)),
+		       GET_GENERATION_NUMBER((deh_offset(deh))),
+		       (de_hidden(deh)) ? "HIDDEN" : "VISIBLE");
 	}
-
-	printk ("%d:  %-15s%-15d%-15d%-15Ld%-15Ld(%s)\n", 
-		i, namebuf,
-		deh_dir_id(deh), deh_objectid(deh),
-		GET_HASH_VALUE (deh_offset (deh)), GET_GENERATION_NUMBER ((deh_offset (deh))),
-		(de_hidden (deh)) ? "HIDDEN" : "VISIBLE");
-    }
 }
 
-
-static void direntry_check_item (struct item_head * ih, char * item)
+static void direntry_check_item(struct item_head *ih, char *item)
 {
-    int i;
-    struct reiserfs_de_head * deh;
+	int i;
+	struct reiserfs_de_head *deh;
 
-    // FIXME: type something here!
-    deh = (struct reiserfs_de_head *)item;
-    for (i = 0; i < I_ENTRY_COUNT (ih); i ++, deh ++) {
-	;
-    }
+	// FIXME: type something here!
+	deh = (struct reiserfs_de_head *)item;
+	for (i = 0; i < I_ENTRY_COUNT(ih); i++, deh++) {
+		;
+	}
 }
 
-
-
 #define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1
 
 /*
  * function returns old entry number in directory item in real node
  * using new entry number in virtual item in virtual node */
-static inline int old_entry_num (int is_affected, int virtual_entry_num, int pos_in_item, int mode)
+static inline int old_entry_num(int is_affected, int virtual_entry_num,
+				int pos_in_item, int mode)
 {
-    if ( mode == M_INSERT || mode == M_DELETE)
-	return virtual_entry_num;
-    
-    if (!is_affected)
-	/* cut or paste is applied to another item */
-	return virtual_entry_num;
-
-    if (virtual_entry_num < pos_in_item)
-	return virtual_entry_num;
+	if (mode == M_INSERT || mode == M_DELETE)
+		return virtual_entry_num;
 
-    if (mode == M_CUT)
-	return virtual_entry_num + 1;
+	if (!is_affected)
+		/* cut or paste is applied to another item */
+		return virtual_entry_num;
 
-    RFALSE( mode != M_PASTE || virtual_entry_num == 0,
-	    "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'", mode);
-    
-    return virtual_entry_num - 1;
-}
+	if (virtual_entry_num < pos_in_item)
+		return virtual_entry_num;
 
+	if (mode == M_CUT)
+		return virtual_entry_num + 1;
 
+	RFALSE(mode != M_PASTE || virtual_entry_num == 0,
+	       "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'",
+	       mode);
 
+	return virtual_entry_num - 1;
+}
 
 /* Create an array of sizes of directory entries for virtual
    item. Return space used by an item. FIXME: no control over
    consuming of space used by this item handler */
-static int direntry_create_vi (struct virtual_node * vn,
-			       struct virtual_item * vi, 
-			       int is_affected, 
-			       int insert_size)
-{
-    struct direntry_uarea * dir_u = vi->vi_uarea;
-    int i, j;
-    int size = sizeof (struct direntry_uarea);
-    struct reiserfs_de_head * deh;
-  
-    vi->vi_index = TYPE_DIRENTRY;
-
-    if (!(vi->vi_ih) || !vi->vi_item)
-	BUG ();
-
-
-    dir_u->flags = 0;
-    if (le_ih_k_offset (vi->vi_ih) == DOT_OFFSET)
-	dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM;
-
-    deh = (struct reiserfs_de_head *)(vi->vi_item);
-    
-    
-    /* virtual directory item have this amount of entry after */
-    dir_u->entry_count = ih_entry_count (vi->vi_ih) + 
-	((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 :
-			  (vn->vn_mode == M_PASTE ? 1 : 0)) : 0);
-    
-    for (i = 0; i < dir_u->entry_count; i ++) {
-	j = old_entry_num (is_affected, i, vn->vn_pos_in_item, vn->vn_mode);
-        dir_u->entry_sizes[i] = (j ? deh_location( &(deh[j - 1]) ) :
-                                ih_item_len (vi->vi_ih)) -
-                                deh_location( &(deh[j])) + DEH_SIZE;
-    }
-
-    size += (dir_u->entry_count * sizeof (short));
-    
-    /* set size of pasted entry */
-    if (is_affected && vn->vn_mode == M_PASTE)
-	dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size;
+static int direntry_create_vi(struct virtual_node *vn,
+			      struct virtual_item *vi,
+			      int is_affected, int insert_size)
+{
+	struct direntry_uarea *dir_u = vi->vi_uarea;
+	int i, j;
+	int size = sizeof(struct direntry_uarea);
+	struct reiserfs_de_head *deh;
 
+	vi->vi_index = TYPE_DIRENTRY;
+
+	if (!(vi->vi_ih) || !vi->vi_item)
+		BUG();
+
+	dir_u->flags = 0;
+	if (le_ih_k_offset(vi->vi_ih) == DOT_OFFSET)
+		dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM;
+
+	deh = (struct reiserfs_de_head *)(vi->vi_item);
+
+	/* virtual directory item have this amount of entry after */
+	dir_u->entry_count = ih_entry_count(vi->vi_ih) +
+	    ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 :
+			      (vn->vn_mode == M_PASTE ? 1 : 0)) : 0);
+
+	for (i = 0; i < dir_u->entry_count; i++) {
+		j = old_entry_num(is_affected, i, vn->vn_pos_in_item,
+				  vn->vn_mode);
+		dir_u->entry_sizes[i] =
+		    (j ? deh_location(&(deh[j - 1])) : ih_item_len(vi->vi_ih)) -
+		    deh_location(&(deh[j])) + DEH_SIZE;
+	}
+
+	size += (dir_u->entry_count * sizeof(short));
+
+	/* set size of pasted entry */
+	if (is_affected && vn->vn_mode == M_PASTE)
+		dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size;
 
 #ifdef CONFIG_REISERFS_CHECK
-    /* compare total size of entries with item length */
-    {
-	int k, l;
-    
-	l = 0;
-	for (k = 0; k < dir_u->entry_count; k ++)
-	    l += dir_u->entry_sizes[k];
-    
-	if (l + IH_SIZE != vi->vi_item_len + 
-	    ((is_affected && (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT)) ? insert_size : 0) ) {
-	    reiserfs_panic (NULL, "vs-8025: set_entry_sizes: (mode==%c, insert_size==%d), invalid length of directory item",
-			    vn->vn_mode, insert_size);
+	/* compare total size of entries with item length */
+	{
+		int k, l;
+
+		l = 0;
+		for (k = 0; k < dir_u->entry_count; k++)
+			l += dir_u->entry_sizes[k];
+
+		if (l + IH_SIZE != vi->vi_item_len +
+		    ((is_affected
+		      && (vn->vn_mode == M_PASTE
+			  || vn->vn_mode == M_CUT)) ? insert_size : 0)) {
+			reiserfs_panic(NULL,
+				       "vs-8025: set_entry_sizes: (mode==%c, insert_size==%d), invalid length of directory item",
+				       vn->vn_mode, insert_size);
+		}
 	}
-    }
 #endif
 
-    return size;
-
+	return size;
 
 }
 
-
 //
 // return number of entries which may fit into specified amount of
 // free space, or -1 if free space is not enough even for 1 entry
 //
-static int direntry_check_left (struct virtual_item * vi, int free,
-				int start_skip, int end_skip)
+static int direntry_check_left(struct virtual_item *vi, int free,
+			       int start_skip, int end_skip)
 {
-    int i;
-    int entries = 0;
-    struct direntry_uarea * dir_u = vi->vi_uarea;
+	int i;
+	int entries = 0;
+	struct direntry_uarea *dir_u = vi->vi_uarea;
 
-    for (i = start_skip; i < dir_u->entry_count - end_skip; i ++) {
-	if (dir_u->entry_sizes[i] > free)
-	    /* i-th entry doesn't fit into the remaining free space */
-	    break;
-		  
-	free -= dir_u->entry_sizes[i];
-	entries ++;
-    }
+	for (i = start_skip; i < dir_u->entry_count - end_skip; i++) {
+		if (dir_u->entry_sizes[i] > free)
+			/* i-th entry doesn't fit into the remaining free space */
+			break;
 
-    if (entries == dir_u->entry_count) {
-	reiserfs_panic (NULL, "free space %d, entry_count %d\n", free, dir_u->entry_count);
-    }
+		free -= dir_u->entry_sizes[i];
+		entries++;
+	}
 
-    /* "." and ".." can not be separated from each other */
-    if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) && entries < 2)
-	entries = 0;
-    
-    return entries ?: -1;
-}
+	if (entries == dir_u->entry_count) {
+		reiserfs_panic(NULL, "free space %d, entry_count %d\n", free,
+			       dir_u->entry_count);
+	}
 
+	/* "." and ".." can not be separated from each other */
+	if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM)
+	    && entries < 2)
+		entries = 0;
 
-static int direntry_check_right (struct virtual_item * vi, int free)
+	return entries ? : -1;
+}
+
+static int direntry_check_right(struct virtual_item *vi, int free)
 {
-    int i;
-    int entries = 0;
-    struct direntry_uarea * dir_u = vi->vi_uarea;
-    
-    for (i = dir_u->entry_count - 1; i >= 0; i --) {
-	if (dir_u->entry_sizes[i] > free)
-	    /* i-th entry doesn't fit into the remaining free space */
-	    break;
-	
-	free -= dir_u->entry_sizes[i];
-	entries ++;
-    }
-    if (entries == dir_u->entry_count)
-	BUG ();
+	int i;
+	int entries = 0;
+	struct direntry_uarea *dir_u = vi->vi_uarea;
 
-    /* "." and ".." can not be separated from each other */
-    if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) && entries > dir_u->entry_count - 2)
-	entries = dir_u->entry_count - 2;
+	for (i = dir_u->entry_count - 1; i >= 0; i--) {
+		if (dir_u->entry_sizes[i] > free)
+			/* i-th entry doesn't fit into the remaining free space */
+			break;
 
-    return entries ?: -1;
-}
+		free -= dir_u->entry_sizes[i];
+		entries++;
+	}
+	if (entries == dir_u->entry_count)
+		BUG();
 
+	/* "." and ".." can not be separated from each other */
+	if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM)
+	    && entries > dir_u->entry_count - 2)
+		entries = dir_u->entry_count - 2;
+
+	return entries ? : -1;
+}
 
 /* sum of entry sizes between from-th and to-th entries including both edges */
-static int direntry_part_size (struct virtual_item * vi, int first, int count)
+static int direntry_part_size(struct virtual_item *vi, int first, int count)
 {
-    int i, retval;
-    int from, to;
-    struct direntry_uarea * dir_u = vi->vi_uarea;
-    
-    retval = 0;
-    if (first == 0)
-	from = 0;
-    else
-	from = dir_u->entry_count - count;
-    to = from + count - 1;
+	int i, retval;
+	int from, to;
+	struct direntry_uarea *dir_u = vi->vi_uarea;
 
-    for (i = from; i <= to; i ++)
-	retval += dir_u->entry_sizes[i];
+	retval = 0;
+	if (first == 0)
+		from = 0;
+	else
+		from = dir_u->entry_count - count;
+	to = from + count - 1;
 
-    return retval;
-}
+	for (i = from; i <= to; i++)
+		retval += dir_u->entry_sizes[i];
 
-static int direntry_unit_num (struct virtual_item * vi)
-{
-    struct direntry_uarea * dir_u = vi->vi_uarea;
-    
-    return dir_u->entry_count;
+	return retval;
 }
 
+static int direntry_unit_num(struct virtual_item *vi)
+{
+	struct direntry_uarea *dir_u = vi->vi_uarea;
 
+	return dir_u->entry_count;
+}
 
-static void direntry_print_vi (struct virtual_item * vi)
+static void direntry_print_vi(struct virtual_item *vi)
 {
-    int i;
-    struct direntry_uarea * dir_u = vi->vi_uarea;
+	int i;
+	struct direntry_uarea *dir_u = vi->vi_uarea;
 
-    reiserfs_warning (NULL, "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x",
-		      vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags);
-    printk ("%d entries: ", dir_u->entry_count);
-    for (i = 0; i < dir_u->entry_count; i ++)
-	printk ("%d ", dir_u->entry_sizes[i]);
-    printk ("\n");
+	reiserfs_warning(NULL, "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x",
+			 vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags);
+	printk("%d entries: ", dir_u->entry_count);
+	for (i = 0; i < dir_u->entry_count; i++)
+		printk("%d ", dir_u->entry_sizes[i]);
+	printk("\n");
 }
 
 static struct item_operations direntry_ops = {
-	.bytes_number		= direntry_bytes_number,
-	.decrement_key		= direntry_decrement_key,
-	.is_left_mergeable	= direntry_is_left_mergeable,
-	.print_item		= direntry_print_item,
-	.check_item		= direntry_check_item,
-
-	.create_vi		= direntry_create_vi,
-	.check_left		= direntry_check_left,
-	.check_right		= direntry_check_right,
-	.part_size		= direntry_part_size,
-	.unit_num		= direntry_unit_num,
-	.print_vi		= direntry_print_vi
+	.bytes_number = direntry_bytes_number,
+	.decrement_key = direntry_decrement_key,
+	.is_left_mergeable = direntry_is_left_mergeable,
+	.print_item = direntry_print_item,
+	.check_item = direntry_check_item,
+
+	.create_vi = direntry_create_vi,
+	.check_left = direntry_check_left,
+	.check_right = direntry_check_right,
+	.part_size = direntry_part_size,
+	.unit_num = direntry_unit_num,
+	.print_vi = direntry_print_vi
 };
 
-
 //////////////////////////////////////////////////////////////////////////////
 // Error catching functions to catch errors caused by incorrect item types.
 //
-static int errcatch_bytes_number (struct item_head * ih, int block_size)
+static int errcatch_bytes_number(struct item_head *ih, int block_size)
 {
-    reiserfs_warning (NULL, "green-16001: Invalid item type observed, run fsck ASAP");
-    return 0;
+	reiserfs_warning(NULL,
+			 "green-16001: Invalid item type observed, run fsck ASAP");
+	return 0;
 }
 
-static void errcatch_decrement_key (struct cpu_key * key)
+static void errcatch_decrement_key(struct cpu_key *key)
 {
-    reiserfs_warning (NULL, "green-16002: Invalid item type observed, run fsck ASAP");
+	reiserfs_warning(NULL,
+			 "green-16002: Invalid item type observed, run fsck ASAP");
 }
 
-
-static int errcatch_is_left_mergeable (struct reiserfs_key * key, unsigned long bsize)
+static int errcatch_is_left_mergeable(struct reiserfs_key *key,
+				      unsigned long bsize)
 {
-    reiserfs_warning (NULL, "green-16003: Invalid item type observed, run fsck ASAP");
-    return 0;
+	reiserfs_warning(NULL,
+			 "green-16003: Invalid item type observed, run fsck ASAP");
+	return 0;
 }
 
-
-static void errcatch_print_item (struct item_head * ih, char * item)
+static void errcatch_print_item(struct item_head *ih, char *item)
 {
-    reiserfs_warning (NULL, "green-16004: Invalid item type observed, run fsck ASAP");
+	reiserfs_warning(NULL,
+			 "green-16004: Invalid item type observed, run fsck ASAP");
 }
 
-
-static void errcatch_check_item (struct item_head * ih, char * item)
+static void errcatch_check_item(struct item_head *ih, char *item)
 {
-    reiserfs_warning (NULL, "green-16005: Invalid item type observed, run fsck ASAP");
+	reiserfs_warning(NULL,
+			 "green-16005: Invalid item type observed, run fsck ASAP");
 }
 
-static int errcatch_create_vi (struct virtual_node * vn,
-			       struct virtual_item * vi, 
-			       int is_affected, 
-			       int insert_size)
+static int errcatch_create_vi(struct virtual_node *vn,
+			      struct virtual_item *vi,
+			      int is_affected, int insert_size)
 {
-    reiserfs_warning (NULL, "green-16006: Invalid item type observed, run fsck ASAP");
-    return 0;	// We might return -1 here as well, but it won't help as create_virtual_node() from where
-		// this operation is called from is of return type void.
+	reiserfs_warning(NULL,
+			 "green-16006: Invalid item type observed, run fsck ASAP");
+	return 0;		// We might return -1 here as well, but it won't help as create_virtual_node() from where
+	// this operation is called from is of return type void.
 }
 
-static int errcatch_check_left (struct virtual_item * vi, int free,
-				int start_skip, int end_skip)
+static int errcatch_check_left(struct virtual_item *vi, int free,
+			       int start_skip, int end_skip)
 {
-    reiserfs_warning (NULL, "green-16007: Invalid item type observed, run fsck ASAP");
-    return -1;
+	reiserfs_warning(NULL,
+			 "green-16007: Invalid item type observed, run fsck ASAP");
+	return -1;
 }
 
-
-static int errcatch_check_right (struct virtual_item * vi, int free)
+static int errcatch_check_right(struct virtual_item *vi, int free)
 {
-    reiserfs_warning (NULL, "green-16008: Invalid item type observed, run fsck ASAP");
-    return -1;
+	reiserfs_warning(NULL,
+			 "green-16008: Invalid item type observed, run fsck ASAP");
+	return -1;
 }
 
-static int errcatch_part_size (struct virtual_item * vi, int first, int count)
+static int errcatch_part_size(struct virtual_item *vi, int first, int count)
 {
-    reiserfs_warning (NULL, "green-16009: Invalid item type observed, run fsck ASAP");
-    return 0;
+	reiserfs_warning(NULL,
+			 "green-16009: Invalid item type observed, run fsck ASAP");
+	return 0;
 }
 
-static int errcatch_unit_num (struct virtual_item * vi)
+static int errcatch_unit_num(struct virtual_item *vi)
 {
-    reiserfs_warning (NULL, "green-16010: Invalid item type observed, run fsck ASAP");
-    return 0;
+	reiserfs_warning(NULL,
+			 "green-16010: Invalid item type observed, run fsck ASAP");
+	return 0;
 }
 
-static void errcatch_print_vi (struct virtual_item * vi)
+static void errcatch_print_vi(struct virtual_item *vi)
 {
-    reiserfs_warning (NULL, "green-16011: Invalid item type observed, run fsck ASAP");
+	reiserfs_warning(NULL,
+			 "green-16011: Invalid item type observed, run fsck ASAP");
 }
 
 static struct item_operations errcatch_ops = {
-    errcatch_bytes_number,
-    errcatch_decrement_key,
-    errcatch_is_left_mergeable,
-    errcatch_print_item,
-    errcatch_check_item,
-
-    errcatch_create_vi,
-    errcatch_check_left,
-    errcatch_check_right,
-    errcatch_part_size,
-    errcatch_unit_num,
-    errcatch_print_vi
+	errcatch_bytes_number,
+	errcatch_decrement_key,
+	errcatch_is_left_mergeable,
+	errcatch_print_item,
+	errcatch_check_item,
+
+	errcatch_create_vi,
+	errcatch_check_left,
+	errcatch_check_right,
+	errcatch_part_size,
+	errcatch_unit_num,
+	errcatch_print_vi
 };
 
-
-
 //////////////////////////////////////////////////////////////////////////////
 //
 //
@@ -775,15 +744,11 @@ static struct item_operations errcatch_ops = {
 #error Item types must use disk-format assigned values.
 #endif
 
-struct item_operations * item_ops [TYPE_ANY + 1] = {
-  &stat_data_ops,
-  &indirect_ops,
-  &direct_ops,
-  &direntry_ops,
-  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-  &errcatch_ops		/* This is to catch errors with invalid type (15th entry for TYPE_ANY) */
+struct item_operations *item_ops[TYPE_ANY + 1] = {
+	&stat_data_ops,
+	&indirect_ops,
+	&direct_ops,
+	&direntry_ops,
+	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+	&errcatch_ops		/* This is to catch errors with invalid type (15th entry for TYPE_ANY) */
 };
-
-
-
-
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index d1bcf0da6728..c66c27ec4100 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -55,7 +55,6 @@
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 
-
 /* gets a struct reiserfs_journal_list * from a list head */
 #define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
                                j_list))
@@ -69,55 +68,61 @@ static int reiserfs_mounted_fs_count;
 
 static struct workqueue_struct *commit_wq;
 
-#define JOURNAL_TRANS_HALF 1018   /* must be correct to keep the desc and commit
-				     structs at 4k */
-#define BUFNR 64 /*read ahead */
+#define JOURNAL_TRANS_HALF 1018	/* must be correct to keep the desc and commit
+				   structs at 4k */
+#define BUFNR 64		/* read ahead */
 
 /* cnode stat bits.  Move these into reiserfs_fs.h */
 
 #define BLOCK_FREED 2		/* this block was freed, and can't be written.  */
-#define BLOCK_FREED_HOLDER 3    /* this block was freed during this transaction, and can't be written */
+#define BLOCK_FREED_HOLDER 3	/* this block was freed during this transaction, and can't be written */
 
 #define BLOCK_NEEDS_FLUSH 4	/* used in flush_journal_list */
 #define BLOCK_DIRTIED 5
 
-
 /* journal list state bits */
 #define LIST_TOUCHED 1
 #define LIST_DIRTY   2
-#define LIST_COMMIT_PENDING  4		/* someone will commit this list */
+#define LIST_COMMIT_PENDING  4	/* someone will commit this list */
 
 /* flags for do_journal_end */
 #define FLUSH_ALL   1		/* flush commit and real blocks */
 #define COMMIT_NOW  2		/* end and commit this transaction */
-#define WAIT        4		/* wait for the log blocks to hit the disk*/
-
-static int do_journal_end(struct reiserfs_transaction_handle *,struct super_block *,unsigned long nblocks,int flags) ;
-static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ;
-static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall)  ;
-static int can_dirty(struct reiserfs_journal_cnode *cn) ;
-static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks);
-static int release_journal_dev( struct super_block *super,
-				struct reiserfs_journal *journal );
+#define WAIT        4		/* wait for the log blocks to hit the disk */
+
+static int do_journal_end(struct reiserfs_transaction_handle *,
+			  struct super_block *, unsigned long nblocks,
+			  int flags);
+static int flush_journal_list(struct super_block *s,
+			      struct reiserfs_journal_list *jl, int flushall);
+static int flush_commit_list(struct super_block *s,
+			     struct reiserfs_journal_list *jl, int flushall);
+static int can_dirty(struct reiserfs_journal_cnode *cn);
+static int journal_join(struct reiserfs_transaction_handle *th,
+			struct super_block *p_s_sb, unsigned long nblocks);
+static int release_journal_dev(struct super_block *super,
+			       struct reiserfs_journal *journal);
 static int dirty_one_transaction(struct super_block *s,
-                                 struct reiserfs_journal_list *jl);
+				 struct reiserfs_journal_list *jl);
 static void flush_async_commits(void *p);
 static void queue_log_writer(struct super_block *s);
 
 /* values for join in do_journal_begin_r */
 enum {
-    JBEGIN_REG = 0, /* regular journal begin */
-    JBEGIN_JOIN = 1, /* join the running transaction if at all possible */
-    JBEGIN_ABORT = 2, /* called from cleanup code, ignores aborted flag */
+	JBEGIN_REG = 0,		/* regular journal begin */
+	JBEGIN_JOIN = 1,	/* join the running transaction if at all possible */
+	JBEGIN_ABORT = 2,	/* called from cleanup code, ignores aborted flag */
 };
 
 static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
-                             struct super_block * p_s_sb,
-			     unsigned long nblocks,int join);
+			      struct super_block *p_s_sb,
+			      unsigned long nblocks, int join);
 
-static void init_journal_hash(struct super_block *p_s_sb) {
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-  memset(journal->j_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
+static void init_journal_hash(struct super_block *p_s_sb)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	memset(journal->j_hash_table, 0,
+	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
 }
 
 /*
@@ -125,149 +130,159 @@ static void init_journal_hash(struct super_block *p_s_sb) {
 ** make schedule happen after I've freed a block.  Look at remove_from_transaction and journal_mark_freed for
 ** more details.
 */
-static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) {
-  if (bh) {
-    clear_buffer_dirty(bh);
-    clear_buffer_journal_test(bh);
-  }
-  return 0 ;
+static int reiserfs_clean_and_file_buffer(struct buffer_head *bh)
+{
+	if (bh) {
+		clear_buffer_dirty(bh);
+		clear_buffer_journal_test(bh);
+	}
+	return 0;
 }
 
 static void disable_barrier(struct super_block *s)
 {
-    REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH);
-    printk("reiserfs: disabling flush barriers on %s\n", reiserfs_bdevname(s));
-}
-
-static struct reiserfs_bitmap_node *
-allocate_bitmap_node(struct super_block *p_s_sb) {
-  struct reiserfs_bitmap_node *bn ;
-  static int id;
-
-  bn = reiserfs_kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS, p_s_sb) ;
-  if (!bn) {
-    return NULL ;
-  }
-  bn->data = reiserfs_kmalloc(p_s_sb->s_blocksize, GFP_NOFS, p_s_sb) ;
-  if (!bn->data) {
-    reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb) ;
-    return NULL ;
-  }
-  bn->id = id++ ;
-  memset(bn->data, 0, p_s_sb->s_blocksize) ;
-  INIT_LIST_HEAD(&bn->list) ;
-  return bn ;
-}
-
-static struct reiserfs_bitmap_node *
-get_bitmap_node(struct super_block *p_s_sb) {
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-  struct reiserfs_bitmap_node *bn = NULL;
-  struct list_head *entry = journal->j_bitmap_nodes.next ;
-
-  journal->j_used_bitmap_nodes++ ;
-repeat:
-
-  if(entry != &journal->j_bitmap_nodes) {
-    bn = list_entry(entry, struct reiserfs_bitmap_node, list) ;
-    list_del(entry) ;
-    memset(bn->data, 0, p_s_sb->s_blocksize) ;
-    journal->j_free_bitmap_nodes-- ;
-    return bn ;
-  }
-  bn = allocate_bitmap_node(p_s_sb) ;
-  if (!bn) {
-    yield();
-    goto repeat ;
-  }
-  return bn ;
+	REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH);
+	printk("reiserfs: disabling flush barriers on %s\n",
+	       reiserfs_bdevname(s));
+}
+
+static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block
+							 *p_s_sb)
+{
+	struct reiserfs_bitmap_node *bn;
+	static int id;
+
+	bn = reiserfs_kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS,
+			      p_s_sb);
+	if (!bn) {
+		return NULL;
+	}
+	bn->data = reiserfs_kmalloc(p_s_sb->s_blocksize, GFP_NOFS, p_s_sb);
+	if (!bn->data) {
+		reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb);
+		return NULL;
+	}
+	bn->id = id++;
+	memset(bn->data, 0, p_s_sb->s_blocksize);
+	INIT_LIST_HEAD(&bn->list);
+	return bn;
+}
+
+static struct reiserfs_bitmap_node *get_bitmap_node(struct super_block *p_s_sb)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_bitmap_node *bn = NULL;
+	struct list_head *entry = journal->j_bitmap_nodes.next;
+
+	journal->j_used_bitmap_nodes++;
+      repeat:
+
+	if (entry != &journal->j_bitmap_nodes) {
+		bn = list_entry(entry, struct reiserfs_bitmap_node, list);
+		list_del(entry);
+		memset(bn->data, 0, p_s_sb->s_blocksize);
+		journal->j_free_bitmap_nodes--;
+		return bn;
+	}
+	bn = allocate_bitmap_node(p_s_sb);
+	if (!bn) {
+		yield();
+		goto repeat;
+	}
+	return bn;
 }
 static inline void free_bitmap_node(struct super_block *p_s_sb,
-                                    struct reiserfs_bitmap_node *bn) {
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-  journal->j_used_bitmap_nodes-- ;
-  if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) {
-    reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb) ;
-    reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb) ;
-  } else {
-    list_add(&bn->list, &journal->j_bitmap_nodes) ;
-    journal->j_free_bitmap_nodes++ ;
-  }
-}
-
-static void allocate_bitmap_nodes(struct super_block *p_s_sb) {
-  int i ;
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-  struct reiserfs_bitmap_node *bn = NULL ;
-  for (i = 0 ; i < REISERFS_MIN_BITMAP_NODES ; i++) {
-    bn = allocate_bitmap_node(p_s_sb) ;
-    if (bn) {
-      list_add(&bn->list, &journal->j_bitmap_nodes) ;
-      journal->j_free_bitmap_nodes++ ;
-    } else {
-      break ; // this is ok, we'll try again when more are needed 
-    }
-  }
+				    struct reiserfs_bitmap_node *bn)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	journal->j_used_bitmap_nodes--;
+	if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) {
+		reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb);
+		reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb);
+	} else {
+		list_add(&bn->list, &journal->j_bitmap_nodes);
+		journal->j_free_bitmap_nodes++;
+	}
+}
+
+static void allocate_bitmap_nodes(struct super_block *p_s_sb)
+{
+	int i;
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_bitmap_node *bn = NULL;
+	for (i = 0; i < REISERFS_MIN_BITMAP_NODES; i++) {
+		bn = allocate_bitmap_node(p_s_sb);
+		if (bn) {
+			list_add(&bn->list, &journal->j_bitmap_nodes);
+			journal->j_free_bitmap_nodes++;
+		} else {
+			break;	/* this is ok, we'll try again when more are needed */
+		}
+	}
 }
 
 static int set_bit_in_list_bitmap(struct super_block *p_s_sb, int block,
-                                  struct reiserfs_list_bitmap *jb) {
-  int bmap_nr = block / (p_s_sb->s_blocksize << 3) ;
-  int bit_nr = block % (p_s_sb->s_blocksize << 3) ;
+				  struct reiserfs_list_bitmap *jb)
+{
+	int bmap_nr = block / (p_s_sb->s_blocksize << 3);
+	int bit_nr = block % (p_s_sb->s_blocksize << 3);
 
-  if (!jb->bitmaps[bmap_nr]) {
-    jb->bitmaps[bmap_nr] = get_bitmap_node(p_s_sb) ;
-  }
-  set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data) ;
-  return 0 ;
+	if (!jb->bitmaps[bmap_nr]) {
+		jb->bitmaps[bmap_nr] = get_bitmap_node(p_s_sb);
+	}
+	set_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data);
+	return 0;
 }
 
 static void cleanup_bitmap_list(struct super_block *p_s_sb,
-                                struct reiserfs_list_bitmap *jb) {
-  int i;
-  if (jb->bitmaps == NULL)
-    return;
-
-  for (i = 0 ; i < SB_BMAP_NR(p_s_sb) ; i++) {
-    if (jb->bitmaps[i]) {
-      free_bitmap_node(p_s_sb, jb->bitmaps[i]) ;
-      jb->bitmaps[i] = NULL ;
-    }
-  }
+				struct reiserfs_list_bitmap *jb)
+{
+	int i;
+	if (jb->bitmaps == NULL)
+		return;
+
+	for (i = 0; i < SB_BMAP_NR(p_s_sb); i++) {
+		if (jb->bitmaps[i]) {
+			free_bitmap_node(p_s_sb, jb->bitmaps[i]);
+			jb->bitmaps[i] = NULL;
+		}
+	}
 }
 
 /*
 ** only call this on FS unmount.
 */
 static int free_list_bitmaps(struct super_block *p_s_sb,
-                             struct reiserfs_list_bitmap *jb_array) {
-  int i ;
-  struct reiserfs_list_bitmap *jb ;
-  for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) {
-    jb = jb_array + i ;
-    jb->journal_list = NULL ;
-    cleanup_bitmap_list(p_s_sb, jb) ;
-    vfree(jb->bitmaps) ;
-    jb->bitmaps = NULL ;
-  }
-  return 0;
-}
-
-static int free_bitmap_nodes(struct super_block *p_s_sb) {
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-  struct list_head *next = journal->j_bitmap_nodes.next ;
-  struct reiserfs_bitmap_node *bn ;
-
-  while(next != &journal->j_bitmap_nodes) {
-    bn = list_entry(next, struct reiserfs_bitmap_node, list) ;
-    list_del(next) ;
-    reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb) ;
-    reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb) ;
-    next = journal->j_bitmap_nodes.next ;
-    journal->j_free_bitmap_nodes-- ;
-  }
-
-  return 0 ;
+			     struct reiserfs_list_bitmap *jb_array)
+{
+	int i;
+	struct reiserfs_list_bitmap *jb;
+	for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
+		jb = jb_array + i;
+		jb->journal_list = NULL;
+		cleanup_bitmap_list(p_s_sb, jb);
+		vfree(jb->bitmaps);
+		jb->bitmaps = NULL;
+	}
+	return 0;
+}
+
+static int free_bitmap_nodes(struct super_block *p_s_sb)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct list_head *next = journal->j_bitmap_nodes.next;
+	struct reiserfs_bitmap_node *bn;
+
+	while (next != &journal->j_bitmap_nodes) {
+		bn = list_entry(next, struct reiserfs_bitmap_node, list);
+		list_del(next);
+		reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb);
+		reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb);
+		next = journal->j_bitmap_nodes.next;
+		journal->j_free_bitmap_nodes--;
+	}
+
+	return 0;
 }
 
 /*
@@ -275,59 +290,65 @@ static int free_bitmap_nodes(struct super_block *p_s_sb) {
 ** jb_array is the array to be filled in.
 */
 int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb,
-                                   struct reiserfs_list_bitmap *jb_array,
-				   int bmap_nr) {
-  int i ;
-  int failed = 0 ;
-  struct reiserfs_list_bitmap *jb ;
-  int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *) ;
-
-  for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) {
-    jb = jb_array + i ;
-    jb->journal_list = NULL ;
-    jb->bitmaps = vmalloc( mem ) ;
-    if (!jb->bitmaps) {
-      reiserfs_warning(p_s_sb, "clm-2000, unable to allocate bitmaps for journal lists") ;
-      failed = 1;   
-      break ;
-    }
-    memset(jb->bitmaps, 0, mem) ;
-  }
-  if (failed) {
-    free_list_bitmaps(p_s_sb, jb_array) ;
-    return -1 ;
-  }
-  return 0 ;
+				   struct reiserfs_list_bitmap *jb_array,
+				   int bmap_nr)
+{
+	int i;
+	int failed = 0;
+	struct reiserfs_list_bitmap *jb;
+	int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *);
+
+	for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
+		jb = jb_array + i;
+		jb->journal_list = NULL;
+		jb->bitmaps = vmalloc(mem);
+		if (!jb->bitmaps) {
+			reiserfs_warning(p_s_sb,
+					 "clm-2000, unable to allocate bitmaps for journal lists");
+			failed = 1;
+			break;
+		}
+		memset(jb->bitmaps, 0, mem);
+	}
+	if (failed) {
+		free_list_bitmaps(p_s_sb, jb_array);
+		return -1;
+	}
+	return 0;
 }
 
 /*
 ** find an available list bitmap.  If you can't find one, flush a commit list 
 ** and try again
 */
-static struct reiserfs_list_bitmap *
-get_list_bitmap(struct super_block *p_s_sb, struct reiserfs_journal_list *jl) {
-  int i,j ; 
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-  struct reiserfs_list_bitmap *jb = NULL ;
-
-  for (j = 0 ; j < (JOURNAL_NUM_BITMAPS * 3) ; j++) {
-    i = journal->j_list_bitmap_index ;
-    journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS ;
-    jb = journal->j_list_bitmap + i ;
-    if (journal->j_list_bitmap[i].journal_list) {
-      flush_commit_list(p_s_sb, journal->j_list_bitmap[i].journal_list, 1) ;
-      if (!journal->j_list_bitmap[i].journal_list) {
-	break ;
-      }
-    } else {
-      break ;
-    }
-  }
-  if (jb->journal_list) { /* double check to make sure if flushed correctly */
-    return NULL ;
-  }
-  jb->journal_list = jl ;
-  return jb ;
+static struct reiserfs_list_bitmap *get_list_bitmap(struct super_block *p_s_sb,
+						    struct reiserfs_journal_list
+						    *jl)
+{
+	int i, j;
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_list_bitmap *jb = NULL;
+
+	for (j = 0; j < (JOURNAL_NUM_BITMAPS * 3); j++) {
+		i = journal->j_list_bitmap_index;
+		journal->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS;
+		jb = journal->j_list_bitmap + i;
+		if (journal->j_list_bitmap[i].journal_list) {
+			flush_commit_list(p_s_sb,
+					  journal->j_list_bitmap[i].
+					  journal_list, 1);
+			if (!journal->j_list_bitmap[i].journal_list) {
+				break;
+			}
+		} else {
+			break;
+		}
+	}
+	if (jb->journal_list) {	/* double check to make sure it flushed correctly */
+		return NULL;
+	}
+	jb->journal_list = jl;
+	return jb;
 }
 
 /* 
@@ -335,104 +356,114 @@ get_list_bitmap(struct super_block *p_s_sb, struct reiserfs_journal_list *jl) {
 ** Uses the cnode->next and cnode->prev pointers
 ** returns NULL on failure
 */
-static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) {
-  struct reiserfs_journal_cnode *head ;
-  int i ;
-  if (num_cnodes <= 0) {
-    return NULL ;
-  }
-  head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)) ;
-  if (!head) {
-    return NULL ;
-  }
-  memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode)) ;
-  head[0].prev = NULL ;
-  head[0].next = head + 1 ;
-  for (i = 1 ; i < num_cnodes; i++) {
-    head[i].prev = head + (i - 1) ;
-    head[i].next = head + (i + 1) ; /* if last one, overwrite it after the if */
-  }
-  head[num_cnodes -1].next = NULL ;
-  return head ;
+static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes)
+{
+	struct reiserfs_journal_cnode *head;
+	int i;
+	if (num_cnodes <= 0) {
+		return NULL;
+	}
+	head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode));
+	if (!head) {
+		return NULL;
+	}
+	memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode));
+	head[0].prev = NULL;
+	head[0].next = head + 1;
+	for (i = 1; i < num_cnodes; i++) {
+		head[i].prev = head + (i - 1);
+		head[i].next = head + (i + 1);	/* for the last one, this is overwritten after the loop */
+	}
+	head[num_cnodes - 1].next = NULL;
+	return head;
 }
 
 /*
 ** pulls a cnode off the free list, or returns NULL on failure 
 */
-static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb) {
-  struct reiserfs_journal_cnode *cn ;
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-
-  reiserfs_check_lock_depth(p_s_sb, "get_cnode") ;
-
-  if (journal->j_cnode_free <= 0) {
-    return NULL ;
-  }
-  journal->j_cnode_used++ ;
-  journal->j_cnode_free-- ;
-  cn = journal->j_cnode_free_list ;
-  if (!cn) {
-    return cn ;
-  }
-  if (cn->next) {
-    cn->next->prev = NULL ;
-  }
-  journal->j_cnode_free_list = cn->next ;
-  memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ;
-  return cn ;
+static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb)
+{
+	struct reiserfs_journal_cnode *cn;
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+
+	reiserfs_check_lock_depth(p_s_sb, "get_cnode");
+
+	if (journal->j_cnode_free <= 0) {
+		return NULL;
+	}
+	journal->j_cnode_used++;
+	journal->j_cnode_free--;
+	cn = journal->j_cnode_free_list;
+	if (!cn) {
+		return cn;
+	}
+	if (cn->next) {
+		cn->next->prev = NULL;
+	}
+	journal->j_cnode_free_list = cn->next;
+	memset(cn, 0, sizeof(struct reiserfs_journal_cnode));
+	return cn;
 }
 
 /*
 ** returns a cnode to the free list 
 */
-static void free_cnode(struct super_block *p_s_sb, struct reiserfs_journal_cnode *cn) {
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+static void free_cnode(struct super_block *p_s_sb,
+		       struct reiserfs_journal_cnode *cn)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
 
-  reiserfs_check_lock_depth(p_s_sb, "free_cnode") ;
+	reiserfs_check_lock_depth(p_s_sb, "free_cnode");
 
-  journal->j_cnode_used-- ;
-  journal->j_cnode_free++ ;
-  /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */
-  cn->next = journal->j_cnode_free_list ;
-  if (journal->j_cnode_free_list) {
-    journal->j_cnode_free_list->prev = cn ;
-  }
-  cn->prev = NULL ; /* not needed with the memset, but I might kill the memset, and forget to do this */
-  journal->j_cnode_free_list = cn ;
+	journal->j_cnode_used--;
+	journal->j_cnode_free++;
+	/* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */
+	cn->next = journal->j_cnode_free_list;
+	if (journal->j_cnode_free_list) {
+		journal->j_cnode_free_list->prev = cn;
+	}
+	cn->prev = NULL;	/* the memset above is disabled, so prev must be cleared here explicitly */
+	journal->j_cnode_free_list = cn;
 }
 
-static void clear_prepared_bits(struct buffer_head *bh) {
-  clear_buffer_journal_prepared (bh);
-  clear_buffer_journal_restore_dirty (bh);
+static void clear_prepared_bits(struct buffer_head *bh)
+{
+	clear_buffer_journal_prepared(bh);
+	clear_buffer_journal_restore_dirty(bh);
 }
 
 /* utility function to force a BUG if it is called without the big
 ** kernel lock held.  caller is the string printed just before calling BUG()
 */
-void reiserfs_check_lock_depth(struct super_block *sb, char *caller) {
+void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
+{
 #ifdef CONFIG_SMP
-  if (current->lock_depth < 0) {
-    reiserfs_panic (sb, "%s called without kernel lock held", caller) ;
-  }
+	if (current->lock_depth < 0) {
+		reiserfs_panic(sb, "%s called without kernel lock held",
+			       caller);
+	}
 #else
-  ;
+	;
 #endif
 }
 
 /* return a cnode with same dev, block number and size in table, or null if not found */
-static inline struct reiserfs_journal_cnode *
-get_journal_hash_dev(struct super_block *sb,
-		     struct reiserfs_journal_cnode **table,
-		     long bl)
+static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
+								  super_block
+								  *sb,
+								  struct
+								  reiserfs_journal_cnode
+								  **table,
+								  long bl)
 {
-  struct reiserfs_journal_cnode *cn ;
-  cn = journal_hash(table, sb, bl) ;
-  while(cn) {
-    if (cn->blocknr == bl && cn->sb == sb)
-      return cn ;
-    cn = cn->hnext ;
-  }
-  return (struct reiserfs_journal_cnode *)0 ;
+	struct reiserfs_journal_cnode *cn;
+	cn = journal_hash(table, sb, bl);
+	while (cn) {
+		if (cn->blocknr == bl && cn->sb == sb)
+			return cn;
+		cn = cn->hnext;
+	}
+	return (struct reiserfs_journal_cnode *)0;
 }
 
 /*
@@ -454,91 +485,103 @@ get_journal_hash_dev(struct super_block *sb,
 **
 */
 int reiserfs_in_journal(struct super_block *p_s_sb,
-                        int bmap_nr, int bit_nr, int search_all, 
-			b_blocknr_t *next_zero_bit) {
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-  struct reiserfs_journal_cnode *cn ;
-  struct reiserfs_list_bitmap *jb ;
-  int i ;
-  unsigned long bl;
-
-  *next_zero_bit = 0 ; /* always start this at zero. */
-
-  PROC_INFO_INC( p_s_sb, journal.in_journal );
-  /* If we aren't doing a search_all, this is a metablock, and it will be logged before use.
-  ** if we crash before the transaction that freed it commits,  this transaction won't
-  ** have committed either, and the block will never be written
-  */
-  if (search_all) {
-    for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) {
-      PROC_INFO_INC( p_s_sb, journal.in_journal_bitmap );
-      jb = journal->j_list_bitmap + i ;
-      if (jb->journal_list && jb->bitmaps[bmap_nr] &&
-          test_bit(bit_nr, (unsigned long *)jb->bitmaps[bmap_nr]->data)) {
-	*next_zero_bit = find_next_zero_bit((unsigned long *)
-	                             (jb->bitmaps[bmap_nr]->data),
-	                             p_s_sb->s_blocksize << 3, bit_nr+1) ; 
-	return 1 ;
-      }
-    }
-  }
-
-  bl = bmap_nr * (p_s_sb->s_blocksize << 3) + bit_nr;
-  /* is it in any old transactions? */
-  if (search_all && (cn = get_journal_hash_dev(p_s_sb, journal->j_list_hash_table, bl))) {
-    return 1; 
-  }
-
-  /* is it in the current transaction.  This should never happen */
-  if ((cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, bl))) {
-    BUG();
-    return 1; 
-  }
-
-  PROC_INFO_INC( p_s_sb, journal.in_journal_reusable );
-  /* safe for reuse */
-  return 0 ;
+			int bmap_nr, int bit_nr, int search_all,
+			b_blocknr_t * next_zero_bit)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal_cnode *cn;
+	struct reiserfs_list_bitmap *jb;
+	int i;
+	unsigned long bl;
+
+	*next_zero_bit = 0;	/* always start this at zero. */
+
+	PROC_INFO_INC(p_s_sb, journal.in_journal);
+	/* If we aren't doing a search_all, this is a metablock, and it will be logged before use.
+	 ** If we crash before the transaction that freed it commits, this transaction won't
+	 ** have committed either, and the block will never be written.
+	 */
+	if (search_all) {
+		for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
+			PROC_INFO_INC(p_s_sb, journal.in_journal_bitmap);
+			jb = journal->j_list_bitmap + i;
+			if (jb->journal_list && jb->bitmaps[bmap_nr] &&
+			    test_bit(bit_nr,
+				     (unsigned long *)jb->bitmaps[bmap_nr]->
+				     data)) {
+				*next_zero_bit =
+				    find_next_zero_bit((unsigned long *)
+						       (jb->bitmaps[bmap_nr]->
+							data),
+						       p_s_sb->s_blocksize << 3,
+						       bit_nr + 1);
+				return 1;
+			}
+		}
+	}
+
+	bl = bmap_nr * (p_s_sb->s_blocksize << 3) + bit_nr;
+	/* is it in any old transactions? */
+	if (search_all
+	    && (cn =
+		get_journal_hash_dev(p_s_sb, journal->j_list_hash_table, bl))) {
+		return 1;
+	}
+
+	/* is it in the current transaction?  This should never happen */
+	if ((cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, bl))) {
+		BUG();
+		return 1;
+	}
+
+	PROC_INFO_INC(p_s_sb, journal.in_journal_reusable);
+	/* safe for reuse */
+	return 0;
 }
 
 /* insert cn into table
 */
-static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, struct reiserfs_journal_cnode *cn) {
-  struct reiserfs_journal_cnode *cn_orig ;
+static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
+				       struct reiserfs_journal_cnode *cn)
+{
+	struct reiserfs_journal_cnode *cn_orig;
 
-  cn_orig = journal_hash(table, cn->sb, cn->blocknr) ;
-  cn->hnext = cn_orig ;
-  cn->hprev = NULL ;
-  if (cn_orig) {
-    cn_orig->hprev = cn ;
-  }
-  journal_hash(table, cn->sb, cn->blocknr) =  cn ;
+	cn_orig = journal_hash(table, cn->sb, cn->blocknr);
+	cn->hnext = cn_orig;
+	cn->hprev = NULL;
+	if (cn_orig) {
+		cn_orig->hprev = cn;
+	}
+	journal_hash(table, cn->sb, cn->blocknr) = cn;
 }
 
 /* lock the current transaction */
-inline static void lock_journal(struct super_block *p_s_sb) {
-    PROC_INFO_INC( p_s_sb, journal.lock_journal );
-    down(&SB_JOURNAL(p_s_sb)->j_lock);
+inline static void lock_journal(struct super_block *p_s_sb)
+{
+	PROC_INFO_INC(p_s_sb, journal.lock_journal);
+	down(&SB_JOURNAL(p_s_sb)->j_lock);
 }
 
 /* unlock the current transaction */
-inline static void unlock_journal(struct super_block *p_s_sb) {
-    up(&SB_JOURNAL(p_s_sb)->j_lock);
+inline static void unlock_journal(struct super_block *p_s_sb)
+{
+	up(&SB_JOURNAL(p_s_sb)->j_lock);
 }
 
 static inline void get_journal_list(struct reiserfs_journal_list *jl)
 {
-    jl->j_refcount++;
+	jl->j_refcount++;
 }
 
 static inline void put_journal_list(struct super_block *s,
-                                   struct reiserfs_journal_list *jl)
+				    struct reiserfs_journal_list *jl)
 {
-    if (jl->j_refcount < 1) {
-        reiserfs_panic (s, "trans id %lu, refcount at %d", jl->j_trans_id,
-	                                         jl->j_refcount);
-    }
-    if (--jl->j_refcount == 0)
-        reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s);
+	if (jl->j_refcount < 1) {
+		reiserfs_panic(s, "trans id %lu, refcount at %d",
+			       jl->j_trans_id, jl->j_refcount);
+	}
+	if (--jl->j_refcount == 0)
+		reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s);
 }
 
 /*
@@ -546,358 +589,375 @@ static inline void put_journal_list(struct super_block *s,
 ** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a
 ** transaction.
 */
-static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, struct reiserfs_journal_list *jl) {
+static void cleanup_freed_for_journal_list(struct super_block *p_s_sb,
+					   struct reiserfs_journal_list *jl)
+{
 
-  struct reiserfs_list_bitmap *jb = jl->j_list_bitmap ;
-  if (jb) {
-    cleanup_bitmap_list(p_s_sb, jb) ;
-  }
-  jl->j_list_bitmap->journal_list = NULL ;
-  jl->j_list_bitmap = NULL ;
+	struct reiserfs_list_bitmap *jb = jl->j_list_bitmap;
+	if (jb) {
+		cleanup_bitmap_list(p_s_sb, jb);
+	}
+	jl->j_list_bitmap->journal_list = NULL;
+	jl->j_list_bitmap = NULL;
 }
 
 static int journal_list_still_alive(struct super_block *s,
-                                    unsigned long trans_id)
-{
-    struct reiserfs_journal *journal = SB_JOURNAL (s);
-    struct list_head *entry = &journal->j_journal_list;
-    struct reiserfs_journal_list *jl;
-
-    if (!list_empty(entry)) {
-        jl = JOURNAL_LIST_ENTRY(entry->next);
-	if (jl->j_trans_id <= trans_id) {
-	    return 1;
-	}
-    }
-    return 0;
-}
-
-static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
-    char b[BDEVNAME_SIZE];
-
-    if (buffer_journaled(bh)) {
-        reiserfs_warning(NULL, "clm-2084: pinned buffer %lu:%s sent to disk",
-	                 bh->b_blocknr, bdevname(bh->b_bdev, b)) ;
-    }
-    if (uptodate)
-    	set_buffer_uptodate(bh) ;
-    else
-    	clear_buffer_uptodate(bh) ;
-    unlock_buffer(bh) ;
-    put_bh(bh) ;
-}
-
-static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) {
-    if (uptodate)
-    	set_buffer_uptodate(bh) ;
-    else
-    	clear_buffer_uptodate(bh) ;
-    unlock_buffer(bh) ;
-    put_bh(bh) ;
-}
-
-static void submit_logged_buffer(struct buffer_head *bh) {
-    get_bh(bh) ;
-    bh->b_end_io = reiserfs_end_buffer_io_sync ;
-    clear_buffer_journal_new (bh);
-    clear_buffer_dirty(bh) ;
-    if (!test_clear_buffer_journal_test (bh))
-        BUG();
-    if (!buffer_uptodate(bh))
-        BUG();
-    submit_bh(WRITE, bh) ;
-}
-
-static void submit_ordered_buffer(struct buffer_head *bh) {
-    get_bh(bh) ;
-    bh->b_end_io = reiserfs_end_ordered_io;
-    clear_buffer_dirty(bh) ;
-    if (!buffer_uptodate(bh))
-        BUG();
-    submit_bh(WRITE, bh) ;
-}
-
-static int submit_barrier_buffer(struct buffer_head *bh) {
-    get_bh(bh) ;
-    bh->b_end_io = reiserfs_end_ordered_io;
-    clear_buffer_dirty(bh) ;
-    if (!buffer_uptodate(bh))
-        BUG();
-    return submit_bh(WRITE_BARRIER, bh) ;
+				    unsigned long trans_id)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	struct list_head *entry = &journal->j_journal_list;
+	struct reiserfs_journal_list *jl;
+
+	if (!list_empty(entry)) {
+		jl = JOURNAL_LIST_ENTRY(entry->next);
+		if (jl->j_trans_id <= trans_id) {
+			return 1;
+		}
+	}
+	return 0;
+}
+
+static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+{
+	char b[BDEVNAME_SIZE];
+
+	if (buffer_journaled(bh)) {
+		reiserfs_warning(NULL,
+				 "clm-2084: pinned buffer %lu:%s sent to disk",
+				 bh->b_blocknr, bdevname(bh->b_bdev, b));
+	}
+	if (uptodate)
+		set_buffer_uptodate(bh);
+	else
+		clear_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
+static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate)
+{
+	if (uptodate)
+		set_buffer_uptodate(bh);
+	else
+		clear_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
+static void submit_logged_buffer(struct buffer_head *bh)
+{
+	get_bh(bh);
+	bh->b_end_io = reiserfs_end_buffer_io_sync;
+	clear_buffer_journal_new(bh);
+	clear_buffer_dirty(bh);
+	if (!test_clear_buffer_journal_test(bh))
+		BUG();
+	if (!buffer_uptodate(bh))
+		BUG();
+	submit_bh(WRITE, bh);
+}
+
+static void submit_ordered_buffer(struct buffer_head *bh)
+{
+	get_bh(bh);
+	bh->b_end_io = reiserfs_end_ordered_io;
+	clear_buffer_dirty(bh);
+	if (!buffer_uptodate(bh))
+		BUG();
+	submit_bh(WRITE, bh);
+}
+
+static int submit_barrier_buffer(struct buffer_head *bh)
+{
+	get_bh(bh);
+	bh->b_end_io = reiserfs_end_ordered_io;
+	clear_buffer_dirty(bh);
+	if (!buffer_uptodate(bh))
+		BUG();
+	return submit_bh(WRITE_BARRIER, bh);
 }
 
 static void check_barrier_completion(struct super_block *s,
-                                     struct buffer_head *bh) {
-    if (buffer_eopnotsupp(bh)) {
-	clear_buffer_eopnotsupp(bh);
-	disable_barrier(s);
-	set_buffer_uptodate(bh);
-	set_buffer_dirty(bh);
-	sync_dirty_buffer(bh);
-    }
+				     struct buffer_head *bh)
+{
+	if (buffer_eopnotsupp(bh)) {
+		clear_buffer_eopnotsupp(bh);
+		disable_barrier(s);
+		set_buffer_uptodate(bh);
+		set_buffer_dirty(bh);
+		sync_dirty_buffer(bh);
+	}
 }
 
 #define CHUNK_SIZE 32
 struct buffer_chunk {
-    struct buffer_head *bh[CHUNK_SIZE];
-    int nr;
+	struct buffer_head *bh[CHUNK_SIZE];
+	int nr;
 };
 
-static void write_chunk(struct buffer_chunk *chunk) {
-    int i;
-    get_fs_excl();
-    for (i = 0; i < chunk->nr ; i++) {
-	submit_logged_buffer(chunk->bh[i]) ;
-    }
-    chunk->nr = 0;
-    put_fs_excl();
+static void write_chunk(struct buffer_chunk *chunk)
+{
+	int i;
+	get_fs_excl();
+	for (i = 0; i < chunk->nr; i++) {
+		submit_logged_buffer(chunk->bh[i]);
+	}
+	chunk->nr = 0;
+	put_fs_excl();
 }
 
-static void write_ordered_chunk(struct buffer_chunk *chunk) {
-    int i;
-    get_fs_excl();
-    for (i = 0; i < chunk->nr ; i++) {
-	submit_ordered_buffer(chunk->bh[i]) ;
-    }
-    chunk->nr = 0;
-    put_fs_excl();
+static void write_ordered_chunk(struct buffer_chunk *chunk)
+{
+	int i;
+	get_fs_excl();
+	for (i = 0; i < chunk->nr; i++) {
+		submit_ordered_buffer(chunk->bh[i]);
+	}
+	chunk->nr = 0;
+	put_fs_excl();
 }
 
 static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
-			 spinlock_t *lock,
-			 void (fn)(struct buffer_chunk *))
+			spinlock_t * lock, void (fn) (struct buffer_chunk *))
 {
-    int ret = 0;
-    if (chunk->nr >= CHUNK_SIZE)
-        BUG();
-    chunk->bh[chunk->nr++] = bh;
-    if (chunk->nr >= CHUNK_SIZE) {
-	ret = 1;
-        if (lock)
-	    spin_unlock(lock);
-        fn(chunk);
-        if (lock)
-	    spin_lock(lock);
-    }
-    return ret;
+	int ret = 0;
+	if (chunk->nr >= CHUNK_SIZE)
+		BUG();
+	chunk->bh[chunk->nr++] = bh;
+	if (chunk->nr >= CHUNK_SIZE) {
+		ret = 1;
+		if (lock)
+			spin_unlock(lock);
+		fn(chunk);
+		if (lock)
+			spin_lock(lock);
+	}
+	return ret;
 }
 
-
 static atomic_t nr_reiserfs_jh = ATOMIC_INIT(0);
-static struct reiserfs_jh *alloc_jh(void) {
-    struct reiserfs_jh *jh;
-    while(1) {
-	jh = kmalloc(sizeof(*jh), GFP_NOFS);
-	if (jh) {
-	    atomic_inc(&nr_reiserfs_jh);
-	    return jh;
+static struct reiserfs_jh *alloc_jh(void)
+{
+	struct reiserfs_jh *jh;
+	while (1) {
+		jh = kmalloc(sizeof(*jh), GFP_NOFS);
+		if (jh) {
+			atomic_inc(&nr_reiserfs_jh);
+			return jh;
+		}
+		yield();
 	}
-        yield();
-    }
 }
 
 /*
  * we want to free the jh when the buffer has been written
  * and waited on
  */
-void reiserfs_free_jh(struct buffer_head *bh) {
-    struct reiserfs_jh *jh;
-
-    jh = bh->b_private;
-    if (jh) {
-	bh->b_private = NULL;
-	jh->bh = NULL;
-	list_del_init(&jh->list);
-	kfree(jh);
-	if (atomic_read(&nr_reiserfs_jh) <= 0)
-	    BUG();
-	atomic_dec(&nr_reiserfs_jh);
-	put_bh(bh);
-    }
+void reiserfs_free_jh(struct buffer_head *bh)
+{
+	struct reiserfs_jh *jh;
+
+	jh = bh->b_private;
+	if (jh) {
+		bh->b_private = NULL;
+		jh->bh = NULL;
+		list_del_init(&jh->list);
+		kfree(jh);
+		if (atomic_read(&nr_reiserfs_jh) <= 0)
+			BUG();
+		atomic_dec(&nr_reiserfs_jh);
+		put_bh(bh);
+	}
 }
 
 static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
-                           int tail)
+			   int tail)
 {
-    struct reiserfs_jh *jh;
+	struct reiserfs_jh *jh;
 
-    if (bh->b_private) {
-	spin_lock(&j->j_dirty_buffers_lock);
-	if (!bh->b_private) {
-	    spin_unlock(&j->j_dirty_buffers_lock);
-	    goto no_jh;
+	if (bh->b_private) {
+		spin_lock(&j->j_dirty_buffers_lock);
+		if (!bh->b_private) {
+			spin_unlock(&j->j_dirty_buffers_lock);
+			goto no_jh;
+		}
+		jh = bh->b_private;
+		list_del_init(&jh->list);
+	} else {
+	      no_jh:
+		get_bh(bh);
+		jh = alloc_jh();
+		spin_lock(&j->j_dirty_buffers_lock);
+		/* buffer must be locked for __add_jh, should be able to have
+		 * two adds at the same time
+		 */
+		if (bh->b_private)
+			BUG();
+		jh->bh = bh;
+		bh->b_private = jh;
 	}
-        jh = bh->b_private;
-	list_del_init(&jh->list);
-    } else {
-no_jh:
-	get_bh(bh);
-	jh = alloc_jh();
-	spin_lock(&j->j_dirty_buffers_lock);
-	/* buffer must be locked for __add_jh, should be able to have
-	 * two adds at the same time
-	 */
-	if (bh->b_private)
-	    BUG();
-	jh->bh = bh;
-	bh->b_private = jh;
-    }
-    jh->jl = j->j_current_jl;
-    if (tail)
-	list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
-    else {
-	list_add_tail(&jh->list, &jh->jl->j_bh_list);
-    }
-    spin_unlock(&j->j_dirty_buffers_lock);
-    return 0;
+	jh->jl = j->j_current_jl;
+	if (tail)
+		list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
+	else {
+		list_add_tail(&jh->list, &jh->jl->j_bh_list);
+	}
+	spin_unlock(&j->j_dirty_buffers_lock);
+	return 0;
 }
 
-int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) {
-    return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1);
+int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh)
+{
+	return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1);
 }
-int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) {
-    return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0);
+int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh)
+{
+	return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0);
 }
 
 #define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list)
-static int write_ordered_buffers(spinlock_t *lock,
+static int write_ordered_buffers(spinlock_t * lock,
 				 struct reiserfs_journal *j,
-                                 struct reiserfs_journal_list *jl,
+				 struct reiserfs_journal_list *jl,
 				 struct list_head *list)
 {
-    struct buffer_head *bh;
-    struct reiserfs_jh *jh;
-    int ret = j->j_errno;
-    struct buffer_chunk chunk;
-    struct list_head tmp;
-    INIT_LIST_HEAD(&tmp);
-
-    chunk.nr = 0;
-    spin_lock(lock);
-    while(!list_empty(list)) {
-        jh = JH_ENTRY(list->next);
-	bh = jh->bh;
-	get_bh(bh);
-	if (test_set_buffer_locked(bh)) {
-	    if (!buffer_dirty(bh)) {
-		list_del_init(&jh->list);
-		list_add(&jh->list, &tmp);
-		goto loop_next;
-	    }
-	    spin_unlock(lock);
-	    if (chunk.nr)
+	struct buffer_head *bh;
+	struct reiserfs_jh *jh;
+	int ret = j->j_errno;
+	struct buffer_chunk chunk;
+	struct list_head tmp;
+	INIT_LIST_HEAD(&tmp);
+
+	chunk.nr = 0;
+	spin_lock(lock);
+	while (!list_empty(list)) {
+		jh = JH_ENTRY(list->next);
+		bh = jh->bh;
+		get_bh(bh);
+		if (test_set_buffer_locked(bh)) {
+			if (!buffer_dirty(bh)) {
+				list_del_init(&jh->list);
+				list_add(&jh->list, &tmp);
+				goto loop_next;
+			}
+			spin_unlock(lock);
+			if (chunk.nr)
+				write_ordered_chunk(&chunk);
+			wait_on_buffer(bh);
+			cond_resched();
+			spin_lock(lock);
+			goto loop_next;
+		}
+		if (buffer_dirty(bh)) {
+			list_del_init(&jh->list);
+			list_add(&jh->list, &tmp);
+			add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
+		} else {
+			reiserfs_free_jh(bh);
+			unlock_buffer(bh);
+		}
+	      loop_next:
+		put_bh(bh);
+		cond_resched_lock(lock);
+	}
+	if (chunk.nr) {
+		spin_unlock(lock);
 		write_ordered_chunk(&chunk);
-	    wait_on_buffer(bh);
-	    cond_resched();
-	    spin_lock(lock);
-	    goto loop_next;
-        }
-	if (buffer_dirty(bh)) {
-	    list_del_init(&jh->list);
-	    list_add(&jh->list, &tmp);
-            add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
-	} else {
-	    reiserfs_free_jh(bh);
-	    unlock_buffer(bh);
+		spin_lock(lock);
 	}
-loop_next:
-	put_bh(bh);
-	cond_resched_lock(lock);
-    }
-    if (chunk.nr) {
-	spin_unlock(lock);
-        write_ordered_chunk(&chunk);
-	spin_lock(lock);
-    }
-    while(!list_empty(&tmp)) {
-        jh = JH_ENTRY(tmp.prev);
-	bh = jh->bh;
-	get_bh(bh);
-	reiserfs_free_jh(bh);
-
-	if (buffer_locked(bh)) {
-	    spin_unlock(lock);
-	    wait_on_buffer(bh);
-	    spin_lock(lock);
+	while (!list_empty(&tmp)) {
+		jh = JH_ENTRY(tmp.prev);
+		bh = jh->bh;
+		get_bh(bh);
+		reiserfs_free_jh(bh);
+
+		if (buffer_locked(bh)) {
+			spin_unlock(lock);
+			wait_on_buffer(bh);
+			spin_lock(lock);
+		}
+		if (!buffer_uptodate(bh)) {
+			ret = -EIO;
+		}
+		put_bh(bh);
+		cond_resched_lock(lock);
 	}
-	if (!buffer_uptodate(bh)) {
-	    ret = -EIO;
-        }
-	put_bh(bh);
-	cond_resched_lock(lock);
-    }
-    spin_unlock(lock);
-    return ret;
-}
-
-static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) {
-    struct reiserfs_journal *journal = SB_JOURNAL (s);
-    struct reiserfs_journal_list *other_jl;
-    struct reiserfs_journal_list *first_jl;
-    struct list_head *entry;
-    unsigned long trans_id = jl->j_trans_id;
-    unsigned long other_trans_id;
-    unsigned long first_trans_id;
-
-find_first:
-    /*
-     * first we walk backwards to find the oldest uncommitted transation
-     */
-    first_jl = jl;
-    entry = jl->j_list.prev;
-    while(1) {
-	other_jl = JOURNAL_LIST_ENTRY(entry);
-	if (entry == &journal->j_journal_list ||
-	    atomic_read(&other_jl->j_older_commits_done))
-	    break;
-
-        first_jl = other_jl;
-	entry = other_jl->j_list.prev;
-    }
-
-    /* if we didn't find any older uncommitted transactions, return now */
-    if (first_jl == jl) {
-        return 0;
-    }
-
-    first_trans_id = first_jl->j_trans_id;
+	spin_unlock(lock);
+	return ret;
+}
 
-    entry = &first_jl->j_list;
-    while(1) {
-	other_jl = JOURNAL_LIST_ENTRY(entry);
-	other_trans_id = other_jl->j_trans_id;
+static int flush_older_commits(struct super_block *s,
+			       struct reiserfs_journal_list *jl)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	struct reiserfs_journal_list *other_jl;
+	struct reiserfs_journal_list *first_jl;
+	struct list_head *entry;
+	unsigned long trans_id = jl->j_trans_id;
+	unsigned long other_trans_id;
+	unsigned long first_trans_id;
+
+      find_first:
+	/*
+	 * first we walk backwards to find the oldest uncommitted transation
+	 */
+	first_jl = jl;
+	entry = jl->j_list.prev;
+	while (1) {
+		other_jl = JOURNAL_LIST_ENTRY(entry);
+		if (entry == &journal->j_journal_list ||
+		    atomic_read(&other_jl->j_older_commits_done))
+			break;
 
-	if (other_trans_id < trans_id) {
-	    if (atomic_read(&other_jl->j_commit_left) != 0) {
-		flush_commit_list(s, other_jl, 0);
+		first_jl = other_jl;
+		entry = other_jl->j_list.prev;
+	}
 
-		/* list we were called with is gone, return */
-		if (!journal_list_still_alive(s, trans_id))
-		    return 1;
+	/* if we didn't find any older uncommitted transactions, return now */
+	if (first_jl == jl) {
+		return 0;
+	}
 
-		/* the one we just flushed is gone, this means all
-		 * older lists are also gone, so first_jl is no longer
-		 * valid either.  Go back to the beginning.
-		 */
-		if (!journal_list_still_alive(s, other_trans_id)) {
-		    goto find_first;
+	first_trans_id = first_jl->j_trans_id;
+
+	entry = &first_jl->j_list;
+	while (1) {
+		other_jl = JOURNAL_LIST_ENTRY(entry);
+		other_trans_id = other_jl->j_trans_id;
+
+		if (other_trans_id < trans_id) {
+			if (atomic_read(&other_jl->j_commit_left) != 0) {
+				flush_commit_list(s, other_jl, 0);
+
+				/* list we were called with is gone, return */
+				if (!journal_list_still_alive(s, trans_id))
+					return 1;
+
+				/* the one we just flushed is gone, this means all
+				 * older lists are also gone, so first_jl is no longer
+				 * valid either.  Go back to the beginning.
+				 */
+				if (!journal_list_still_alive
+				    (s, other_trans_id)) {
+					goto find_first;
+				}
+			}
+			entry = entry->next;
+			if (entry == &journal->j_journal_list)
+				return 0;
+		} else {
+			return 0;
 		}
-	    }
-	    entry = entry->next;
-	    if (entry == &journal->j_journal_list)
-		return 0;
-	} else {
-	    return 0;
 	}
-    }
-    return 0;
+	return 0;
 }
-int reiserfs_async_progress_wait(struct super_block *s) {
-    DEFINE_WAIT(wait);
-    struct reiserfs_journal *j = SB_JOURNAL(s);
-    if (atomic_read(&j->j_async_throttle))
-    	blk_congestion_wait(WRITE, HZ/10);
-    return 0;
+int reiserfs_async_progress_wait(struct super_block *s)
+{
+	DEFINE_WAIT(wait);
+	struct reiserfs_journal *j = SB_JOURNAL(s);
+	if (atomic_read(&j->j_async_throttle))
+		blk_congestion_wait(WRITE, HZ / 10);
+	return 0;
 }
 
 /*
@@ -907,212 +967,225 @@ int reiserfs_async_progress_wait(struct super_block *s) {
 ** Before the commit block can by written, every other log block must be safely on disk
 **
 */
-static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) {
-  int i;
-  int bn ;
-  struct buffer_head *tbh = NULL ;
-  unsigned long trans_id = jl->j_trans_id;
-  struct reiserfs_journal *journal = SB_JOURNAL (s);
-  int barrier = 0;
-  int retval = 0;
-
-  reiserfs_check_lock_depth(s, "flush_commit_list") ;
-
-  if (atomic_read(&jl->j_older_commits_done)) {
-    return 0 ;
-  }
-
-  get_fs_excl();
-
-  /* before we can put our commit blocks on disk, we have to make sure everyone older than
-  ** us is on disk too
-  */
-  BUG_ON (jl->j_len <= 0);
-  BUG_ON (trans_id == journal->j_trans_id);
-
-  get_journal_list(jl);
-  if (flushall) {
-    if (flush_older_commits(s, jl) == 1) {
-      /* list disappeared during flush_older_commits.  return */
-      goto put_jl;
-    }
-  }
-
-  /* make sure nobody is trying to flush this one at the same time */
-  down(&jl->j_commit_lock);
-  if (!journal_list_still_alive(s, trans_id)) {
-    up(&jl->j_commit_lock);
-    goto put_jl;
-  }
-  BUG_ON (jl->j_trans_id == 0);
-
-  /* this commit is done, exit */
-  if (atomic_read(&(jl->j_commit_left)) <= 0) {
-    if (flushall) {
-      atomic_set(&(jl->j_older_commits_done), 1) ;
-    }
-    up(&jl->j_commit_lock);
-    goto put_jl;
-  }
-
-  if (!list_empty(&jl->j_bh_list)) {
-      unlock_kernel();
-      write_ordered_buffers(&journal->j_dirty_buffers_lock,
-                            journal, jl, &jl->j_bh_list);
-      lock_kernel();
-  }
-  BUG_ON (!list_empty(&jl->j_bh_list));
-  /*
-   * for the description block and all the log blocks, submit any buffers
-   * that haven't already reached the disk
-   */
-  atomic_inc(&journal->j_async_throttle);
-  for (i = 0 ; i < (jl->j_len + 1) ; i++) {
-    bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) %
-         SB_ONDISK_JOURNAL_SIZE(s);
-    tbh = journal_find_get_block(s, bn) ;
-    if (buffer_dirty(tbh)) /* redundant, ll_rw_block() checks */
-	ll_rw_block(WRITE, 1, &tbh) ;
-    put_bh(tbh) ;
-  }
-  atomic_dec(&journal->j_async_throttle);
-
-  /* wait on everything written so far before writing the commit
-   * if we are in barrier mode, send the commit down now
-   */
-  barrier = reiserfs_barrier_flush(s);
-  if (barrier) {
-      int ret;
-      lock_buffer(jl->j_commit_bh);
-      ret = submit_barrier_buffer(jl->j_commit_bh);
-      if (ret == -EOPNOTSUPP) {
-	  set_buffer_uptodate(jl->j_commit_bh);
-          disable_barrier(s);
-	  barrier = 0;
-      }
-  }
-  for (i = 0 ;  i < (jl->j_len + 1) ; i++) {
-    bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
-	 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ;
-    tbh = journal_find_get_block(s, bn) ;
-    wait_on_buffer(tbh) ;
-    // since we're using ll_rw_blk above, it might have skipped over
-    // a locked buffer.  Double check here
-    //
-    if (buffer_dirty(tbh)) /* redundant, sync_dirty_buffer() checks */
-      sync_dirty_buffer(tbh);
-    if (unlikely (!buffer_uptodate(tbh))) {
+static int flush_commit_list(struct super_block *s,
+			     struct reiserfs_journal_list *jl, int flushall)
+{
+	int i;
+	int bn;
+	struct buffer_head *tbh = NULL;
+	unsigned long trans_id = jl->j_trans_id;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	int barrier = 0;
+	int retval = 0;
+
+	reiserfs_check_lock_depth(s, "flush_commit_list");
+
+	if (atomic_read(&jl->j_older_commits_done)) {
+		return 0;
+	}
+
+	get_fs_excl();
+
+	/* before we can put our commit blocks on disk, we have to make sure everyone older than
+	 ** us is on disk too
+	 */
+	BUG_ON(jl->j_len <= 0);
+	BUG_ON(trans_id == journal->j_trans_id);
+
+	get_journal_list(jl);
+	if (flushall) {
+		if (flush_older_commits(s, jl) == 1) {
+			/* list disappeared during flush_older_commits.  return */
+			goto put_jl;
+		}
+	}
+
+	/* make sure nobody is trying to flush this one at the same time */
+	down(&jl->j_commit_lock);
+	if (!journal_list_still_alive(s, trans_id)) {
+		up(&jl->j_commit_lock);
+		goto put_jl;
+	}
+	BUG_ON(jl->j_trans_id == 0);
+
+	/* this commit is done, exit */
+	if (atomic_read(&(jl->j_commit_left)) <= 0) {
+		if (flushall) {
+			atomic_set(&(jl->j_older_commits_done), 1);
+		}
+		up(&jl->j_commit_lock);
+		goto put_jl;
+	}
+
+	if (!list_empty(&jl->j_bh_list)) {
+		unlock_kernel();
+		write_ordered_buffers(&journal->j_dirty_buffers_lock,
+				      journal, jl, &jl->j_bh_list);
+		lock_kernel();
+	}
+	BUG_ON(!list_empty(&jl->j_bh_list));
+	/*
+	 * for the description block and all the log blocks, submit any buffers
+	 * that haven't already reached the disk
+	 */
+	atomic_inc(&journal->j_async_throttle);
+	for (i = 0; i < (jl->j_len + 1); i++) {
+		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) %
+		    SB_ONDISK_JOURNAL_SIZE(s);
+		tbh = journal_find_get_block(s, bn);
+		if (buffer_dirty(tbh))	/* redundant, ll_rw_block() checks */
+			ll_rw_block(WRITE, 1, &tbh);
+		put_bh(tbh);
+	}
+	atomic_dec(&journal->j_async_throttle);
+
+	/* wait on everything written so far before writing the commit
+	 * if we are in barrier mode, send the commit down now
+	 */
+	barrier = reiserfs_barrier_flush(s);
+	if (barrier) {
+		int ret;
+		lock_buffer(jl->j_commit_bh);
+		ret = submit_barrier_buffer(jl->j_commit_bh);
+		if (ret == -EOPNOTSUPP) {
+			set_buffer_uptodate(jl->j_commit_bh);
+			disable_barrier(s);
+			barrier = 0;
+		}
+	}
+	for (i = 0; i < (jl->j_len + 1); i++) {
+		bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
+		    (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
+		tbh = journal_find_get_block(s, bn);
+		wait_on_buffer(tbh);
+		// since we're using ll_rw_blk above, it might have skipped over
+		// a locked buffer.  Double check here
+		//
+		if (buffer_dirty(tbh))	/* redundant, sync_dirty_buffer() checks */
+			sync_dirty_buffer(tbh);
+		if (unlikely(!buffer_uptodate(tbh))) {
 #ifdef CONFIG_REISERFS_CHECK
-      reiserfs_warning(s, "journal-601, buffer write failed") ;
+			reiserfs_warning(s, "journal-601, buffer write failed");
 #endif
-      retval = -EIO;
-    }
-    put_bh(tbh) ; /* once for journal_find_get_block */
-    put_bh(tbh) ;    /* once due to original getblk in do_journal_end */
-    atomic_dec(&(jl->j_commit_left)) ;
-  }
-
-  BUG_ON (atomic_read(&(jl->j_commit_left)) != 1);
-
-  if (!barrier) {
-      if (buffer_dirty(jl->j_commit_bh))
-	BUG();
-      mark_buffer_dirty(jl->j_commit_bh) ;
-      sync_dirty_buffer(jl->j_commit_bh) ;
-  } else
-      wait_on_buffer(jl->j_commit_bh);
-
-  check_barrier_completion(s, jl->j_commit_bh);
-
-  /* If there was a write error in the journal - we can't commit this
-   * transaction - it will be invalid and, if successful, will just end
-   * up propogating the write error out to the filesystem. */
-  if (unlikely (!buffer_uptodate(jl->j_commit_bh))) {
+			retval = -EIO;
+		}
+		put_bh(tbh);	/* once for journal_find_get_block */
+		put_bh(tbh);	/* once due to original getblk in do_journal_end */
+		atomic_dec(&(jl->j_commit_left));
+	}
+
+	BUG_ON(atomic_read(&(jl->j_commit_left)) != 1);
+
+	if (!barrier) {
+		if (buffer_dirty(jl->j_commit_bh))
+			BUG();
+		mark_buffer_dirty(jl->j_commit_bh);
+		sync_dirty_buffer(jl->j_commit_bh);
+	} else
+		wait_on_buffer(jl->j_commit_bh);
+
+	check_barrier_completion(s, jl->j_commit_bh);
+
+	/* If there was a write error in the journal - we can't commit this
+	 * transaction - it will be invalid and, if successful, will just end
+	 * up propogating the write error out to the filesystem. */
+	if (unlikely(!buffer_uptodate(jl->j_commit_bh))) {
 #ifdef CONFIG_REISERFS_CHECK
-    reiserfs_warning(s, "journal-615: buffer write failed") ;
+		reiserfs_warning(s, "journal-615: buffer write failed");
 #endif
-    retval = -EIO;
-  }
-  bforget(jl->j_commit_bh) ;
-  if (journal->j_last_commit_id != 0 &&
-     (jl->j_trans_id - journal->j_last_commit_id) != 1) {
-      reiserfs_warning(s, "clm-2200: last commit %lu, current %lu",
-                       journal->j_last_commit_id,
-		       jl->j_trans_id);
-  }
-  journal->j_last_commit_id = jl->j_trans_id;
-
-  /* now, every commit block is on the disk.  It is safe to allow blocks freed during this transaction to be reallocated */
-  cleanup_freed_for_journal_list(s, jl) ;
-
-  retval = retval ? retval : journal->j_errno;
-
-  /* mark the metadata dirty */
-  if (!retval)
-    dirty_one_transaction(s, jl);
-  atomic_dec(&(jl->j_commit_left)) ;
-
-  if (flushall) {
-    atomic_set(&(jl->j_older_commits_done), 1) ;
-  }
-  up(&jl->j_commit_lock);
-put_jl:
-  put_journal_list(s, jl);
-
-  if (retval)
-    reiserfs_abort (s, retval, "Journal write error in %s", __FUNCTION__);
-  put_fs_excl();
-  return retval;
+		retval = -EIO;
+	}
+	bforget(jl->j_commit_bh);
+	if (journal->j_last_commit_id != 0 &&
+	    (jl->j_trans_id - journal->j_last_commit_id) != 1) {
+		reiserfs_warning(s, "clm-2200: last commit %lu, current %lu",
+				 journal->j_last_commit_id, jl->j_trans_id);
+	}
+	journal->j_last_commit_id = jl->j_trans_id;
+
+	/* now, every commit block is on the disk.  It is safe to allow blocks freed during this transaction to be reallocated */
+	cleanup_freed_for_journal_list(s, jl);
+
+	retval = retval ? retval : journal->j_errno;
+
+	/* mark the metadata dirty */
+	if (!retval)
+		dirty_one_transaction(s, jl);
+	atomic_dec(&(jl->j_commit_left));
+
+	if (flushall) {
+		atomic_set(&(jl->j_older_commits_done), 1);
+	}
+	up(&jl->j_commit_lock);
+      put_jl:
+	put_journal_list(s, jl);
+
+	if (retval)
+		reiserfs_abort(s, retval, "Journal write error in %s",
+			       __FUNCTION__);
+	put_fs_excl();
+	return retval;
 }
 
 /*
 ** flush_journal_list frequently needs to find a newer transaction for a given block.  This does that, or 
 ** returns NULL if it can't find anything 
 */
-static struct reiserfs_journal_list *find_newer_jl_for_cn(struct reiserfs_journal_cnode *cn) {
-  struct super_block *sb = cn->sb;
-  b_blocknr_t blocknr = cn->blocknr ;
+static struct reiserfs_journal_list *find_newer_jl_for_cn(struct
+							  reiserfs_journal_cnode
+							  *cn)
+{
+	struct super_block *sb = cn->sb;
+	b_blocknr_t blocknr = cn->blocknr;
 
-  cn = cn->hprev ;
-  while(cn) {
-    if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) {
-      return cn->jlist ;
-    }
-    cn = cn->hprev ;
-  }
-  return NULL ;
+	cn = cn->hprev;
+	while (cn) {
+		if (cn->sb == sb && cn->blocknr == blocknr && cn->jlist) {
+			return cn->jlist;
+		}
+		cn = cn->hprev;
+	}
+	return NULL;
 }
 
-static void remove_journal_hash(struct super_block *, struct reiserfs_journal_cnode **,
-struct reiserfs_journal_list *, unsigned long, int);
+static void remove_journal_hash(struct super_block *,
+				struct reiserfs_journal_cnode **,
+				struct reiserfs_journal_list *, unsigned long,
+				int);
 
 /*
 ** once all the real blocks have been flushed, it is safe to remove them from the
 ** journal list for this transaction.  Aside from freeing the cnode, this also allows the
 ** block to be reallocated for data blocks if it had been deleted.
 */
-static void remove_all_from_journal_list(struct super_block *p_s_sb, struct reiserfs_journal_list *jl, int debug) {
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-  struct reiserfs_journal_cnode *cn, *last ;
-  cn = jl->j_realblock ;
-
-  /* which is better, to lock once around the whole loop, or
-  ** to lock for each call to remove_journal_hash?
-  */
-  while(cn) {
-    if (cn->blocknr != 0) {
-      if (debug) {
-       reiserfs_warning (p_s_sb, "block %u, bh is %d, state %ld", cn->blocknr,
-                         cn->bh ? 1: 0, cn->state) ;
-      }
-      cn->state = 0 ;
-      remove_journal_hash(p_s_sb, journal->j_list_hash_table, jl, cn->blocknr, 1) ;
-    }
-    last = cn ;
-    cn = cn->next ;
-    free_cnode(p_s_sb, last) ;
-  }
-  jl->j_realblock = NULL ;
+static void remove_all_from_journal_list(struct super_block *p_s_sb,
+					 struct reiserfs_journal_list *jl,
+					 int debug)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal_cnode *cn, *last;
+	cn = jl->j_realblock;
+
+	/* which is better, to lock once around the whole loop, or
+	 ** to lock for each call to remove_journal_hash?
+	 */
+	while (cn) {
+		if (cn->blocknr != 0) {
+			if (debug) {
+				reiserfs_warning(p_s_sb,
+						 "block %u, bh is %d, state %ld",
+						 cn->blocknr, cn->bh ? 1 : 0,
+						 cn->state);
+			}
+			cn->state = 0;
+			remove_journal_hash(p_s_sb, journal->j_list_hash_table,
+					    jl, cn->blocknr, 1);
+		}
+		last = cn;
+		cn = cn->next;
+		free_cnode(p_s_sb, last);
+	}
+	jl->j_realblock = NULL;
 }
 
 /*
@@ -1122,98 +1195,107 @@ static void remove_all_from_journal_list(struct super_block *p_s_sb, struct reis
 ** called by flush_journal_list, before it calls remove_all_from_journal_list
 **
 */
-static int _update_journal_header_block(struct super_block *p_s_sb, unsigned long offset, unsigned long trans_id) {
-  struct reiserfs_journal_header *jh ;
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+static int _update_journal_header_block(struct super_block *p_s_sb,
+					unsigned long offset,
+					unsigned long trans_id)
+{
+	struct reiserfs_journal_header *jh;
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
 
-  if (reiserfs_is_journal_aborted (journal))
-    return -EIO;
+	if (reiserfs_is_journal_aborted(journal))
+		return -EIO;
 
-  if (trans_id >= journal->j_last_flush_trans_id) {
-    if (buffer_locked((journal->j_header_bh)))  {
-      wait_on_buffer((journal->j_header_bh)) ;
-      if (unlikely (!buffer_uptodate(journal->j_header_bh))) {
+	if (trans_id >= journal->j_last_flush_trans_id) {
+		if (buffer_locked((journal->j_header_bh))) {
+			wait_on_buffer((journal->j_header_bh));
+			if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
 #ifdef CONFIG_REISERFS_CHECK
-        reiserfs_warning (p_s_sb, "journal-699: buffer write failed") ;
+				reiserfs_warning(p_s_sb,
+						 "journal-699: buffer write failed");
 #endif
-        return -EIO;
-      }
-    }
-    journal->j_last_flush_trans_id = trans_id ;
-    journal->j_first_unflushed_offset = offset ;
-    jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data) ;
-    jh->j_last_flush_trans_id = cpu_to_le32(trans_id) ;
-    jh->j_first_unflushed_offset = cpu_to_le32(offset) ;
-    jh->j_mount_id = cpu_to_le32(journal->j_mount_id) ;
-
-    if (reiserfs_barrier_flush(p_s_sb)) {
-	int ret;
-	lock_buffer(journal->j_header_bh);
-	ret = submit_barrier_buffer(journal->j_header_bh);
-	if (ret == -EOPNOTSUPP) {
-	    set_buffer_uptodate(journal->j_header_bh);
-	    disable_barrier(p_s_sb);
-	    goto sync;
-	}
-	wait_on_buffer(journal->j_header_bh);
-	check_barrier_completion(p_s_sb, journal->j_header_bh);
-    } else {
-sync:
-	set_buffer_dirty(journal->j_header_bh) ;
-	sync_dirty_buffer(journal->j_header_bh) ;
-    }
-    if (!buffer_uptodate(journal->j_header_bh)) {
-      reiserfs_warning (p_s_sb, "journal-837: IO error during journal replay");
-      return -EIO ;
-    }
-  }
-  return 0 ;
-}
-
-static int update_journal_header_block(struct super_block *p_s_sb, 
-                                       unsigned long offset, 
-				       unsigned long trans_id) {
-    return _update_journal_header_block(p_s_sb, offset, trans_id);
+				return -EIO;
+			}
+		}
+		journal->j_last_flush_trans_id = trans_id;
+		journal->j_first_unflushed_offset = offset;
+		jh = (struct reiserfs_journal_header *)(journal->j_header_bh->
+							b_data);
+		jh->j_last_flush_trans_id = cpu_to_le32(trans_id);
+		jh->j_first_unflushed_offset = cpu_to_le32(offset);
+		jh->j_mount_id = cpu_to_le32(journal->j_mount_id);
+
+		if (reiserfs_barrier_flush(p_s_sb)) {
+			int ret;
+			lock_buffer(journal->j_header_bh);
+			ret = submit_barrier_buffer(journal->j_header_bh);
+			if (ret == -EOPNOTSUPP) {
+				set_buffer_uptodate(journal->j_header_bh);
+				disable_barrier(p_s_sb);
+				goto sync;
+			}
+			wait_on_buffer(journal->j_header_bh);
+			check_barrier_completion(p_s_sb, journal->j_header_bh);
+		} else {
+		      sync:
+			set_buffer_dirty(journal->j_header_bh);
+			sync_dirty_buffer(journal->j_header_bh);
+		}
+		if (!buffer_uptodate(journal->j_header_bh)) {
+			reiserfs_warning(p_s_sb,
+					 "journal-837: IO error during journal replay");
+			return -EIO;
+		}
+	}
+	return 0;
+}
+
+static int update_journal_header_block(struct super_block *p_s_sb,
+				       unsigned long offset,
+				       unsigned long trans_id)
+{
+	return _update_journal_header_block(p_s_sb, offset, trans_id);
 }
+
 /* 
 ** flush any and all journal lists older than you are 
 ** can only be called from flush_journal_list
 */
 static int flush_older_journal_lists(struct super_block *p_s_sb,
-                                     struct reiserfs_journal_list *jl)
-{
-    struct list_head *entry;
-    struct reiserfs_journal_list *other_jl ;
-    struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-    unsigned long trans_id = jl->j_trans_id;
-
-    /* we know we are the only ones flushing things, no extra race
-     * protection is required.
-     */
-restart:
-    entry = journal->j_journal_list.next;
-    /* Did we wrap? */
-    if (entry == &journal->j_journal_list)
-        return 0;
-    other_jl = JOURNAL_LIST_ENTRY(entry);
-    if (other_jl->j_trans_id < trans_id) {
-        BUG_ON (other_jl->j_refcount <= 0);
-	/* do not flush all */
-	flush_journal_list(p_s_sb, other_jl, 0) ;
-
-	/* other_jl is now deleted from the list */
-	goto restart;
-    }
-    return 0 ;
+				     struct reiserfs_journal_list *jl)
+{
+	struct list_head *entry;
+	struct reiserfs_journal_list *other_jl;
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	unsigned long trans_id = jl->j_trans_id;
+
+	/* we know we are the only ones flushing things, no extra race
+	 * protection is required.
+	 */
+      restart:
+	entry = journal->j_journal_list.next;
+	/* Did we wrap? */
+	if (entry == &journal->j_journal_list)
+		return 0;
+	other_jl = JOURNAL_LIST_ENTRY(entry);
+	if (other_jl->j_trans_id < trans_id) {
+		BUG_ON(other_jl->j_refcount <= 0);
+		/* do not flush all */
+		flush_journal_list(p_s_sb, other_jl, 0);
+
+		/* other_jl is now deleted from the list */
+		goto restart;
+	}
+	return 0;
 }
 
 static void del_from_work_list(struct super_block *s,
-                               struct reiserfs_journal_list *jl) {
-    struct reiserfs_journal *journal = SB_JOURNAL (s);
-    if (!list_empty(&jl->j_working_list)) {
-	list_del_init(&jl->j_working_list);
-	journal->j_num_work_lists--;
-    }
+			       struct reiserfs_journal_list *jl)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	if (!list_empty(&jl->j_working_list)) {
+		list_del_init(&jl->j_working_list);
+		journal->j_num_work_lists--;
+	}
 }
 
 /* flush a journal list, both commit and real blocks
@@ -1225,439 +1307,461 @@ static void del_from_work_list(struct super_block *s,
 ** and the journal is locked.  That means it can only be called from 
 ** do_journal_end, or by journal_release
 */
-static int flush_journal_list(struct super_block *s, 
-                              struct reiserfs_journal_list *jl, int flushall) {
-  struct reiserfs_journal_list *pjl ;
-  struct reiserfs_journal_cnode *cn, *last ;
-  int count ;
-  int was_jwait = 0 ;
-  int was_dirty = 0 ;
-  struct buffer_head *saved_bh ; 
-  unsigned long j_len_saved = jl->j_len ;
-  struct reiserfs_journal *journal = SB_JOURNAL (s);
-  int err = 0;
-
-  BUG_ON (j_len_saved <= 0);
-
-  if (atomic_read(&journal->j_wcount) != 0) {
-    reiserfs_warning(s, "clm-2048: flush_journal_list called with wcount %d",
-                      atomic_read(&journal->j_wcount)) ;
-  }
-  BUG_ON (jl->j_trans_id == 0);
-
-  /* if flushall == 0, the lock is already held */
-  if (flushall) {
-      down(&journal->j_flush_sem);
-  } else if (!down_trylock(&journal->j_flush_sem)) {
-      BUG();
-  }
-
-  count = 0 ;
-  if (j_len_saved > journal->j_trans_max) {
-    reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, trans id %lu\n", j_len_saved, jl->j_trans_id);
-    return 0 ;
-  }
-
-  get_fs_excl();
-
-  /* if all the work is already done, get out of here */
-  if (atomic_read(&(jl->j_nonzerolen)) <= 0 && 
-      atomic_read(&(jl->j_commit_left)) <= 0) {
-    goto flush_older_and_return ;
-  } 
-
-  /* start by putting the commit list on disk.  This will also flush 
-  ** the commit lists of any olders transactions
-  */
-  flush_commit_list(s, jl, 1) ;
-
-  if (!(jl->j_state & LIST_DIRTY) && !reiserfs_is_journal_aborted (journal))
-      BUG();
-
-  /* are we done now? */
-  if (atomic_read(&(jl->j_nonzerolen)) <= 0 && 
-      atomic_read(&(jl->j_commit_left)) <= 0) {
-    goto flush_older_and_return ;
-  }
-
-  /* loop through each cnode, see if we need to write it, 
-  ** or wait on a more recent transaction, or just ignore it 
-  */
-  if (atomic_read(&(journal->j_wcount)) != 0) {
-    reiserfs_panic(s, "journal-844: panic journal list is flushing, wcount is not 0\n") ;
-  }
-  cn = jl->j_realblock ;
-  while(cn) {
-    was_jwait = 0 ;
-    was_dirty = 0 ;
-    saved_bh = NULL ;
-    /* blocknr of 0 is no longer in the hash, ignore it */
-    if (cn->blocknr == 0) {
-      goto free_cnode ;
-    }
-
-    /* This transaction failed commit. Don't write out to the disk */
-    if (!(jl->j_state & LIST_DIRTY))
-        goto free_cnode;
-
-    pjl = find_newer_jl_for_cn(cn) ;
-    /* the order is important here.  We check pjl to make sure we
-    ** don't clear BH_JDirty_wait if we aren't the one writing this
-    ** block to disk
-    */
-    if (!pjl && cn->bh) {
-      saved_bh = cn->bh ;
-
-      /* we do this to make sure nobody releases the buffer while 
-      ** we are working with it 
-      */
-      get_bh(saved_bh) ;
-
-      if (buffer_journal_dirty(saved_bh)) {
-        BUG_ON (!can_dirty (cn));
-        was_jwait = 1 ;
-        was_dirty = 1 ;
-      } else if (can_dirty(cn)) {
-        /* everything with !pjl && jwait should be writable */
-	BUG();
-      }
-    }
-
-    /* if someone has this block in a newer transaction, just make
-    ** sure they are commited, and don't try writing it to disk
-    */
-    if (pjl) {
-      if (atomic_read(&pjl->j_commit_left))
-        flush_commit_list(s, pjl, 1) ;
-      goto free_cnode ;
-    }
-
-    /* bh == NULL when the block got to disk on its own, OR, 
-    ** the block got freed in a future transaction 
-    */
-    if (saved_bh == NULL) {
-      goto free_cnode ;
-    }
-
-    /* this should never happen.  kupdate_one_transaction has this list
-    ** locked while it works, so we should never see a buffer here that
-    ** is not marked JDirty_wait
-    */
-    if ((!was_jwait) && !buffer_locked(saved_bh)) {
-	reiserfs_warning (s, "journal-813: BAD! buffer %llu %cdirty %cjwait, "
-			  "not in a newer tranasction",
-			  (unsigned long long)saved_bh->b_blocknr,
-			  was_dirty ? ' ' : '!', was_jwait ? ' ' : '!') ;
-    }
-    if (was_dirty) { 
-      /* we inc again because saved_bh gets decremented at free_cnode */
-      get_bh(saved_bh) ;
-      set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
-      lock_buffer(saved_bh);
-      BUG_ON (cn->blocknr != saved_bh->b_blocknr);
-      if (buffer_dirty(saved_bh))
-        submit_logged_buffer(saved_bh) ;
-      else
-        unlock_buffer(saved_bh);
-      count++ ;
-    } else {
-      reiserfs_warning (s, "clm-2082: Unable to flush buffer %llu in %s",
-                        (unsigned long long)saved_bh->b_blocknr, __FUNCTION__);
-    }
-free_cnode:
-    last = cn ;
-    cn = cn->next ;
-    if (saved_bh) {
-      /* we incremented this to keep others from taking the buffer head away */
-      put_bh(saved_bh) ;
-      if (atomic_read(&(saved_bh->b_count)) < 0) {
-        reiserfs_warning (s, "journal-945: saved_bh->b_count < 0");
-      }
-    }
-  }
-  if (count > 0) {
-    cn = jl->j_realblock ;
-    while(cn) {
-      if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
-	if (!cn->bh) {
-	  reiserfs_panic(s, "journal-1011: cn->bh is NULL\n") ;
-	}
-	wait_on_buffer(cn->bh) ;
-	if (!cn->bh) {
-	  reiserfs_panic(s, "journal-1012: cn->bh is NULL\n") ;
-	}
-	if (unlikely (!buffer_uptodate(cn->bh))) {
-#ifdef CONFIG_REISERFS_CHECK
-	  reiserfs_warning(s, "journal-949: buffer write failed\n") ;
-#endif
-          err = -EIO;
-  	}
-	/* note, we must clear the JDirty_wait bit after the up to date
-	** check, otherwise we race against our flushpage routine
-	*/
-        BUG_ON (!test_clear_buffer_journal_dirty (cn->bh));
-
-        /* undo the inc from journal_mark_dirty */
-	put_bh(cn->bh) ;
-        brelse(cn->bh) ;
-      }
-      cn = cn->next ;
-    }
-  }
-
-  if (err)
-    reiserfs_abort (s, -EIO, "Write error while pushing transaction to disk in %s", __FUNCTION__);
-flush_older_and_return:
-
-
-  /* before we can update the journal header block, we _must_ flush all 
-  ** real blocks from all older transactions to disk.  This is because
-  ** once the header block is updated, this transaction will not be
-  ** replayed after a crash
-  */
-  if (flushall) {
-    flush_older_journal_lists(s, jl);
-  } 
-  
-  err = journal->j_errno;
-  /* before we can remove everything from the hash tables for this 
-  ** transaction, we must make sure it can never be replayed
-  **
-  ** since we are only called from do_journal_end, we know for sure there
-  ** are no allocations going on while we are flushing journal lists.  So,
-  ** we only need to update the journal header block for the last list
-  ** being flushed
-  */
-  if (!err && flushall) {
-    err = update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(s), jl->j_trans_id) ;
-    if (err)
-        reiserfs_abort (s, -EIO, "Write error while updating journal header in %s", __FUNCTION__);
-  }
-  remove_all_from_journal_list(s, jl, 0) ;
-  list_del_init(&jl->j_list);
-  journal->j_num_lists--;
-  del_from_work_list(s, jl);
-
-  if (journal->j_last_flush_id != 0 &&
-     (jl->j_trans_id - journal->j_last_flush_id) != 1) {
-      reiserfs_warning(s, "clm-2201: last flush %lu, current %lu",
-                       journal->j_last_flush_id,
-		       jl->j_trans_id);
-  }
-  journal->j_last_flush_id = jl->j_trans_id;
-
-  /* not strictly required since we are freeing the list, but it should
-   * help find code using dead lists later on
-   */
-  jl->j_len = 0 ;
-  atomic_set(&(jl->j_nonzerolen), 0) ;
-  jl->j_start = 0 ;
-  jl->j_realblock = NULL ;
-  jl->j_commit_bh = NULL ;
-  jl->j_trans_id = 0 ;
-  jl->j_state = 0;
-  put_journal_list(s, jl);
-  if (flushall)
-    up(&journal->j_flush_sem);
-  put_fs_excl();
-  return err ;
-} 
-
-static int write_one_transaction(struct super_block *s,
-                                 struct reiserfs_journal_list *jl,
-				 struct buffer_chunk *chunk)
+static int flush_journal_list(struct super_block *s,
+			      struct reiserfs_journal_list *jl, int flushall)
 {
-    struct reiserfs_journal_cnode *cn;
-    int ret = 0 ;
-
-    jl->j_state |= LIST_TOUCHED;
-    del_from_work_list(s, jl);
-    if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
-        return 0;
-    }
-
-    cn = jl->j_realblock ;
-    while(cn) {
-        /* if the blocknr == 0, this has been cleared from the hash,
-        ** skip it
-        */
-        if (cn->blocknr == 0) {
-            goto next ;
-        }
-        if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
-	    struct buffer_head *tmp_bh;
-	    /* we can race against journal_mark_freed when we try
-	     * to lock_buffer(cn->bh), so we have to inc the buffer
-	     * count, and recheck things after locking
-	     */
-	    tmp_bh = cn->bh;
-	    get_bh(tmp_bh);
-	    lock_buffer(tmp_bh);
-	    if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
-		if (!buffer_journal_dirty(tmp_bh) ||
-		    buffer_journal_prepared(tmp_bh))
-		    BUG();
-		add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
-		ret++;
-	    } else {
-		/* note, cn->bh might be null now */
-		unlock_buffer(tmp_bh);
-	    }
-	    put_bh(tmp_bh);
-        }
-next:
-        cn = cn->next ;
-	cond_resched();
-    }
-    return ret ;
-}
+	struct reiserfs_journal_list *pjl;
+	struct reiserfs_journal_cnode *cn, *last;
+	int count;
+	int was_jwait = 0;
+	int was_dirty = 0;
+	struct buffer_head *saved_bh;
+	unsigned long j_len_saved = jl->j_len;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	int err = 0;
+
+	BUG_ON(j_len_saved <= 0);
+
+	if (atomic_read(&journal->j_wcount) != 0) {
+		reiserfs_warning(s,
+				 "clm-2048: flush_journal_list called with wcount %d",
+				 atomic_read(&journal->j_wcount));
+	}
+	BUG_ON(jl->j_trans_id == 0);
 
-/* used by flush_commit_list */
-static int dirty_one_transaction(struct super_block *s,
-                                 struct reiserfs_journal_list *jl)
-{
-    struct reiserfs_journal_cnode *cn;
-    struct reiserfs_journal_list *pjl;
-    int ret = 0 ;
-
-    jl->j_state |= LIST_DIRTY;
-    cn = jl->j_realblock ;
-    while(cn) {
-        /* look for a more recent transaction that logged this
-        ** buffer.  Only the most recent transaction with a buffer in
-        ** it is allowed to send that buffer to disk
-        */
-	pjl = find_newer_jl_for_cn(cn) ;
-        if (!pjl && cn->blocknr && cn->bh && buffer_journal_dirty(cn->bh))
-	{
-	    BUG_ON (!can_dirty(cn));
-	    /* if the buffer is prepared, it will either be logged
-	     * or restored.  If restored, we need to make sure
-	     * it actually gets marked dirty
-	     */
-            clear_buffer_journal_new (cn->bh);
-            if (buffer_journal_prepared (cn->bh)) {
-                set_buffer_journal_restore_dirty (cn->bh);
-	    } else {
-                set_buffer_journal_test (cn->bh);
-	        mark_buffer_dirty(cn->bh);
-	    }
-        } 
-        cn = cn->next ;
-    }
-    return ret ;
-}
+	/* if flushall == 0, the lock is already held */
+	if (flushall) {
+		down(&journal->j_flush_sem);
+	} else if (!down_trylock(&journal->j_flush_sem)) {
+		BUG();
+	}
 
-static int kupdate_transactions(struct super_block *s,
-                                   struct reiserfs_journal_list *jl,
-				   struct reiserfs_journal_list **next_jl,
-				   unsigned long *next_trans_id,
-				   int num_blocks,
-				   int num_trans) {
-    int ret = 0;
-    int written = 0 ;
-    int transactions_flushed = 0;
-    unsigned long orig_trans_id = jl->j_trans_id;
-    struct buffer_chunk chunk;
-    struct list_head *entry;
-    struct reiserfs_journal *journal = SB_JOURNAL (s);
-    chunk.nr = 0;
-
-    down(&journal->j_flush_sem);
-    if (!journal_list_still_alive(s, orig_trans_id)) {
-	goto done;
-    }
-
-    /* we've got j_flush_sem held, nobody is going to delete any
-     * of these lists out from underneath us
-     */
-    while((num_trans && transactions_flushed < num_trans) ||
-          (!num_trans && written < num_blocks)) {
-
-	if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
-	    atomic_read(&jl->j_commit_left) || !(jl->j_state & LIST_DIRTY))
-	{
-	    del_from_work_list(s, jl);
-	    break;
-	}
-	ret = write_one_transaction(s, jl, &chunk);
-
-	if (ret < 0)
-	    goto done;
-	transactions_flushed++;
-	written += ret;
-	entry = jl->j_list.next;
-
-	/* did we wrap? */
-	if (entry == &journal->j_journal_list) {
-	    break;
-        }
-	jl = JOURNAL_LIST_ENTRY(entry);
-
-	/* don't bother with older transactions */
-	if (jl->j_trans_id <= orig_trans_id)
-	    break;
-    }
-    if (chunk.nr) {
-        write_chunk(&chunk);
-    }
-
-done:
-    up(&journal->j_flush_sem);
-    return ret;
-}
+	count = 0;
+	if (j_len_saved > journal->j_trans_max) {
+		reiserfs_panic(s,
+			       "journal-715: flush_journal_list, length is %lu, trans id %lu\n",
+			       j_len_saved, jl->j_trans_id);
+		return 0;
+	}
 
-/* for o_sync and fsync heavy applications, they tend to use
-** all the journa list slots with tiny transactions.  These
-** trigger lots and lots of calls to update the header block, which
-** adds seeks and slows things down.
-**
-** This function tries to clear out a large chunk of the journal lists
-** at once, which makes everything faster since only the newest journal
+	get_fs_excl();
+
+	/* if all the work is already done, get out of here */
+	if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
+	    atomic_read(&(jl->j_commit_left)) <= 0) {
+		goto flush_older_and_return;
+	}
+
+	/* start by putting the commit list on disk.  This will also flush 
+	 ** the commit lists of any older transactions
+	 */
+	flush_commit_list(s, jl, 1);
+
+	if (!(jl->j_state & LIST_DIRTY)
+	    && !reiserfs_is_journal_aborted(journal))
+		BUG();
+
+	/* are we done now? */
+	if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
+	    atomic_read(&(jl->j_commit_left)) <= 0) {
+		goto flush_older_and_return;
+	}
+
+	/* loop through each cnode, see if we need to write it, 
+	 ** or wait on a more recent transaction, or just ignore it 
+	 */
+	if (atomic_read(&(journal->j_wcount)) != 0) {
+		reiserfs_panic(s,
+			       "journal-844: panic journal list is flushing, wcount is not 0\n");
+	}
+	cn = jl->j_realblock;
+	while (cn) {
+		was_jwait = 0;
+		was_dirty = 0;
+		saved_bh = NULL;
+		/* blocknr of 0 is no longer in the hash, ignore it */
+		if (cn->blocknr == 0) {
+			goto free_cnode;
+		}
+
+		/* This transaction failed commit. Don't write out to the disk */
+		if (!(jl->j_state & LIST_DIRTY))
+			goto free_cnode;
+
+		pjl = find_newer_jl_for_cn(cn);
+		/* the order is important here.  We check pjl to make sure we
+		 ** don't clear BH_JDirty_wait if we aren't the one writing this
+		 ** block to disk
+		 */
+		if (!pjl && cn->bh) {
+			saved_bh = cn->bh;
+
+			/* we do this to make sure nobody releases the buffer while 
+			 ** we are working with it 
+			 */
+			get_bh(saved_bh);
+
+			if (buffer_journal_dirty(saved_bh)) {
+				BUG_ON(!can_dirty(cn));
+				was_jwait = 1;
+				was_dirty = 1;
+			} else if (can_dirty(cn)) {
+				/* everything with !pjl && jwait should be writable */
+				BUG();
+			}
+		}
+
+		/* if someone has this block in a newer transaction, just make
+		 ** sure they are committed, and don't try writing it to disk
+		 */
+		if (pjl) {
+			if (atomic_read(&pjl->j_commit_left))
+				flush_commit_list(s, pjl, 1);
+			goto free_cnode;
+		}
+
+		/* bh == NULL when the block got to disk on its own, OR, 
+		 ** the block got freed in a future transaction 
+		 */
+		if (saved_bh == NULL) {
+			goto free_cnode;
+		}
+
+		/* this should never happen.  kupdate_one_transaction has this list
+		 ** locked while it works, so we should never see a buffer here that
+		 ** is not marked JDirty_wait
+		 */
+		if ((!was_jwait) && !buffer_locked(saved_bh)) {
+			reiserfs_warning(s,
+					 "journal-813: BAD! buffer %llu %cdirty %cjwait, "
+					 "not in a newer tranasction",
+					 (unsigned long long)saved_bh->
+					 b_blocknr, was_dirty ? ' ' : '!',
+					 was_jwait ? ' ' : '!');
+		}
+		if (was_dirty) {
+			/* we inc again because saved_bh gets decremented at free_cnode */
+			get_bh(saved_bh);
+			set_bit(BLOCK_NEEDS_FLUSH, &cn->state);
+			lock_buffer(saved_bh);
+			BUG_ON(cn->blocknr != saved_bh->b_blocknr);
+			if (buffer_dirty(saved_bh))
+				submit_logged_buffer(saved_bh);
+			else
+				unlock_buffer(saved_bh);
+			count++;
+		} else {
+			reiserfs_warning(s,
+					 "clm-2082: Unable to flush buffer %llu in %s",
+					 (unsigned long long)saved_bh->
+					 b_blocknr, __FUNCTION__);
+		}
+	      free_cnode:
+		last = cn;
+		cn = cn->next;
+		if (saved_bh) {
+			/* we incremented this to keep others from taking the buffer head away */
+			put_bh(saved_bh);
+			if (atomic_read(&(saved_bh->b_count)) < 0) {
+				reiserfs_warning(s,
+						 "journal-945: saved_bh->b_count < 0");
+			}
+		}
+	}
+	if (count > 0) {
+		cn = jl->j_realblock;
+		while (cn) {
+			if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
+				if (!cn->bh) {
+					reiserfs_panic(s,
+						       "journal-1011: cn->bh is NULL\n");
+				}
+				wait_on_buffer(cn->bh);
+				if (!cn->bh) {
+					reiserfs_panic(s,
+						       "journal-1012: cn->bh is NULL\n");
+				}
+				if (unlikely(!buffer_uptodate(cn->bh))) {
+#ifdef CONFIG_REISERFS_CHECK
+					reiserfs_warning(s,
+							 "journal-949: buffer write failed\n");
+#endif
+					err = -EIO;
+				}
+				/* note, we must clear the JDirty_wait bit after the up to date
+				 ** check, otherwise we race against our flushpage routine
+				 */
+				BUG_ON(!test_clear_buffer_journal_dirty
+				       (cn->bh));
+
+				/* undo the inc from journal_mark_dirty */
+				put_bh(cn->bh);
+				brelse(cn->bh);
+			}
+			cn = cn->next;
+		}
+	}
+
+	if (err)
+		reiserfs_abort(s, -EIO,
+			       "Write error while pushing transaction to disk in %s",
+			       __FUNCTION__);
+      flush_older_and_return:
+
+	/* before we can update the journal header block, we _must_ flush all 
+	 ** real blocks from all older transactions to disk.  This is because
+	 ** once the header block is updated, this transaction will not be
+	 ** replayed after a crash
+	 */
+	if (flushall) {
+		flush_older_journal_lists(s, jl);
+	}
+
+	err = journal->j_errno;
+	/* before we can remove everything from the hash tables for this 
+	 ** transaction, we must make sure it can never be replayed
+	 **
+	 ** since we are only called from do_journal_end, we know for sure there
+	 ** are no allocations going on while we are flushing journal lists.  So,
+	 ** we only need to update the journal header block for the last list
+	 ** being flushed
+	 */
+	if (!err && flushall) {
+		err =
+		    update_journal_header_block(s,
+						(jl->j_start + jl->j_len +
+						 2) % SB_ONDISK_JOURNAL_SIZE(s),
+						jl->j_trans_id);
+		if (err)
+			reiserfs_abort(s, -EIO,
+				       "Write error while updating journal header in %s",
+				       __FUNCTION__);
+	}
+	remove_all_from_journal_list(s, jl, 0);
+	list_del_init(&jl->j_list);
+	journal->j_num_lists--;
+	del_from_work_list(s, jl);
+
+	if (journal->j_last_flush_id != 0 &&
+	    (jl->j_trans_id - journal->j_last_flush_id) != 1) {
+		reiserfs_warning(s, "clm-2201: last flush %lu, current %lu",
+				 journal->j_last_flush_id, jl->j_trans_id);
+	}
+	journal->j_last_flush_id = jl->j_trans_id;
+
+	/* not strictly required since we are freeing the list, but it should
+	 * help find code using dead lists later on
+	 */
+	jl->j_len = 0;
+	atomic_set(&(jl->j_nonzerolen), 0);
+	jl->j_start = 0;
+	jl->j_realblock = NULL;
+	jl->j_commit_bh = NULL;
+	jl->j_trans_id = 0;
+	jl->j_state = 0;
+	put_journal_list(s, jl);
+	if (flushall)
+		up(&journal->j_flush_sem);
+	put_fs_excl();
+	return err;
+}
+
+static int write_one_transaction(struct super_block *s,
+				 struct reiserfs_journal_list *jl,
+				 struct buffer_chunk *chunk)
+{
+	struct reiserfs_journal_cnode *cn;
+	int ret = 0;
+
+	jl->j_state |= LIST_TOUCHED;
+	del_from_work_list(s, jl);
+	if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
+		return 0;
+	}
+
+	cn = jl->j_realblock;
+	while (cn) {
+		/* if the blocknr == 0, this has been cleared from the hash,
+		 ** skip it
+		 */
+		if (cn->blocknr == 0) {
+			goto next;
+		}
+		if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
+			struct buffer_head *tmp_bh;
+			/* we can race against journal_mark_freed when we try
+			 * to lock_buffer(cn->bh), so we have to inc the buffer
+			 * count, and recheck things after locking
+			 */
+			tmp_bh = cn->bh;
+			get_bh(tmp_bh);
+			lock_buffer(tmp_bh);
+			if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
+				if (!buffer_journal_dirty(tmp_bh) ||
+				    buffer_journal_prepared(tmp_bh))
+					BUG();
+				add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
+				ret++;
+			} else {
+				/* note, cn->bh might be null now */
+				unlock_buffer(tmp_bh);
+			}
+			put_bh(tmp_bh);
+		}
+	      next:
+		cn = cn->next;
+		cond_resched();
+	}
+	return ret;
+}
+
+/* used by flush_commit_list */
+static int dirty_one_transaction(struct super_block *s,
+				 struct reiserfs_journal_list *jl)
+{
+	struct reiserfs_journal_cnode *cn;
+	struct reiserfs_journal_list *pjl;
+	int ret = 0;
+
+	jl->j_state |= LIST_DIRTY;
+	cn = jl->j_realblock;
+	while (cn) {
+		/* look for a more recent transaction that logged this
+		 ** buffer.  Only the most recent transaction with a buffer in
+		 ** it is allowed to send that buffer to disk
+		 */
+		pjl = find_newer_jl_for_cn(cn);
+		if (!pjl && cn->blocknr && cn->bh
+		    && buffer_journal_dirty(cn->bh)) {
+			BUG_ON(!can_dirty(cn));
+			/* if the buffer is prepared, it will either be logged
+			 * or restored.  If restored, we need to make sure
+			 * it actually gets marked dirty
+			 */
+			clear_buffer_journal_new(cn->bh);
+			if (buffer_journal_prepared(cn->bh)) {
+				set_buffer_journal_restore_dirty(cn->bh);
+			} else {
+				set_buffer_journal_test(cn->bh);
+				mark_buffer_dirty(cn->bh);
+			}
+		}
+		cn = cn->next;
+	}
+	return ret;
+}
+
+static int kupdate_transactions(struct super_block *s,
+				struct reiserfs_journal_list *jl,
+				struct reiserfs_journal_list **next_jl,
+				unsigned long *next_trans_id,
+				int num_blocks, int num_trans)
+{
+	int ret = 0;
+	int written = 0;
+	int transactions_flushed = 0;
+	unsigned long orig_trans_id = jl->j_trans_id;
+	struct buffer_chunk chunk;
+	struct list_head *entry;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	chunk.nr = 0;
+
+	down(&journal->j_flush_sem);
+	if (!journal_list_still_alive(s, orig_trans_id)) {
+		goto done;
+	}
+
+	/* we've got j_flush_sem held, nobody is going to delete any
+	 * of these lists out from underneath us
+	 */
+	while ((num_trans && transactions_flushed < num_trans) ||
+	       (!num_trans && written < num_blocks)) {
+
+		if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
+		    atomic_read(&jl->j_commit_left)
+		    || !(jl->j_state & LIST_DIRTY)) {
+			del_from_work_list(s, jl);
+			break;
+		}
+		ret = write_one_transaction(s, jl, &chunk);
+
+		if (ret < 0)
+			goto done;
+		transactions_flushed++;
+		written += ret;
+		entry = jl->j_list.next;
+
+		/* did we wrap? */
+		if (entry == &journal->j_journal_list) {
+			break;
+		}
+		jl = JOURNAL_LIST_ENTRY(entry);
+
+		/* don't bother with older transactions */
+		if (jl->j_trans_id <= orig_trans_id)
+			break;
+	}
+	if (chunk.nr) {
+		write_chunk(&chunk);
+	}
+
+      done:
+	up(&journal->j_flush_sem);
+	return ret;
+}
+
+/* for o_sync and fsync heavy applications, they tend to use
+** all the journal list slots with tiny transactions.  These
+** trigger lots and lots of calls to update the header block, which
+** adds seeks and slows things down.
+**
+** This function tries to clear out a large chunk of the journal lists
+** at once, which makes everything faster since only the newest journal
 ** list updates the header block
 */
 static int flush_used_journal_lists(struct super_block *s,
-                                    struct reiserfs_journal_list *jl) {
-    unsigned long len = 0;
-    unsigned long cur_len;
-    int ret;
-    int i;
-    int limit = 256;
-    struct reiserfs_journal_list *tjl;
-    struct reiserfs_journal_list *flush_jl;
-    unsigned long trans_id;
-    struct reiserfs_journal *journal = SB_JOURNAL (s);
-
-    flush_jl = tjl = jl;
-
-    /* in data logging mode, try harder to flush a lot of blocks */
-    if (reiserfs_data_log(s))
-	limit = 1024;
-    /* flush for 256 transactions or limit blocks, whichever comes first */
-    for(i = 0 ; i < 256 && len < limit ; i++) {
-	if (atomic_read(&tjl->j_commit_left) ||
-	    tjl->j_trans_id < jl->j_trans_id) {
-	    break;
-	}
-	cur_len = atomic_read(&tjl->j_nonzerolen);
-	if (cur_len > 0) {
-	    tjl->j_state &= ~LIST_TOUCHED;
-	}
-	len += cur_len;
-	flush_jl = tjl;
-	if (tjl->j_list.next == &journal->j_journal_list)
-	    break;
-	tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
-    }
-    /* try to find a group of blocks we can flush across all the
-    ** transactions, but only bother if we've actually spanned
-    ** across multiple lists
-    */
-    if (flush_jl != jl) {
-        ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
-    }
-    flush_journal_list(s, flush_jl, 1);
-    return 0;
+				    struct reiserfs_journal_list *jl)
+{
+	unsigned long len = 0;
+	unsigned long cur_len;
+	int ret;
+	int i;
+	int limit = 256;
+	struct reiserfs_journal_list *tjl;
+	struct reiserfs_journal_list *flush_jl;
+	unsigned long trans_id;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+
+	flush_jl = tjl = jl;
+
+	/* in data logging mode, try harder to flush a lot of blocks */
+	if (reiserfs_data_log(s))
+		limit = 1024;
+	/* flush for 256 transactions or limit blocks, whichever comes first */
+	for (i = 0; i < 256 && len < limit; i++) {
+		if (atomic_read(&tjl->j_commit_left) ||
+		    tjl->j_trans_id < jl->j_trans_id) {
+			break;
+		}
+		cur_len = atomic_read(&tjl->j_nonzerolen);
+		if (cur_len > 0) {
+			tjl->j_state &= ~LIST_TOUCHED;
+		}
+		len += cur_len;
+		flush_jl = tjl;
+		if (tjl->j_list.next == &journal->j_journal_list)
+			break;
+		tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
+	}
+	/* try to find a group of blocks we can flush across all the
+	 ** transactions, but only bother if we've actually spanned
+	 ** across multiple lists
+	 */
+	if (flush_jl != jl) {
+		ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
+	}
+	flush_journal_list(s, flush_jl, 1);
+	return 0;
 }
 
 /*
@@ -1665,207 +1769,248 @@ static int flush_used_journal_lists(struct super_block *s,
 ** only touchs the hnext and hprev pointers.
 */
 void remove_journal_hash(struct super_block *sb,
-			struct reiserfs_journal_cnode **table,
-			struct reiserfs_journal_list *jl,
-			unsigned long block, int remove_freed)
-{
-  struct reiserfs_journal_cnode *cur ;
-  struct reiserfs_journal_cnode **head ;
-
-  head= &(journal_hash(table, sb, block)) ;
-  if (!head) {
-    return ;
-  }
-  cur = *head ;
-  while(cur) {
-    if (cur->blocknr == block && cur->sb == sb && (jl == NULL || jl == cur->jlist) && 
-        (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) {
-      if (cur->hnext) {
-        cur->hnext->hprev = cur->hprev ;
-      }
-      if (cur->hprev) {
-	cur->hprev->hnext = cur->hnext ;
-      } else {
-	*head = cur->hnext ;
-      }
-      cur->blocknr = 0 ;
-      cur->sb = NULL ;
-      cur->state = 0 ;
-      if (cur->bh && cur->jlist) /* anybody who clears the cur->bh will also dec the nonzerolen */
-	atomic_dec(&(cur->jlist->j_nonzerolen)) ;
-      cur->bh = NULL ;
-      cur->jlist = NULL ;
-    } 
-    cur = cur->hnext ;
-  }
-}
-
-static void free_journal_ram(struct super_block *p_s_sb) {
-  struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
-  reiserfs_kfree(journal->j_current_jl,
-                 sizeof(struct reiserfs_journal_list), p_s_sb);
-  journal->j_num_lists--;
-
-  vfree(journal->j_cnode_free_orig) ;
-  free_list_bitmaps(p_s_sb, journal->j_list_bitmap) ;
-  free_bitmap_nodes(p_s_sb) ; /* must be after free_list_bitmaps */
-  if (journal->j_header_bh) {
-    brelse(journal->j_header_bh) ;
-  }
-  /* j_header_bh is on the journal dev, make sure not to release the journal
-   * dev until we brelse j_header_bh
-   */
-  release_journal_dev(p_s_sb, journal);
-  vfree(journal) ;
+			 struct reiserfs_journal_cnode **table,
+			 struct reiserfs_journal_list *jl,
+			 unsigned long block, int remove_freed)
+{
+	struct reiserfs_journal_cnode *cur;
+	struct reiserfs_journal_cnode **head;
+
+	head = &(journal_hash(table, sb, block));
+	if (!head) {
+		return;
+	}
+	cur = *head;
+	while (cur) {
+		if (cur->blocknr == block && cur->sb == sb
+		    && (jl == NULL || jl == cur->jlist)
+		    && (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) {
+			if (cur->hnext) {
+				cur->hnext->hprev = cur->hprev;
+			}
+			if (cur->hprev) {
+				cur->hprev->hnext = cur->hnext;
+			} else {
+				*head = cur->hnext;
+			}
+			cur->blocknr = 0;
+			cur->sb = NULL;
+			cur->state = 0;
+			if (cur->bh && cur->jlist)	/* anybody who clears the cur->bh will also dec the nonzerolen */
+				atomic_dec(&(cur->jlist->j_nonzerolen));
+			cur->bh = NULL;
+			cur->jlist = NULL;
+		}
+		cur = cur->hnext;
+	}
+}
+
+static void free_journal_ram(struct super_block *p_s_sb)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	reiserfs_kfree(journal->j_current_jl,
+		       sizeof(struct reiserfs_journal_list), p_s_sb);
+	journal->j_num_lists--;
+
+	vfree(journal->j_cnode_free_orig);
+	free_list_bitmaps(p_s_sb, journal->j_list_bitmap);
+	free_bitmap_nodes(p_s_sb);	/* must be after free_list_bitmaps */
+	if (journal->j_header_bh) {
+		brelse(journal->j_header_bh);
+	}
+	/* j_header_bh is on the journal dev, make sure not to release the journal
+	 * dev until we brelse j_header_bh
+	 */
+	release_journal_dev(p_s_sb, journal);
+	vfree(journal);
 }
 
 /*
 ** call on unmount.  Only set error to 1 if you haven't made your way out
 ** of read_super() yet.  Any other caller must keep error at 0.
 */
-static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, int error) {
-  struct reiserfs_transaction_handle myth ;
-  int flushed = 0;
-  struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
-
-  /* we only want to flush out transactions if we were called with error == 0
-  */
-  if (!error && !(p_s_sb->s_flags & MS_RDONLY)) {
-    /* end the current trans */
-    BUG_ON (!th->t_trans_id);
-    do_journal_end(th, p_s_sb,10, FLUSH_ALL) ;
-
-    /* make sure something gets logged to force our way into the flush code */
-    if (!journal_join(&myth, p_s_sb, 1)) {
-        reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
-        journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
-        do_journal_end(&myth, p_s_sb,1, FLUSH_ALL) ;
-        flushed = 1;
-    }
-  }
-
-  /* this also catches errors during the do_journal_end above */
-  if (!error && reiserfs_is_journal_aborted(journal)) {
-      memset(&myth, 0, sizeof(myth));
-      if (!journal_join_abort(&myth, p_s_sb, 1)) {
-	  reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
-	  journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
-          do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL) ;
-      }
-  }
-
-  reiserfs_mounted_fs_count-- ;
-  /* wait for all commits to finish */
-  cancel_delayed_work(&SB_JOURNAL(p_s_sb)->j_work);
-  flush_workqueue(commit_wq);
-  if (!reiserfs_mounted_fs_count) {
-    destroy_workqueue(commit_wq);
-    commit_wq = NULL;
-  }
-
-  free_journal_ram(p_s_sb) ;
-
-  return 0 ;
+static int do_journal_release(struct reiserfs_transaction_handle *th,
+			      struct super_block *p_s_sb, int error)
+{
+	struct reiserfs_transaction_handle myth;
+	int flushed = 0;
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+
+	/* we only want to flush out transactions if we were called with error == 0
+	 */
+	if (!error && !(p_s_sb->s_flags & MS_RDONLY)) {
+		/* end the current trans */
+		BUG_ON(!th->t_trans_id);
+		do_journal_end(th, p_s_sb, 10, FLUSH_ALL);
+
+		/* make sure something gets logged to force our way into the flush code */
+		if (!journal_join(&myth, p_s_sb, 1)) {
+			reiserfs_prepare_for_journal(p_s_sb,
+						     SB_BUFFER_WITH_SB(p_s_sb),
+						     1);
+			journal_mark_dirty(&myth, p_s_sb,
+					   SB_BUFFER_WITH_SB(p_s_sb));
+			do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL);
+			flushed = 1;
+		}
+	}
+
+	/* this also catches errors during the do_journal_end above */
+	if (!error && reiserfs_is_journal_aborted(journal)) {
+		memset(&myth, 0, sizeof(myth));
+		if (!journal_join_abort(&myth, p_s_sb, 1)) {
+			reiserfs_prepare_for_journal(p_s_sb,
+						     SB_BUFFER_WITH_SB(p_s_sb),
+						     1);
+			journal_mark_dirty(&myth, p_s_sb,
+					   SB_BUFFER_WITH_SB(p_s_sb));
+			do_journal_end(&myth, p_s_sb, 1, FLUSH_ALL);
+		}
+	}
+
+	reiserfs_mounted_fs_count--;
+	/* wait for all commits to finish */
+	cancel_delayed_work(&SB_JOURNAL(p_s_sb)->j_work);
+	flush_workqueue(commit_wq);
+	if (!reiserfs_mounted_fs_count) {
+		destroy_workqueue(commit_wq);
+		commit_wq = NULL;
+	}
+
+	free_journal_ram(p_s_sb);
+
+	return 0;
 }
 
 /*
 ** call on unmount.  flush all journal trans, release all alloc'd ram
 */
-int journal_release(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb) {
-  return do_journal_release(th, p_s_sb, 0) ;
+int journal_release(struct reiserfs_transaction_handle *th,
+		    struct super_block *p_s_sb)
+{
+	return do_journal_release(th, p_s_sb, 0);
 }
+
 /*
 ** only call from an error condition inside reiserfs_read_super!
 */
-int journal_release_error(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb) {
-  return do_journal_release(th, p_s_sb, 1) ;
+int journal_release_error(struct reiserfs_transaction_handle *th,
+			  struct super_block *p_s_sb)
+{
+	return do_journal_release(th, p_s_sb, 1);
 }
 
 /* compares description block with commit block.  returns 1 if they differ, 0 if they are the same */
-static int journal_compare_desc_commit(struct super_block *p_s_sb, struct reiserfs_journal_desc *desc, 
-			               struct reiserfs_journal_commit *commit) {
-  if (get_commit_trans_id (commit) != get_desc_trans_id (desc) || 
-      get_commit_trans_len (commit) != get_desc_trans_len (desc) || 
-      get_commit_trans_len (commit) > SB_JOURNAL(p_s_sb)->j_trans_max ||
-      get_commit_trans_len (commit) <= 0 
-  ) {
-    return 1 ;
-  }
-  return 0 ;
+static int journal_compare_desc_commit(struct super_block *p_s_sb,
+				       struct reiserfs_journal_desc *desc,
+				       struct reiserfs_journal_commit *commit)
+{
+	if (get_commit_trans_id(commit) != get_desc_trans_id(desc) ||
+	    get_commit_trans_len(commit) != get_desc_trans_len(desc) ||
+	    get_commit_trans_len(commit) > SB_JOURNAL(p_s_sb)->j_trans_max ||
+	    get_commit_trans_len(commit) <= 0) {
+		return 1;
+	}
+	return 0;
 }
+
 /* returns 0 if it did not find a description block  
 ** returns -1 if it found a corrupt commit block
 ** returns 1 if both desc and commit were valid 
 */
-static int journal_transaction_is_valid(struct super_block *p_s_sb, struct buffer_head *d_bh, unsigned long *oldest_invalid_trans_id, unsigned long *newest_mount_id) {
-  struct reiserfs_journal_desc *desc ;
-  struct reiserfs_journal_commit *commit ;
-  struct buffer_head *c_bh ;
-  unsigned long offset ;
-
-  if (!d_bh)
-      return 0 ;
-
-  desc = (struct reiserfs_journal_desc *)d_bh->b_data ;
-  if (get_desc_trans_len(desc) > 0 && !memcmp(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8)) {
-    if (oldest_invalid_trans_id && *oldest_invalid_trans_id && get_desc_trans_id(desc) > *oldest_invalid_trans_id) {
-      reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-986: transaction "
-	              "is valid returning because trans_id %d is greater than "
-		      "oldest_invalid %lu", get_desc_trans_id(desc),
-		       *oldest_invalid_trans_id);
-      return 0 ;
-    }
-    if (newest_mount_id && *newest_mount_id > get_desc_mount_id (desc)) {
-      reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1087: transaction "
-                     "is valid returning because mount_id %d is less than "
-		     "newest_mount_id %lu", get_desc_mount_id (desc),
-		     *newest_mount_id) ;
-      return -1 ;
-    }
-    if ( get_desc_trans_len(desc) > SB_JOURNAL(p_s_sb)->j_trans_max ) {
-      reiserfs_warning(p_s_sb, "journal-2018: Bad transaction length %d encountered, ignoring transaction", get_desc_trans_len(desc));
-      return -1 ;
-    }
-    offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ;
-
-    /* ok, we have a journal description block, lets see if the transaction was valid */
-    c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
-		 ((offset + get_desc_trans_len(desc) + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
-    if (!c_bh)
-      return 0 ;
-    commit = (struct reiserfs_journal_commit *)c_bh->b_data ;
-    if (journal_compare_desc_commit(p_s_sb, desc, commit)) {
-      reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, 
-                     "journal_transaction_is_valid, commit offset %ld had bad "
-		     "time %d or length %d",
-		     c_bh->b_blocknr -  SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
-		     get_commit_trans_id (commit), 
-		     get_commit_trans_len(commit));
-      brelse(c_bh) ;
-      if (oldest_invalid_trans_id) {
-	*oldest_invalid_trans_id = get_desc_trans_id(desc) ;
-	reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1004: "
-		       "transaction_is_valid setting oldest invalid trans_id "
-		       "to %d", get_desc_trans_id(desc)) ;
-      }
-      return -1; 
-    }
-    brelse(c_bh) ;
-    reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1006: found valid "
-                   "transaction start offset %llu, len %d id %d",
-		   d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
-		   get_desc_trans_len(desc), get_desc_trans_id(desc)) ;
-    return 1 ;
-  } else {
-    return 0 ;
-  }
-}
-
-static void brelse_array(struct buffer_head **heads, int num) {
-  int i ;
-  for (i = 0 ; i < num ; i++) {
-    brelse(heads[i]) ;
-  }
+static int journal_transaction_is_valid(struct super_block *p_s_sb,
+					struct buffer_head *d_bh,
+					unsigned long *oldest_invalid_trans_id,
+					unsigned long *newest_mount_id)
+{
+	struct reiserfs_journal_desc *desc;
+	struct reiserfs_journal_commit *commit;
+	struct buffer_head *c_bh;
+	unsigned long offset;
+
+	if (!d_bh)
+		return 0;
+
+	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
+	if (get_desc_trans_len(desc) > 0
+	    && !memcmp(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8)) {
+		if (oldest_invalid_trans_id && *oldest_invalid_trans_id
+		    && get_desc_trans_id(desc) > *oldest_invalid_trans_id) {
+			reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+				       "journal-986: transaction "
+				       "is valid returning because trans_id %d is greater than "
+				       "oldest_invalid %lu",
+				       get_desc_trans_id(desc),
+				       *oldest_invalid_trans_id);
+			return 0;
+		}
+		if (newest_mount_id
+		    && *newest_mount_id > get_desc_mount_id(desc)) {
+			reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+				       "journal-1087: transaction "
+				       "is valid returning because mount_id %d is less than "
+				       "newest_mount_id %lu",
+				       get_desc_mount_id(desc),
+				       *newest_mount_id);
+			return -1;
+		}
+		if (get_desc_trans_len(desc) > SB_JOURNAL(p_s_sb)->j_trans_max) {
+			reiserfs_warning(p_s_sb,
+					 "journal-2018: Bad transaction length %d encountered, ignoring transaction",
+					 get_desc_trans_len(desc));
+			return -1;
+		}
+		offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb);
+
+		/* ok, we have a journal description block, lets see if the transaction was valid */
+		c_bh =
+		    journal_bread(p_s_sb,
+				  SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+				  ((offset + get_desc_trans_len(desc) +
+				    1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)));
+		if (!c_bh)
+			return 0;
+		commit = (struct reiserfs_journal_commit *)c_bh->b_data;
+		if (journal_compare_desc_commit(p_s_sb, desc, commit)) {
+			reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+				       "journal_transaction_is_valid, commit offset %ld had bad "
+				       "time %d or length %d",
+				       c_bh->b_blocknr -
+				       SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+				       get_commit_trans_id(commit),
+				       get_commit_trans_len(commit));
+			brelse(c_bh);
+			if (oldest_invalid_trans_id) {
+				*oldest_invalid_trans_id =
+				    get_desc_trans_id(desc);
+				reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+					       "journal-1004: "
+					       "transaction_is_valid setting oldest invalid trans_id "
+					       "to %d",
+					       get_desc_trans_id(desc));
+			}
+			return -1;
+		}
+		brelse(c_bh);
+		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+			       "journal-1006: found valid "
+			       "transaction start offset %llu, len %d id %d",
+			       d_bh->b_blocknr -
+			       SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+			       get_desc_trans_len(desc),
+			       get_desc_trans_id(desc));
+		return 1;
+	} else {
+		return 0;
+	}
+}
+
+static void brelse_array(struct buffer_head **heads, int num)
+{
+	int i;
+	for (i = 0; i < num; i++) {
+		brelse(heads[i]);
+	}
 }
 
 /*
@@ -1873,149 +2018,202 @@ static void brelse_array(struct buffer_head **heads, int num) {
 ** this either reads in a replays a transaction, or returns because the transaction
 ** is invalid, or too old.
 */
-static int journal_read_transaction(struct super_block *p_s_sb, unsigned long cur_dblock, unsigned long oldest_start, 
-				    unsigned long oldest_trans_id, unsigned long newest_mount_id) {
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-  struct reiserfs_journal_desc *desc ;
-  struct reiserfs_journal_commit *commit ;
-  unsigned long trans_id = 0 ;
-  struct buffer_head *c_bh ;
-  struct buffer_head *d_bh ;
-  struct buffer_head **log_blocks = NULL ;
-  struct buffer_head **real_blocks = NULL ;
-  unsigned long trans_offset ;
-  int i;
-  int trans_half;
-
-  d_bh = journal_bread(p_s_sb, cur_dblock) ;
-  if (!d_bh)
-    return 1 ;
-  desc = (struct reiserfs_journal_desc *)d_bh->b_data ;
-  trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ;
-  reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: "
-                 "journal_read_transaction, offset %llu, len %d mount_id %d",
-		 d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
-		 get_desc_trans_len(desc), get_desc_mount_id(desc)) ;
-  if (get_desc_trans_id(desc) < oldest_trans_id) {
-    reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1039: "
-                   "journal_read_trans skipping because %lu is too old",
-		   cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)) ;
-    brelse(d_bh) ;
-    return 1 ;
-  }
-  if (get_desc_mount_id(desc) != newest_mount_id) {
-    reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1146: "
-                   "journal_read_trans skipping because %d is != "
-		   "newest_mount_id %lu", get_desc_mount_id(desc),
-		    newest_mount_id) ;
-    brelse(d_bh) ;
-    return 1 ;
-  }
-  c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
-		((trans_offset + get_desc_trans_len(desc) + 1) % 
-		 SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
-  if (!c_bh) {
-    brelse(d_bh) ;
-    return 1 ;
-  }
-  commit = (struct reiserfs_journal_commit *)c_bh->b_data ;
-  if (journal_compare_desc_commit(p_s_sb, desc, commit)) {
-    reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal_read_transaction, "
-                   "commit offset %llu had bad time %d or length %d",
-		   c_bh->b_blocknr -  SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
-		   get_commit_trans_id(commit), get_commit_trans_len(commit));
-    brelse(c_bh) ;
-    brelse(d_bh) ;
-    return 1; 
-  }
-  trans_id = get_desc_trans_id(desc) ;
-  /* now we know we've got a good transaction, and it was inside the valid time ranges */
-  log_blocks = reiserfs_kmalloc(get_desc_trans_len(desc) * sizeof(struct buffer_head *), GFP_NOFS, p_s_sb) ;
-  real_blocks = reiserfs_kmalloc(get_desc_trans_len(desc) * sizeof(struct buffer_head *), GFP_NOFS, p_s_sb) ;
-  if (!log_blocks  || !real_blocks) {
-    brelse(c_bh) ;
-    brelse(d_bh) ;
-    reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
-    reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
-    reiserfs_warning(p_s_sb, "journal-1169: kmalloc failed, unable to mount FS") ;
-    return -1 ;
-  }
-  /* get all the buffer heads */
-  trans_half = journal_trans_half (p_s_sb->s_blocksize) ;
-  for(i = 0 ; i < get_desc_trans_len(desc) ; i++) {
-    log_blocks[i] =  journal_getblk(p_s_sb,  SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + (trans_offset + 1 + i) % SB_ONDISK_JOURNAL_SIZE(p_s_sb));
-    if (i < trans_half) {
-      real_blocks[i] = sb_getblk(p_s_sb, le32_to_cpu(desc->j_realblock[i])) ;
-    } else {
-      real_blocks[i] = sb_getblk(p_s_sb, le32_to_cpu(commit->j_realblock[i - trans_half])) ;
-    }
-    if ( real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(p_s_sb) ) {
-      reiserfs_warning(p_s_sb, "journal-1207: REPLAY FAILURE fsck required! Block to replay is outside of filesystem");
-      goto abort_replay;
-    }
-    /* make sure we don't try to replay onto log or reserved area */
-    if (is_block_in_log_or_reserved_area(p_s_sb, real_blocks[i]->b_blocknr)) {
-      reiserfs_warning(p_s_sb, "journal-1204: REPLAY FAILURE fsck required! Trying to replay onto a log block") ;
-abort_replay:
-      brelse_array(log_blocks, i) ;
-      brelse_array(real_blocks, i) ;
-      brelse(c_bh) ;
-      brelse(d_bh) ;
-      reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
-      reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
-      return -1 ;
-    }
-  }
-  /* read in the log blocks, memcpy to the corresponding real block */
-  ll_rw_block(READ, get_desc_trans_len(desc), log_blocks) ;
-  for (i = 0 ; i < get_desc_trans_len(desc) ; i++) {
-    wait_on_buffer(log_blocks[i]) ;
-    if (!buffer_uptodate(log_blocks[i])) {
-      reiserfs_warning(p_s_sb, "journal-1212: REPLAY FAILURE fsck required! buffer write failed") ;
-      brelse_array(log_blocks + i, get_desc_trans_len(desc) - i) ;
-      brelse_array(real_blocks, get_desc_trans_len(desc)) ;
-      brelse(c_bh) ;
-      brelse(d_bh) ;
-      reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
-      reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
-      return -1 ;
-    }
-    memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, real_blocks[i]->b_size) ;
-    set_buffer_uptodate(real_blocks[i]) ;
-    brelse(log_blocks[i]) ;
-  }
-  /* flush out the real blocks */
-  for (i = 0 ; i < get_desc_trans_len(desc) ; i++) {
-    set_buffer_dirty(real_blocks[i]) ;
-    ll_rw_block(WRITE, 1, real_blocks + i) ;
-  }
-  for (i = 0 ; i < get_desc_trans_len(desc) ; i++) {
-    wait_on_buffer(real_blocks[i]) ; 
-    if (!buffer_uptodate(real_blocks[i])) {
-      reiserfs_warning(p_s_sb, "journal-1226: REPLAY FAILURE, fsck required! buffer write failed") ;
-      brelse_array(real_blocks + i, get_desc_trans_len(desc) - i) ;
-      brelse(c_bh) ;
-      brelse(d_bh) ;
-      reiserfs_kfree(log_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
-      reiserfs_kfree(real_blocks, get_desc_trans_len(desc) * sizeof(struct buffer_head *), p_s_sb) ;
-      return -1 ;
-    }
-    brelse(real_blocks[i]) ;
-  }
-  cur_dblock =  SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + ((trans_offset + get_desc_trans_len(desc) + 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)) ;
-  reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1095: setting journal "
-                 "start to offset %ld",
-		 cur_dblock -  SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)) ;
-  
-  /* init starting values for the first transaction, in case this is the last transaction to be replayed. */
-  journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ;
-  journal->j_last_flush_trans_id = trans_id ;
-  journal->j_trans_id = trans_id + 1;
-  brelse(c_bh) ;
-  brelse(d_bh) ;
-  reiserfs_kfree(log_blocks, le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), p_s_sb) ;
-  reiserfs_kfree(real_blocks, le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), p_s_sb) ;
-  return 0 ;
+static int journal_read_transaction(struct super_block *p_s_sb,
+				    unsigned long cur_dblock,
+				    unsigned long oldest_start,
+				    unsigned long oldest_trans_id,
+				    unsigned long newest_mount_id)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal_desc *desc;
+	struct reiserfs_journal_commit *commit;
+	unsigned long trans_id = 0;
+	struct buffer_head *c_bh;
+	struct buffer_head *d_bh;
+	struct buffer_head **log_blocks = NULL;
+	struct buffer_head **real_blocks = NULL;
+	unsigned long trans_offset;
+	int i;
+	int trans_half;
+
+	d_bh = journal_bread(p_s_sb, cur_dblock);
+	if (!d_bh)
+		return 1;
+	desc = (struct reiserfs_journal_desc *)d_bh->b_data;
+	trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb);
+	reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: "
+		       "journal_read_transaction, offset %llu, len %d mount_id %d",
+		       d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+		       get_desc_trans_len(desc), get_desc_mount_id(desc));
+	if (get_desc_trans_id(desc) < oldest_trans_id) {
+		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1039: "
+			       "journal_read_trans skipping because %lu is too old",
+			       cur_dblock -
+			       SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb));
+		brelse(d_bh);
+		return 1;
+	}
+	if (get_desc_mount_id(desc) != newest_mount_id) {
+		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1146: "
+			       "journal_read_trans skipping because %d is != "
+			       "newest_mount_id %lu", get_desc_mount_id(desc),
+			       newest_mount_id);
+		brelse(d_bh);
+		return 1;
+	}
+	c_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+			     ((trans_offset + get_desc_trans_len(desc) + 1) %
+			      SB_ONDISK_JOURNAL_SIZE(p_s_sb)));
+	if (!c_bh) {
+		brelse(d_bh);
+		return 1;
+	}
+	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
+	if (journal_compare_desc_commit(p_s_sb, desc, commit)) {
+		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+			       "journal_read_transaction, "
+			       "commit offset %llu had bad time %d or length %d",
+			       c_bh->b_blocknr -
+			       SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+			       get_commit_trans_id(commit),
+			       get_commit_trans_len(commit));
+		brelse(c_bh);
+		brelse(d_bh);
+		return 1;
+	}
+	trans_id = get_desc_trans_id(desc);
+	/* now we know we've got a good transaction, and it was inside the valid time ranges */
+	log_blocks =
+	    reiserfs_kmalloc(get_desc_trans_len(desc) *
+			     sizeof(struct buffer_head *), GFP_NOFS, p_s_sb);
+	real_blocks =
+	    reiserfs_kmalloc(get_desc_trans_len(desc) *
+			     sizeof(struct buffer_head *), GFP_NOFS, p_s_sb);
+	if (!log_blocks || !real_blocks) {
+		brelse(c_bh);
+		brelse(d_bh);
+		reiserfs_kfree(log_blocks,
+			       get_desc_trans_len(desc) *
+			       sizeof(struct buffer_head *), p_s_sb);
+		reiserfs_kfree(real_blocks,
+			       get_desc_trans_len(desc) *
+			       sizeof(struct buffer_head *), p_s_sb);
+		reiserfs_warning(p_s_sb,
+				 "journal-1169: kmalloc failed, unable to mount FS");
+		return -1;
+	}
+	/* get all the buffer heads */
+	trans_half = journal_trans_half(p_s_sb->s_blocksize);
+	for (i = 0; i < get_desc_trans_len(desc); i++) {
+		log_blocks[i] =
+		    journal_getblk(p_s_sb,
+				   SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+				   (trans_offset + 1 +
+				    i) % SB_ONDISK_JOURNAL_SIZE(p_s_sb));
+		if (i < trans_half) {
+			real_blocks[i] =
+			    sb_getblk(p_s_sb,
+				      le32_to_cpu(desc->j_realblock[i]));
+		} else {
+			real_blocks[i] =
+			    sb_getblk(p_s_sb,
+				      le32_to_cpu(commit->
+						  j_realblock[i - trans_half]));
+		}
+		if (real_blocks[i]->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) {
+			reiserfs_warning(p_s_sb,
+					 "journal-1207: REPLAY FAILURE fsck required! Block to replay is outside of filesystem");
+			goto abort_replay;
+		}
+		/* make sure we don't try to replay onto log or reserved area */
+		if (is_block_in_log_or_reserved_area
+		    (p_s_sb, real_blocks[i]->b_blocknr)) {
+			reiserfs_warning(p_s_sb,
+					 "journal-1204: REPLAY FAILURE fsck required! Trying to replay onto a log block");
+		      abort_replay:
+			brelse_array(log_blocks, i);
+			brelse_array(real_blocks, i);
+			brelse(c_bh);
+			brelse(d_bh);
+			reiserfs_kfree(log_blocks,
+				       get_desc_trans_len(desc) *
+				       sizeof(struct buffer_head *), p_s_sb);
+			reiserfs_kfree(real_blocks,
+				       get_desc_trans_len(desc) *
+				       sizeof(struct buffer_head *), p_s_sb);
+			return -1;
+		}
+	}
+	/* read in the log blocks, memcpy to the corresponding real block */
+	ll_rw_block(READ, get_desc_trans_len(desc), log_blocks);
+	for (i = 0; i < get_desc_trans_len(desc); i++) {
+		wait_on_buffer(log_blocks[i]);
+		if (!buffer_uptodate(log_blocks[i])) {
+			reiserfs_warning(p_s_sb,
+					 "journal-1212: REPLAY FAILURE fsck required! buffer write failed");
+			brelse_array(log_blocks + i,
+				     get_desc_trans_len(desc) - i);
+			brelse_array(real_blocks, get_desc_trans_len(desc));
+			brelse(c_bh);
+			brelse(d_bh);
+			reiserfs_kfree(log_blocks,
+				       get_desc_trans_len(desc) *
+				       sizeof(struct buffer_head *), p_s_sb);
+			reiserfs_kfree(real_blocks,
+				       get_desc_trans_len(desc) *
+				       sizeof(struct buffer_head *), p_s_sb);
+			return -1;
+		}
+		memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data,
+		       real_blocks[i]->b_size);
+		set_buffer_uptodate(real_blocks[i]);
+		brelse(log_blocks[i]);
+	}
+	/* flush out the real blocks */
+	for (i = 0; i < get_desc_trans_len(desc); i++) {
+		set_buffer_dirty(real_blocks[i]);
+		ll_rw_block(WRITE, 1, real_blocks + i);
+	}
+	for (i = 0; i < get_desc_trans_len(desc); i++) {
+		wait_on_buffer(real_blocks[i]);
+		if (!buffer_uptodate(real_blocks[i])) {
+			reiserfs_warning(p_s_sb,
+					 "journal-1226: REPLAY FAILURE, fsck required! buffer write failed");
+			brelse_array(real_blocks + i,
+				     get_desc_trans_len(desc) - i);
+			brelse(c_bh);
+			brelse(d_bh);
+			reiserfs_kfree(log_blocks,
+				       get_desc_trans_len(desc) *
+				       sizeof(struct buffer_head *), p_s_sb);
+			reiserfs_kfree(real_blocks,
+				       get_desc_trans_len(desc) *
+				       sizeof(struct buffer_head *), p_s_sb);
+			return -1;
+		}
+		brelse(real_blocks[i]);
+	}
+	cur_dblock =
+	    SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+	    ((trans_offset + get_desc_trans_len(desc) +
+	      2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb));
+	reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+		       "journal-1095: setting journal " "start to offset %ld",
+		       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb));
+
+	/* init starting values for the first transaction, in case this is the last transaction to be replayed. */
+	journal->j_start = cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb);
+	journal->j_last_flush_trans_id = trans_id;
+	journal->j_trans_id = trans_id + 1;
+	brelse(c_bh);
+	brelse(d_bh);
+	reiserfs_kfree(log_blocks,
+		       le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *),
+		       p_s_sb);
+	reiserfs_kfree(real_blocks,
+		       le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *),
+		       p_s_sb);
+	return 0;
 }
 
 /* This function reads blocks starting from block and to max_block of bufsize
@@ -2024,39 +2222,39 @@ abort_replay:
    Right now it is only used from journal code. But later we might use it
    from other places.
    Note: Do not use journal_getblk/sb_getblk functions here! */
-static struct buffer_head * reiserfs_breada (struct block_device *dev, int block, int bufsize,
-			    unsigned int max_block)
+static struct buffer_head *reiserfs_breada(struct block_device *dev, int block,
+					   int bufsize, unsigned int max_block)
 {
-	struct buffer_head * bhlist[BUFNR];
+	struct buffer_head *bhlist[BUFNR];
 	unsigned int blocks = BUFNR;
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	int i, j;
-	
-	bh = __getblk (dev, block, bufsize );
-	if (buffer_uptodate (bh))
-		return (bh);   
-		
+
+	bh = __getblk(dev, block, bufsize);
+	if (buffer_uptodate(bh))
+		return (bh);
+
 	if (block + BUFNR > max_block) {
 		blocks = max_block - block;
 	}
 	bhlist[0] = bh;
 	j = 1;
 	for (i = 1; i < blocks; i++) {
-		bh = __getblk (dev, block + i, bufsize);
-		if (buffer_uptodate (bh)) {
-			brelse (bh);
+		bh = __getblk(dev, block + i, bufsize);
+		if (buffer_uptodate(bh)) {
+			brelse(bh);
 			break;
-		}
-		else bhlist[j++] = bh;
+		} else
+			bhlist[j++] = bh;
 	}
-	ll_rw_block (READ, j, bhlist);
-	for(i = 1; i < j; i++) 
-		brelse (bhlist[i]);
+	ll_rw_block(READ, j, bhlist);
+	for (i = 1; i < j; i++)
+		brelse(bhlist[i]);
 	bh = bhlist[0];
-	wait_on_buffer (bh);
-	if (buffer_uptodate (bh))
+	wait_on_buffer(bh);
+	if (buffer_uptodate(bh))
 		return bh;
-	brelse (bh);
+	brelse(bh);
 	return NULL;
 }
 
@@ -2069,218 +2267,250 @@ static struct buffer_head * reiserfs_breada (struct block_device *dev, int block
 **
 ** On exit, it sets things up so the first transaction will work correctly.
 */
-static int journal_read(struct super_block *p_s_sb) {
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-  struct reiserfs_journal_desc *desc ;
-  unsigned long oldest_trans_id = 0;
-  unsigned long oldest_invalid_trans_id = 0 ;
-  time_t start ;
-  unsigned long oldest_start = 0;
-  unsigned long cur_dblock = 0 ;
-  unsigned long newest_mount_id = 9 ;
-  struct buffer_head *d_bh ;
-  struct reiserfs_journal_header *jh ;
-  int valid_journal_header = 0 ;
-  int replay_count = 0 ;
-  int continue_replay = 1 ;
-  int ret ;
-  char b[BDEVNAME_SIZE];
-
-  cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ;
-  reiserfs_info (p_s_sb, "checking transaction log (%s)\n",
-	 bdevname(journal->j_dev_bd, b));
-  start = get_seconds();
-
-  /* step 1, read in the journal header block.  Check the transaction it says 
-  ** is the first unflushed, and if that transaction is not valid, 
-  ** replay is done
-  */
-  journal->j_header_bh = journal_bread(p_s_sb,
-					   SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 
-					   SB_ONDISK_JOURNAL_SIZE(p_s_sb));
-  if (!journal->j_header_bh) {
-    return 1 ;
-  }
-  jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data) ;
-  if (le32_to_cpu(jh->j_first_unflushed_offset) >= 0 && 
-      le32_to_cpu(jh->j_first_unflushed_offset) < SB_ONDISK_JOURNAL_SIZE(p_s_sb) && 
-      le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
-    oldest_start = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 
-                       le32_to_cpu(jh->j_first_unflushed_offset) ;
-    oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
-    newest_mount_id = le32_to_cpu(jh->j_mount_id);
-    reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1153: found in "
-                   "header: first_unflushed_offset %d, last_flushed_trans_id "
-		   "%lu", le32_to_cpu(jh->j_first_unflushed_offset),
-		   le32_to_cpu(jh->j_last_flush_trans_id)) ;
-    valid_journal_header = 1 ;
-
-    /* now, we try to read the first unflushed offset.  If it is not valid, 
-    ** there is nothing more we can do, and it makes no sense to read 
-    ** through the whole log.
-    */
-    d_bh = journal_bread(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + le32_to_cpu(jh->j_first_unflushed_offset)) ;
-    ret = journal_transaction_is_valid(p_s_sb, d_bh, NULL, NULL) ;
-    if (!ret) {
-      continue_replay = 0 ;
-    }
-    brelse(d_bh) ;
-    goto start_log_replay;
-  }
-
-  if (continue_replay && bdev_read_only(p_s_sb->s_bdev)) {
-    reiserfs_warning (p_s_sb,
-		      "clm-2076: device is readonly, unable to replay log") ;
-    return -1 ;
-  }
-
-  /* ok, there are transactions that need to be replayed.  start with the first log block, find
-  ** all the valid transactions, and pick out the oldest.
-  */
-  while(continue_replay && cur_dblock < (SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb))) {
-    /* Note that it is required for blocksize of primary fs device and journal
-       device to be the same */
-    d_bh = reiserfs_breada(journal->j_dev_bd, cur_dblock, p_s_sb->s_blocksize,
-			   SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb)) ;
-    ret = journal_transaction_is_valid(p_s_sb, d_bh, &oldest_invalid_trans_id, &newest_mount_id) ;
-    if (ret == 1) {
-      desc = (struct reiserfs_journal_desc *)d_bh->b_data ;
-      if (oldest_start == 0) { /* init all oldest_ values */
-        oldest_trans_id = get_desc_trans_id(desc) ;
-	oldest_start = d_bh->b_blocknr ;
-	newest_mount_id = get_desc_mount_id(desc) ;
-	reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1179: Setting "
-	               "oldest_start to offset %llu, trans_id %lu",
-		       oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
-		       oldest_trans_id) ;
-      } else if (oldest_trans_id > get_desc_trans_id(desc)) { 
-        /* one we just read was older */
-        oldest_trans_id = get_desc_trans_id(desc) ;
-	oldest_start = d_bh->b_blocknr ;
-	reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1180: Resetting "
-	               "oldest_start to offset %lu, trans_id %lu",
-			oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
-			oldest_trans_id) ;
-      }
-      if (newest_mount_id < get_desc_mount_id(desc)) {
-        newest_mount_id = get_desc_mount_id(desc) ;
+static int journal_read(struct super_block *p_s_sb)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal_desc *desc;
+	unsigned long oldest_trans_id = 0;
+	unsigned long oldest_invalid_trans_id = 0;
+	time_t start;
+	unsigned long oldest_start = 0;
+	unsigned long cur_dblock = 0;
+	unsigned long newest_mount_id = 9;
+	struct buffer_head *d_bh;
+	struct reiserfs_journal_header *jh;
+	int valid_journal_header = 0;
+	int replay_count = 0;
+	int continue_replay = 1;
+	int ret;
+	char b[BDEVNAME_SIZE];
+
+	cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb);
+	reiserfs_info(p_s_sb, "checking transaction log (%s)\n",
+		      bdevname(journal->j_dev_bd, b));
+	start = get_seconds();
+
+	/* step 1, read in the journal header block.  Check the transaction it says 
+	 ** is the first unflushed, and if that transaction is not valid, 
+	 ** replay is done
+	 */
+	journal->j_header_bh = journal_bread(p_s_sb,
+					     SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb)
+					     + SB_ONDISK_JOURNAL_SIZE(p_s_sb));
+	if (!journal->j_header_bh) {
+		return 1;
+	}
+	jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data);
+	if (le32_to_cpu(jh->j_first_unflushed_offset) >= 0 &&
+	    le32_to_cpu(jh->j_first_unflushed_offset) <
+	    SB_ONDISK_JOURNAL_SIZE(p_s_sb)
+	    && le32_to_cpu(jh->j_last_flush_trans_id) > 0) {
+		oldest_start =
+		    SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+		    le32_to_cpu(jh->j_first_unflushed_offset);
+		oldest_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
+		newest_mount_id = le32_to_cpu(jh->j_mount_id);
+		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+			       "journal-1153: found in "
+			       "header: first_unflushed_offset %d, last_flushed_trans_id "
+			       "%lu", le32_to_cpu(jh->j_first_unflushed_offset),
+			       le32_to_cpu(jh->j_last_flush_trans_id));
+		valid_journal_header = 1;
+
+		/* now, we try to read the first unflushed offset.  If it is not valid, 
+		 ** there is nothing more we can do, and it makes no sense to read 
+		 ** through the whole log.
+		 */
+		d_bh =
+		    journal_bread(p_s_sb,
+				  SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+				  le32_to_cpu(jh->j_first_unflushed_offset));
+		ret = journal_transaction_is_valid(p_s_sb, d_bh, NULL, NULL);
+		if (!ret) {
+			continue_replay = 0;
+		}
+		brelse(d_bh);
+		goto start_log_replay;
+	}
+
+	if (continue_replay && bdev_read_only(p_s_sb->s_bdev)) {
+		reiserfs_warning(p_s_sb,
+				 "clm-2076: device is readonly, unable to replay log");
+		return -1;
+	}
+
+	/* ok, there are transactions that need to be replayed.  start with the first log block, find
+	 ** all the valid transactions, and pick out the oldest.
+	 */
+	while (continue_replay
+	       && cur_dblock <
+	       (SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+		SB_ONDISK_JOURNAL_SIZE(p_s_sb))) {
+		/* Note: the primary fs device and the journal device are
+		   required to have the same blocksize */
+		d_bh =
+		    reiserfs_breada(journal->j_dev_bd, cur_dblock,
+				    p_s_sb->s_blocksize,
+				    SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+				    SB_ONDISK_JOURNAL_SIZE(p_s_sb));
+		ret =
+		    journal_transaction_is_valid(p_s_sb, d_bh,
+						 &oldest_invalid_trans_id,
+						 &newest_mount_id);
+		if (ret == 1) {
+			desc = (struct reiserfs_journal_desc *)d_bh->b_data;
+			if (oldest_start == 0) {	/* init all oldest_ values */
+				oldest_trans_id = get_desc_trans_id(desc);
+				oldest_start = d_bh->b_blocknr;
+				newest_mount_id = get_desc_mount_id(desc);
+				reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+					       "journal-1179: Setting "
+					       "oldest_start to offset %llu, trans_id %lu",
+					       oldest_start -
+					       SB_ONDISK_JOURNAL_1st_BLOCK
+					       (p_s_sb), oldest_trans_id);
+			} else if (oldest_trans_id > get_desc_trans_id(desc)) {
+				/* one we just read was older */
+				oldest_trans_id = get_desc_trans_id(desc);
+				oldest_start = d_bh->b_blocknr;
+				reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+					       "journal-1180: Resetting "
+					       "oldest_start to offset %lu, trans_id %lu",
+					       oldest_start -
+					       SB_ONDISK_JOURNAL_1st_BLOCK
+					       (p_s_sb), oldest_trans_id);
+			}
+			if (newest_mount_id < get_desc_mount_id(desc)) {
+				newest_mount_id = get_desc_mount_id(desc);
+				reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+					       "journal-1299: Setting "
+					       "newest_mount_id to %d",
+					       get_desc_mount_id(desc));
+			}
+			cur_dblock += get_desc_trans_len(desc) + 2;
+		} else {
+			cur_dblock++;
+		}
+		brelse(d_bh);
+	}
+
+      start_log_replay:
+	cur_dblock = oldest_start;
+	if (oldest_trans_id) {
+		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+			       "journal-1206: Starting replay "
+			       "from offset %llu, trans_id %lu",
+			       cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+			       oldest_trans_id);
+
+	}
+	replay_count = 0;
+	while (continue_replay && oldest_trans_id > 0) {
+		ret =
+		    journal_read_transaction(p_s_sb, cur_dblock, oldest_start,
+					     oldest_trans_id, newest_mount_id);
+		if (ret < 0) {
+			return ret;
+		} else if (ret != 0) {
+			break;
+		}
+		cur_dblock =
+		    SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start;
+		replay_count++;
+		if (cur_dblock == oldest_start)
+			break;
+	}
+
+	if (oldest_trans_id == 0) {
+		reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+			       "journal-1225: No valid " "transactions found");
+	}
+	/* j_start does not get set correctly if we don't replay any transactions.
+	 ** if we had a valid journal_header, set j_start to the first unflushed transaction value,
+	 ** copy the trans_id from the header
+	 */
+	if (valid_journal_header && replay_count == 0) {
+		journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset);
+		journal->j_trans_id =
+		    le32_to_cpu(jh->j_last_flush_trans_id) + 1;
+		journal->j_last_flush_trans_id =
+		    le32_to_cpu(jh->j_last_flush_trans_id);
+		journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1;
+	} else {
+		journal->j_mount_id = newest_mount_id + 1;
+	}
 	reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
-	              "newest_mount_id to %d", get_desc_mount_id(desc));
-      }
-      cur_dblock += get_desc_trans_len(desc) + 2 ;
-    } else {
-      cur_dblock++ ;
-    }
-    brelse(d_bh) ;
-  }
-
-start_log_replay:
-  cur_dblock = oldest_start ;
-  if (oldest_trans_id)  {
-    reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1206: Starting replay "
-                   "from offset %llu, trans_id %lu",
-		   cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
-		   oldest_trans_id) ;
-
-  }
-  replay_count = 0 ;
-  while(continue_replay && oldest_trans_id > 0) {
-    ret = journal_read_transaction(p_s_sb, cur_dblock, oldest_start, oldest_trans_id, newest_mount_id) ;
-    if (ret < 0) {
-      return ret ;
-    } else if (ret != 0) {
-      break ;
-    }
-    cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start ;
-    replay_count++ ;
-   if (cur_dblock == oldest_start)
-        break;
-  }
-
-  if (oldest_trans_id == 0) {
-    reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1225: No valid "
-                   "transactions found") ;
-  }
-  /* j_start does not get set correctly if we don't replay any transactions.
-  ** if we had a valid journal_header, set j_start to the first unflushed transaction value,
-  ** copy the trans_id from the header
-  */
-  if (valid_journal_header && replay_count == 0) { 
-    journal->j_start = le32_to_cpu(jh->j_first_unflushed_offset) ;
-    journal->j_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1;
-    journal->j_last_flush_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) ;
-    journal->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1;
-  } else {
-    journal->j_mount_id = newest_mount_id + 1 ;
-  }
-  reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting "
-                 "newest_mount_id to %lu", journal->j_mount_id) ;
-  journal->j_first_unflushed_offset = journal->j_start ;
-  if (replay_count > 0) {
-    reiserfs_info (p_s_sb, "replayed %d transactions in %lu seconds\n",
-		   replay_count, get_seconds() - start) ;
-  }
-  if (!bdev_read_only(p_s_sb->s_bdev) && 
-       _update_journal_header_block(p_s_sb, journal->j_start,
-                                   journal->j_last_flush_trans_id))
-  {
-      /* replay failed, caller must call free_journal_ram and abort
-      ** the mount
-      */
-      return -1 ;
-  }
-  return 0 ;
+		       "newest_mount_id to %lu", journal->j_mount_id);
+	journal->j_first_unflushed_offset = journal->j_start;
+	if (replay_count > 0) {
+		reiserfs_info(p_s_sb,
+			      "replayed %d transactions in %lu seconds\n",
+			      replay_count, get_seconds() - start);
+	}
+	if (!bdev_read_only(p_s_sb->s_bdev) &&
+	    _update_journal_header_block(p_s_sb, journal->j_start,
+					 journal->j_last_flush_trans_id)) {
+		/* replay failed, caller must call free_journal_ram and abort
+		 ** the mount
+		 */
+		return -1;
+	}
+	return 0;
 }
 
 static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
 {
-    struct reiserfs_journal_list *jl;
-retry:
-    jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, s);
-    if (!jl) {
-	yield();
-	goto retry;
-    }
-    memset(jl, 0, sizeof(*jl));
-    INIT_LIST_HEAD(&jl->j_list);
-    INIT_LIST_HEAD(&jl->j_working_list);
-    INIT_LIST_HEAD(&jl->j_tail_bh_list);
-    INIT_LIST_HEAD(&jl->j_bh_list);
-    sema_init(&jl->j_commit_lock, 1);
-    SB_JOURNAL(s)->j_num_lists++;
-    get_journal_list(jl);
-    return jl;
-}
-
-static void journal_list_init(struct super_block *p_s_sb) {
-    SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb);
-}
-
-static int release_journal_dev( struct super_block *super,
-				struct reiserfs_journal *journal )
-{
-    int result;
-    
-    result = 0;
-
-    if( journal -> j_dev_file != NULL ) {
-	result = filp_close( journal -> j_dev_file, NULL );
-	journal -> j_dev_file = NULL;
-	journal -> j_dev_bd = NULL;
-    } else if( journal -> j_dev_bd != NULL ) {
-	result = blkdev_put( journal -> j_dev_bd );
-	journal -> j_dev_bd = NULL;
-    }
-
-    if( result != 0 ) {
-	reiserfs_warning(super, "sh-457: release_journal_dev: Cannot release journal device: %i", result );
-    }
-    return result;
-}
-
-static int journal_init_dev( struct super_block *super, 
-			     struct reiserfs_journal *journal, 
-			     const char *jdev_name )
+	struct reiserfs_journal_list *jl;
+      retry:
+	jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS,
+			      s);
+	if (!jl) {
+		yield();
+		goto retry;
+	}
+	memset(jl, 0, sizeof(*jl));
+	INIT_LIST_HEAD(&jl->j_list);
+	INIT_LIST_HEAD(&jl->j_working_list);
+	INIT_LIST_HEAD(&jl->j_tail_bh_list);
+	INIT_LIST_HEAD(&jl->j_bh_list);
+	sema_init(&jl->j_commit_lock, 1);
+	SB_JOURNAL(s)->j_num_lists++;
+	get_journal_list(jl);
+	return jl;
+}
+
+static void journal_list_init(struct super_block *p_s_sb)
+{
+	SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb);
+}
+
+static int release_journal_dev(struct super_block *super,
+			       struct reiserfs_journal *journal)
+{
+	int result;
+
+	result = 0;
+
+	if (journal->j_dev_file != NULL) {
+		result = filp_close(journal->j_dev_file, NULL);
+		journal->j_dev_file = NULL;
+		journal->j_dev_bd = NULL;
+	} else if (journal->j_dev_bd != NULL) {
+		result = blkdev_put(journal->j_dev_bd);
+		journal->j_dev_bd = NULL;
+	}
+
+	if (result != 0) {
+		reiserfs_warning(super,
+				 "sh-457: release_journal_dev: Cannot release journal device: %i",
+				 result);
+	}
+	return result;
+}
+
+static int journal_init_dev(struct super_block *super,
+			    struct reiserfs_journal *journal,
+			    const char *jdev_name)
 {
 	int result;
 	dev_t jdev;
@@ -2289,50 +2519,51 @@ static int journal_init_dev( struct super_block *super,
 
 	result = 0;
 
-	journal -> j_dev_bd = NULL;
-	journal -> j_dev_file = NULL;
-	jdev = SB_ONDISK_JOURNAL_DEVICE( super ) ?
-		new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;	
+	journal->j_dev_bd = NULL;
+	journal->j_dev_file = NULL;
+	jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
+	    new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
 
 	if (bdev_read_only(super->s_bdev))
-	    blkdev_mode = FMODE_READ;
+		blkdev_mode = FMODE_READ;
 
 	/* there is no "jdev" option and journal is on separate device */
-	if( ( !jdev_name || !jdev_name[ 0 ] ) ) {
+	if ((!jdev_name || !jdev_name[0])) {
 		journal->j_dev_bd = open_by_devnum(jdev, blkdev_mode);
 		if (IS_ERR(journal->j_dev_bd)) {
 			result = PTR_ERR(journal->j_dev_bd);
 			journal->j_dev_bd = NULL;
-			reiserfs_warning (super, "sh-458: journal_init_dev: "
-					  "cannot init journal device '%s': %i",
-					  __bdevname(jdev, b), result );
+			reiserfs_warning(super, "sh-458: journal_init_dev: "
+					 "cannot init journal device '%s': %i",
+					 __bdevname(jdev, b), result);
 			return result;
 		} else if (jdev != super->s_dev)
 			set_blocksize(journal->j_dev_bd, super->s_blocksize);
 		return 0;
 	}
 
-	journal -> j_dev_file = filp_open( jdev_name, 0, 0 );
-	if( !IS_ERR( journal -> j_dev_file ) ) {
+	journal->j_dev_file = filp_open(jdev_name, 0, 0);
+	if (!IS_ERR(journal->j_dev_file)) {
 		struct inode *jdev_inode = journal->j_dev_file->f_mapping->host;
-		if( !S_ISBLK( jdev_inode -> i_mode ) ) {
+		if (!S_ISBLK(jdev_inode->i_mode)) {
 			reiserfs_warning(super, "journal_init_dev: '%s' is "
-					 "not a block device", jdev_name );
+					 "not a block device", jdev_name);
 			result = -ENOTBLK;
-			release_journal_dev( super, journal );
-		} else  {
+			release_journal_dev(super, journal);
+		} else {
 			/* ok */
 			journal->j_dev_bd = I_BDEV(jdev_inode);
 			set_blocksize(journal->j_dev_bd, super->s_blocksize);
-			reiserfs_info(super, "journal_init_dev: journal device: %s\n",
+			reiserfs_info(super,
+				      "journal_init_dev: journal device: %s\n",
 				      bdevname(journal->j_dev_bd, b));
 		}
 	} else {
-		result = PTR_ERR( journal -> j_dev_file );
-		journal -> j_dev_file = NULL;
-		reiserfs_warning (super,
-				  "journal_init_dev: Cannot open '%s': %i",
-				  jdev_name, result );
+		result = PTR_ERR(journal->j_dev_file);
+		journal->j_dev_file = NULL;
+		reiserfs_warning(super,
+				 "journal_init_dev: Cannot open '%s': %i",
+				 jdev_name, result);
 	}
 	return result;
 }
@@ -2340,193 +2571,214 @@ static int journal_init_dev( struct super_block *super,
 /*
 ** must be called once on fs mount.  calls journal_read for you
 */
-int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_format, unsigned int commit_max_age) {
-    int num_cnodes = SB_ONDISK_JOURNAL_SIZE(p_s_sb) * 2 ;
-    struct buffer_head *bhjh;
-    struct reiserfs_super_block * rs;
-    struct reiserfs_journal_header *jh;
-    struct reiserfs_journal *journal;
-    struct reiserfs_journal_list *jl;
-    char b[BDEVNAME_SIZE];
-
-    journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof (struct reiserfs_journal)) ;
-    if (!journal) {
-	reiserfs_warning (p_s_sb, "journal-1256: unable to get memory for journal structure") ;
-	return 1 ;
-    }
-    memset(journal, 0, sizeof(struct reiserfs_journal)) ;
-    INIT_LIST_HEAD(&journal->j_bitmap_nodes) ;
-    INIT_LIST_HEAD (&journal->j_prealloc_list);
-    INIT_LIST_HEAD(&journal->j_working_list);
-    INIT_LIST_HEAD(&journal->j_journal_list);
-    journal->j_persistent_trans = 0;
-    if (reiserfs_allocate_list_bitmaps(p_s_sb,
-				       journal->j_list_bitmap,
- 				       SB_BMAP_NR(p_s_sb)))
-	goto free_and_return ;
-    allocate_bitmap_nodes(p_s_sb) ;
-
-    /* reserved for journal area support */
-    SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) = (old_format ?
-					     REISERFS_OLD_DISK_OFFSET_IN_BYTES / p_s_sb->s_blocksize +
-					     SB_BMAP_NR(p_s_sb) + 1 :
-					     REISERFS_DISK_OFFSET_IN_BYTES / p_s_sb->s_blocksize + 2); 
-    
-    /* Sanity check to see is the standard journal fitting withing first bitmap
-       (actual for small blocksizes) */
-    if ( !SB_ONDISK_JOURNAL_DEVICE( p_s_sb ) &&
-         (SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb) > p_s_sb->s_blocksize * 8) ) {
-	reiserfs_warning (p_s_sb, "journal-1393: journal does not fit for area "
-			  "addressed by first of bitmap blocks. It starts at "
-			  "%u and its size is %u. Block size %ld",
-			  SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb),
-			  SB_ONDISK_JOURNAL_SIZE(p_s_sb), p_s_sb->s_blocksize);
-	goto free_and_return;
-    }
-
-    if( journal_init_dev( p_s_sb, journal, j_dev_name ) != 0 ) {
-      reiserfs_warning (p_s_sb, "sh-462: unable to initialize jornal device");
-      goto free_and_return;
-    }
-
-     rs = SB_DISK_SUPER_BLOCK(p_s_sb);
-     
-     /* read journal header */
-     bhjh = journal_bread(p_s_sb,
-		   SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_ONDISK_JOURNAL_SIZE(p_s_sb));
-     if (!bhjh) {
-	 reiserfs_warning (p_s_sb, "sh-459: unable to read journal header");
-	 goto free_and_return;
-     }
-     jh = (struct reiserfs_journal_header *)(bhjh->b_data);
-     
-     /* make sure that journal matches to the super block */
-     if (is_reiserfs_jr(rs) && (le32_to_cpu(jh->jh_journal.jp_journal_magic) != sb_jp_journal_magic(rs))) {
-	 reiserfs_warning (p_s_sb, "sh-460: journal header magic %x "
-			   "(device %s) does not match to magic found in super "
-			   "block %x",
-			   jh->jh_journal.jp_journal_magic,
-			   bdevname( journal->j_dev_bd, b),
-			   sb_jp_journal_magic(rs));
-	 brelse (bhjh);
-	 goto free_and_return;
-  }
-     
-  journal->j_trans_max      = le32_to_cpu (jh->jh_journal.jp_journal_trans_max);
-  journal->j_max_batch      = le32_to_cpu (jh->jh_journal.jp_journal_max_batch);
-  journal->j_max_commit_age = le32_to_cpu (jh->jh_journal.jp_journal_max_commit_age);
-  journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
-
-  if (journal->j_trans_max) {
-    /* make sure these parameters are available, assign it if they are not */
-    __u32 initial = journal->j_trans_max;
-    __u32 ratio = 1;
-    
-    if (p_s_sb->s_blocksize < 4096)
-      ratio = 4096 / p_s_sb->s_blocksize;
-    
-    if (SB_ONDISK_JOURNAL_SIZE(p_s_sb)/journal->j_trans_max < JOURNAL_MIN_RATIO)
-      journal->j_trans_max = SB_ONDISK_JOURNAL_SIZE(p_s_sb) / JOURNAL_MIN_RATIO;
-    if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio)
-      journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT / ratio;
-    if (journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio)
-      journal->j_trans_max = JOURNAL_TRANS_MIN_DEFAULT / ratio;
-    
-    if (journal->j_trans_max != initial)
-      reiserfs_warning (p_s_sb, "sh-461: journal_init: wrong transaction max size (%u). Changed to %u",
-	      initial, journal->j_trans_max);
-
-    journal->j_max_batch = journal->j_trans_max*
-      JOURNAL_MAX_BATCH_DEFAULT/JOURNAL_TRANS_MAX_DEFAULT;
-  }  
-  
-  if (!journal->j_trans_max) {
-    /*we have the file system was created by old version of mkreiserfs 
-      so this field contains zero value */
-    journal->j_trans_max      = JOURNAL_TRANS_MAX_DEFAULT ;
-    journal->j_max_batch      = JOURNAL_MAX_BATCH_DEFAULT ;
-    journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE ;
-    
-    /* for blocksize >= 4096 - max transaction size is 1024. For block size < 4096
-       trans max size is decreased proportionally */
-    if (p_s_sb->s_blocksize < 4096) {
-      journal->j_trans_max /= (4096 / p_s_sb->s_blocksize) ;
-      journal->j_max_batch = (journal->j_trans_max) * 9 / 10 ;
-    }
-  }
-
-  journal->j_default_max_commit_age = journal->j_max_commit_age;
-
-  if (commit_max_age != 0) {
-      journal->j_max_commit_age = commit_max_age;
-      journal->j_max_trans_age = commit_max_age;
-  }
-
-  reiserfs_info (p_s_sb, "journal params: device %s, size %u, "
-		 "journal first block %u, max trans len %u, max batch %u, "
-		 "max commit age %u, max trans age %u\n",
-		 bdevname( journal->j_dev_bd, b),
-		 SB_ONDISK_JOURNAL_SIZE(p_s_sb),
-		 SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
-		 journal->j_trans_max,
-		 journal->j_max_batch,
-		 journal->j_max_commit_age,
-		 journal->j_max_trans_age);
-
-  brelse (bhjh);
-     
-  journal->j_list_bitmap_index = 0 ;
-  journal_list_init(p_s_sb) ;
-
-  memset(journal->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
-
-  INIT_LIST_HEAD(&journal->j_dirty_buffers) ;
-  spin_lock_init(&journal->j_dirty_buffers_lock) ;
-
-  journal->j_start = 0 ;
-  journal->j_len = 0 ;
-  journal->j_len_alloc = 0 ;
-  atomic_set(&(journal->j_wcount), 0) ;
-  atomic_set(&(journal->j_async_throttle), 0) ;
-  journal->j_bcount = 0 ;
-  journal->j_trans_start_time = 0 ;
-  journal->j_last = NULL ;
-  journal->j_first = NULL ;
-  init_waitqueue_head(&(journal->j_join_wait)) ;
-  sema_init(&journal->j_lock, 1);
-  sema_init(&journal->j_flush_sem, 1);
-
-  journal->j_trans_id = 10 ;
-  journal->j_mount_id = 10 ;
-  journal->j_state = 0 ;
-  atomic_set(&(journal->j_jlock), 0) ;
-  journal->j_cnode_free_list = allocate_cnodes(num_cnodes) ;
-  journal->j_cnode_free_orig = journal->j_cnode_free_list ;
-  journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0 ;
-  journal->j_cnode_used = 0 ;
-  journal->j_must_wait = 0 ;
-
-  init_journal_hash(p_s_sb) ;
-  jl = journal->j_current_jl;
-  jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl);
-  if (!jl->j_list_bitmap) {
-    reiserfs_warning(p_s_sb, "journal-2005, get_list_bitmap failed for journal list 0") ;
-    goto free_and_return;
-  }
-  if (journal_read(p_s_sb) < 0) {
-    reiserfs_warning(p_s_sb, "Replay Failure, unable to mount") ;
-    goto free_and_return;
-  }
-
-  reiserfs_mounted_fs_count++ ;
-  if (reiserfs_mounted_fs_count <= 1)
-    commit_wq = create_workqueue("reiserfs");
-
-  INIT_WORK(&journal->j_work, flush_async_commits, p_s_sb);
-  return 0 ;
-free_and_return:
-  free_journal_ram(p_s_sb);
-  return 1;
+int journal_init(struct super_block *p_s_sb, const char *j_dev_name,
+		 int old_format, unsigned int commit_max_age)
+{
+	int num_cnodes = SB_ONDISK_JOURNAL_SIZE(p_s_sb) * 2;
+	struct buffer_head *bhjh;
+	struct reiserfs_super_block *rs;
+	struct reiserfs_journal_header *jh;
+	struct reiserfs_journal *journal;
+	struct reiserfs_journal_list *jl;
+	char b[BDEVNAME_SIZE];
+
+	journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof(struct reiserfs_journal));
+	if (!journal) {
+		reiserfs_warning(p_s_sb,
+				 "journal-1256: unable to get memory for journal structure");
+		return 1;
+	}
+	memset(journal, 0, sizeof(struct reiserfs_journal));
+	INIT_LIST_HEAD(&journal->j_bitmap_nodes);
+	INIT_LIST_HEAD(&journal->j_prealloc_list);
+	INIT_LIST_HEAD(&journal->j_working_list);
+	INIT_LIST_HEAD(&journal->j_journal_list);
+	journal->j_persistent_trans = 0;
+	if (reiserfs_allocate_list_bitmaps(p_s_sb,
+					   journal->j_list_bitmap,
+					   SB_BMAP_NR(p_s_sb)))
+		goto free_and_return;
+	allocate_bitmap_nodes(p_s_sb);
+
+	/* reserved for journal area support */
+	SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) = (old_format ?
+						 REISERFS_OLD_DISK_OFFSET_IN_BYTES
+						 / p_s_sb->s_blocksize +
+						 SB_BMAP_NR(p_s_sb) +
+						 1 :
+						 REISERFS_DISK_OFFSET_IN_BYTES /
+						 p_s_sb->s_blocksize + 2);
+
+	/* Sanity check: make sure the standard journal fits within the first
+	   bitmap (relevant for small blocksizes) */
+	if (!SB_ONDISK_JOURNAL_DEVICE(p_s_sb) &&
+	    (SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb) +
+	     SB_ONDISK_JOURNAL_SIZE(p_s_sb) > p_s_sb->s_blocksize * 8)) {
+		reiserfs_warning(p_s_sb,
+				 "journal-1393: journal does not fit for area "
+				 "addressed by first of bitmap blocks. It starts at "
+				 "%u and its size is %u. Block size %ld",
+				 SB_JOURNAL_1st_RESERVED_BLOCK(p_s_sb),
+				 SB_ONDISK_JOURNAL_SIZE(p_s_sb),
+				 p_s_sb->s_blocksize);
+		goto free_and_return;
+	}
+
+	if (journal_init_dev(p_s_sb, journal, j_dev_name) != 0) {
+		reiserfs_warning(p_s_sb,
+				 "sh-462: unable to initialize jornal device");
+		goto free_and_return;
+	}
+
+	rs = SB_DISK_SUPER_BLOCK(p_s_sb);
+
+	/* read journal header */
+	bhjh = journal_bread(p_s_sb,
+			     SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+			     SB_ONDISK_JOURNAL_SIZE(p_s_sb));
+	if (!bhjh) {
+		reiserfs_warning(p_s_sb,
+				 "sh-459: unable to read journal header");
+		goto free_and_return;
+	}
+	jh = (struct reiserfs_journal_header *)(bhjh->b_data);
+
+	/* make sure that journal matches to the super block */
+	if (is_reiserfs_jr(rs)
+	    && (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
+		sb_jp_journal_magic(rs))) {
+		reiserfs_warning(p_s_sb,
+				 "sh-460: journal header magic %x "
+				 "(device %s) does not match to magic found in super "
+				 "block %x", jh->jh_journal.jp_journal_magic,
+				 bdevname(journal->j_dev_bd, b),
+				 sb_jp_journal_magic(rs));
+		brelse(bhjh);
+		goto free_and_return;
+	}
+
+	journal->j_trans_max = le32_to_cpu(jh->jh_journal.jp_journal_trans_max);
+	journal->j_max_batch = le32_to_cpu(jh->jh_journal.jp_journal_max_batch);
+	journal->j_max_commit_age =
+	    le32_to_cpu(jh->jh_journal.jp_journal_max_commit_age);
+	journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
+
+	if (journal->j_trans_max) {
+		/* make sure these parameters are available, assign it if they are not */
+		__u32 initial = journal->j_trans_max;
+		__u32 ratio = 1;
+
+		if (p_s_sb->s_blocksize < 4096)
+			ratio = 4096 / p_s_sb->s_blocksize;
+
+		if (SB_ONDISK_JOURNAL_SIZE(p_s_sb) / journal->j_trans_max <
+		    JOURNAL_MIN_RATIO)
+			journal->j_trans_max =
+			    SB_ONDISK_JOURNAL_SIZE(p_s_sb) / JOURNAL_MIN_RATIO;
+		if (journal->j_trans_max > JOURNAL_TRANS_MAX_DEFAULT / ratio)
+			journal->j_trans_max =
+			    JOURNAL_TRANS_MAX_DEFAULT / ratio;
+		if (journal->j_trans_max < JOURNAL_TRANS_MIN_DEFAULT / ratio)
+			journal->j_trans_max =
+			    JOURNAL_TRANS_MIN_DEFAULT / ratio;
+
+		if (journal->j_trans_max != initial)
+			reiserfs_warning(p_s_sb,
+					 "sh-461: journal_init: wrong transaction max size (%u). Changed to %u",
+					 initial, journal->j_trans_max);
+
+		journal->j_max_batch = journal->j_trans_max *
+		    JOURNAL_MAX_BATCH_DEFAULT / JOURNAL_TRANS_MAX_DEFAULT;
+	}
+
+	if (!journal->j_trans_max) {
+		/* the file system was created by an old version of mkreiserfs,
+		   so this field contains zero */
+		journal->j_trans_max = JOURNAL_TRANS_MAX_DEFAULT;
+		journal->j_max_batch = JOURNAL_MAX_BATCH_DEFAULT;
+		journal->j_max_commit_age = JOURNAL_MAX_COMMIT_AGE;
+
+		/* for blocksize >= 4096 - max transaction size is 1024. For block size < 4096
+		   trans max size is decreased proportionally */
+		if (p_s_sb->s_blocksize < 4096) {
+			journal->j_trans_max /= (4096 / p_s_sb->s_blocksize);
+			journal->j_max_batch = (journal->j_trans_max) * 9 / 10;
+		}
+	}
+
+	journal->j_default_max_commit_age = journal->j_max_commit_age;
+
+	if (commit_max_age != 0) {
+		journal->j_max_commit_age = commit_max_age;
+		journal->j_max_trans_age = commit_max_age;
+	}
+
+	reiserfs_info(p_s_sb, "journal params: device %s, size %u, "
+		      "journal first block %u, max trans len %u, max batch %u, "
+		      "max commit age %u, max trans age %u\n",
+		      bdevname(journal->j_dev_bd, b),
+		      SB_ONDISK_JOURNAL_SIZE(p_s_sb),
+		      SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
+		      journal->j_trans_max,
+		      journal->j_max_batch,
+		      journal->j_max_commit_age, journal->j_max_trans_age);
+
+	brelse(bhjh);
+
+	journal->j_list_bitmap_index = 0;
+	journal_list_init(p_s_sb);
+
+	memset(journal->j_list_hash_table, 0,
+	       JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *));
+
+	INIT_LIST_HEAD(&journal->j_dirty_buffers);
+	spin_lock_init(&journal->j_dirty_buffers_lock);
+
+	journal->j_start = 0;
+	journal->j_len = 0;
+	journal->j_len_alloc = 0;
+	atomic_set(&(journal->j_wcount), 0);
+	atomic_set(&(journal->j_async_throttle), 0);
+	journal->j_bcount = 0;
+	journal->j_trans_start_time = 0;
+	journal->j_last = NULL;
+	journal->j_first = NULL;
+	init_waitqueue_head(&(journal->j_join_wait));
+	sema_init(&journal->j_lock, 1);
+	sema_init(&journal->j_flush_sem, 1);
+
+	journal->j_trans_id = 10;
+	journal->j_mount_id = 10;
+	journal->j_state = 0;
+	atomic_set(&(journal->j_jlock), 0);
+	journal->j_cnode_free_list = allocate_cnodes(num_cnodes);
+	journal->j_cnode_free_orig = journal->j_cnode_free_list;
+	journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0;
+	journal->j_cnode_used = 0;
+	journal->j_must_wait = 0;
+
+	init_journal_hash(p_s_sb);
+	jl = journal->j_current_jl;
+	jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl);
+	if (!jl->j_list_bitmap) {
+		reiserfs_warning(p_s_sb,
+				 "journal-2005, get_list_bitmap failed for journal list 0");
+		goto free_and_return;
+	}
+	if (journal_read(p_s_sb) < 0) {
+		reiserfs_warning(p_s_sb, "Replay Failure, unable to mount");
+		goto free_and_return;
+	}
+
+	reiserfs_mounted_fs_count++;
+	if (reiserfs_mounted_fs_count <= 1)
+		commit_wq = create_workqueue("reiserfs");
+
+	INIT_WORK(&journal->j_work, flush_async_commits, p_s_sb);
+	return 0;
+      free_and_return:
+	free_journal_ram(p_s_sb);
+	return 1;
 }
 
 /*
@@ -2534,96 +2786,102 @@ free_and_return:
 ** be used by delete to make sure they don't write more than can fit inside a single
 ** transaction
 */
-int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) {
-  struct reiserfs_journal *journal = SB_JOURNAL (th->t_super);
-  time_t now = get_seconds() ;
-  /* cannot restart while nested */
-  BUG_ON (!th->t_trans_id);
-  if (th->t_refcount > 1)
-    return 0 ;
-  if ( journal->j_must_wait > 0 ||
-       (journal->j_len_alloc + new_alloc) >= journal->j_max_batch ||
-       atomic_read(&(journal->j_jlock)) ||
-      (now - journal->j_trans_start_time) > journal->j_max_trans_age ||
-       journal->j_cnode_free < (journal->j_trans_max * 3)) {
-    return 1 ;
-  }
-  return 0 ;
+int journal_transaction_should_end(struct reiserfs_transaction_handle *th,
+				   int new_alloc)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
+	time_t now = get_seconds();
+	/* a handle that carries no transaction id must never get here */
+	BUG_ON(!th->t_trans_id);
+	if (th->t_refcount > 1)	/* cannot restart while nested */
+		return 0;
+	if (journal->j_must_wait > 0 ||
+	    (journal->j_len_alloc + new_alloc) >= journal->j_max_batch ||
+	    atomic_read(&(journal->j_jlock)) ||
+	    (now - journal->j_trans_start_time) > journal->j_max_trans_age ||
+	    journal->j_cnode_free < (journal->j_trans_max * 3)) {
+		return 1;	/* must_wait set, batch full, j_jlock held, too old or few cnodes */
+	}
+	return 0;
 }
 
 /* this must be called inside a transaction, and requires the 
 ** kernel_lock to be held
 */
-void reiserfs_block_writes(struct reiserfs_transaction_handle *th) {
-    struct reiserfs_journal *journal = SB_JOURNAL (th->t_super);
-    BUG_ON (!th->t_trans_id);
-    journal->j_must_wait = 1 ;
-    set_bit(J_WRITERS_BLOCKED, &journal->j_state) ;
-    return ;
+void reiserfs_block_writes(struct reiserfs_transaction_handle *th)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(th->t_super);
+	BUG_ON(!th->t_trans_id);	/* handle must carry a trans id */
+	journal->j_must_wait = 1;	/* tested by journal_transaction_should_end() */
+	set_bit(J_WRITERS_BLOCKED, &journal->j_state);	/* tested by do_journal_begin_r() */
+	return;
 }
 
 /* this must be called without a transaction started, and does not
 ** require BKL
 */
-void reiserfs_allow_writes(struct super_block *s) {
-    struct reiserfs_journal *journal = SB_JOURNAL (s);
-    clear_bit(J_WRITERS_BLOCKED, &journal->j_state) ;
-    wake_up(&journal->j_join_wait) ;
+void reiserfs_allow_writes(struct super_block *s)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	clear_bit(J_WRITERS_BLOCKED, &journal->j_state);	/* set by reiserfs_block_writes(); j_must_wait is not touched */
+	wake_up(&journal->j_join_wait);	/* wake sleepers in reiserfs_wait_on_write_block() */
 }
 
 /* this must be called without a transaction started, and does not
 ** require BKL
 */
-void reiserfs_wait_on_write_block(struct super_block *s) {
-    struct reiserfs_journal *journal = SB_JOURNAL (s);
-    wait_event(journal->j_join_wait,
-               !test_bit(J_WRITERS_BLOCKED, &journal->j_state)) ;
-}
-
-static void queue_log_writer(struct super_block *s) {
-    wait_queue_t wait;
-    struct reiserfs_journal *journal = SB_JOURNAL (s);
-    set_bit(J_WRITERS_QUEUED, &journal->j_state);
-
-    /*
-     * we don't want to use wait_event here because
-     * we only want to wait once.
-     */
-    init_waitqueue_entry(&wait, current);
-    add_wait_queue(&journal->j_join_wait, &wait);
-    set_current_state(TASK_UNINTERRUPTIBLE);
-    if (test_bit(J_WRITERS_QUEUED, &journal->j_state))
-        schedule();
-    current->state = TASK_RUNNING;
-    remove_wait_queue(&journal->j_join_wait, &wait);
-}
-
-static void wake_queued_writers(struct super_block *s) {
-    struct reiserfs_journal *journal = SB_JOURNAL (s);
-    if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state))
-        wake_up(&journal->j_join_wait);
-}
-
-static void let_transaction_grow(struct super_block *sb,
-                                 unsigned long trans_id)
-{
-    struct reiserfs_journal *journal = SB_JOURNAL (sb);
-    unsigned long bcount = journal->j_bcount;
-    while(1) {
+void reiserfs_wait_on_write_block(struct super_block *s)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	wait_event(journal->j_join_wait,	/* queue woken by reiserfs_allow_writes() */
+		   !test_bit(J_WRITERS_BLOCKED, &journal->j_state));	/* sleep until the bit is clear */
+}
+
+static void queue_log_writer(struct super_block *s)
+{
+	wait_queue_t wait;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	set_bit(J_WRITERS_QUEUED, &journal->j_state);	/* cleared by wake_queued_writers() */
+
+	/*
+	 * we don't want to use wait_event here because
+	 * we only want to wait once.
+	 */
+	init_waitqueue_entry(&wait, current);
+	add_wait_queue(&journal->j_join_wait, &wait);
 	set_current_state(TASK_UNINTERRUPTIBLE);
-	schedule_timeout(1);
-	journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
-        while ((atomic_read(&journal->j_wcount) > 0 ||
-	        atomic_read(&journal->j_jlock)) &&
-	       journal->j_trans_id == trans_id) {
-	    queue_log_writer(sb);
+	if (test_bit(J_WRITERS_QUEUED, &journal->j_state))	/* skip the sleep if the bit is already clear */
+		schedule();
+	current->state = TASK_RUNNING;
+	remove_wait_queue(&journal->j_join_wait, &wait);
+}
+
+static void wake_queued_writers(struct super_block *s)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	if (test_and_clear_bit(J_WRITERS_QUEUED, &journal->j_state))	/* bit set by queue_log_writer() */
+		wake_up(&journal->j_join_wait);
+}
+
+static void let_transaction_grow(struct super_block *sb, unsigned long trans_id)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	unsigned long bcount = journal->j_bcount;	/* j_bcount is bumped in do_journal_begin_r() */
+	while (1) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(1);	/* sleep for one jiffy */
+		journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
+		while ((atomic_read(&journal->j_wcount) > 0 ||
+			atomic_read(&journal->j_jlock)) &&
+		       journal->j_trans_id == trans_id) {
+			queue_log_writer(sb);
+		}
+		if (journal->j_trans_id != trans_id)	/* a different transaction is current now */
+			break;
+		if (bcount == journal->j_bcount)	/* j_bcount did not change during this pass */
+			break;
+		bcount = journal->j_bcount;
 	}
-	if (journal->j_trans_id != trans_id)
-	    break;
-	if (bcount == journal->j_bcount)
-	    break;
-	bcount = journal->j_bcount;
-    }
 }
 
 /* join == true if you must join an existing transaction.
@@ -2632,224 +2890,244 @@ static void let_transaction_grow(struct super_block *sb,
 ** this will block until the transaction is joinable.  send the number of blocks you
 ** expect to use in nblocks.
 */
-static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,unsigned long nblocks,int join) {
-  time_t now = get_seconds() ;
-  int old_trans_id  ;
-  struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
-  struct reiserfs_transaction_handle myth;
-  int sched_count = 0;
-  int retval;
-
-  reiserfs_check_lock_depth(p_s_sb, "journal_begin") ;
-  if (nblocks > journal->j_trans_max)
-	BUG();
-
-  PROC_INFO_INC( p_s_sb, journal.journal_being );
-  /* set here for journal_join */
-  th->t_refcount = 1;
-  th->t_super = p_s_sb ;
-
-relock:
-  lock_journal(p_s_sb) ;
-  if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted (journal)) {
-    unlock_journal (p_s_sb);
-    retval = journal->j_errno;
-    goto out_fail;
-  }
-  journal->j_bcount++;
-
-  if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
-    unlock_journal(p_s_sb) ;
-    reiserfs_wait_on_write_block(p_s_sb) ;
-    PROC_INFO_INC( p_s_sb, journal.journal_relock_writers );
-    goto relock ;
-  }
-  now = get_seconds();
-
-  /* if there is no room in the journal OR
-  ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning 
-  ** we don't sleep if there aren't other writers
-  */
-
-  if ( (!join && journal->j_must_wait > 0) ||
-     ( !join && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch) ||
-     (!join && atomic_read(&journal->j_wcount) > 0 && journal->j_trans_start_time > 0 &&
-      (now - journal->j_trans_start_time) > journal->j_max_trans_age) ||
-     (!join && atomic_read(&journal->j_jlock)) ||
-     (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
-
-    old_trans_id = journal->j_trans_id;
-    unlock_journal(p_s_sb) ; /* allow others to finish this transaction */
-
-    if (!join && (journal->j_len_alloc + nblocks + 2) >=
-        journal->j_max_batch &&
-	((journal->j_len + nblocks + 2) * 100) < (journal->j_len_alloc * 75))
-    {
-	if (atomic_read(&journal->j_wcount) > 10) {
-	    sched_count++;
-	    queue_log_writer(p_s_sb);
-	    goto relock;
-	}
-    }
-    /* don't mess with joining the transaction if all we have to do is
-     * wait for someone else to do a commit
-     */
-    if (atomic_read(&journal->j_jlock)) {
-	while (journal->j_trans_id == old_trans_id &&
-	       atomic_read(&journal->j_jlock)) {
-	    queue_log_writer(p_s_sb);
-        }
-	goto relock;
-    }
-    retval = journal_join(&myth, p_s_sb, 1) ;
-    if (retval)
-        goto out_fail;
-
-    /* someone might have ended the transaction while we joined */
-    if (old_trans_id != journal->j_trans_id) {
-        retval = do_journal_end(&myth, p_s_sb, 1, 0) ;
-    } else {
-        retval = do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW) ;
-    }
-
-    if (retval)
-        goto out_fail;
-
-    PROC_INFO_INC( p_s_sb, journal.journal_relock_wcount );
-    goto relock ;
-  }
-  /* we are the first writer, set trans_id */
-  if (journal->j_trans_start_time == 0) {
-    journal->j_trans_start_time = get_seconds();
-  }
-  atomic_inc(&(journal->j_wcount)) ;
-  journal->j_len_alloc += nblocks ;
-  th->t_blocks_logged = 0 ;
-  th->t_blocks_allocated = nblocks ;
-  th->t_trans_id = journal->j_trans_id ;
-  unlock_journal(p_s_sb) ;
-  INIT_LIST_HEAD (&th->t_list);
-  get_fs_excl();
-  return 0 ;
-
-out_fail:
-  memset (th, 0, sizeof (*th));
-  /* Re-set th->t_super, so we can properly keep track of how many
-   * persistent transactions there are. We need to do this so if this
-   * call is part of a failed restart_transaction, we can free it later */
-  th->t_super = p_s_sb;
-  return retval;
-}
-
-struct reiserfs_transaction_handle *
-reiserfs_persistent_transaction(struct super_block *s, int nblocks) {
-    int ret ;
-    struct reiserfs_transaction_handle *th ;
-
-    /* if we're nesting into an existing transaction.  It will be
-    ** persistent on its own
-    */
-    if (reiserfs_transaction_running(s)) {
-        th = current->journal_info ;
-	th->t_refcount++ ;
-	if (th->t_refcount < 2) {
-	    BUG() ;
-	}
-	return th ;
-    }
-    th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS, s) ;
-    if (!th)
-       return NULL;
-    ret = journal_begin(th, s, nblocks) ;
-    if (ret) {
-	reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;
-        return NULL;
-    }
-
-    SB_JOURNAL(s)->j_persistent_trans++;
-    return th ;
-}
-
-int
-reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) {
-    struct super_block *s = th->t_super;
-    int ret = 0;
-    if (th->t_trans_id)
-        ret = journal_end(th, th->t_super, th->t_blocks_allocated);
-    else
-        ret = -EIO;
-    if (th->t_refcount == 0) {
-        SB_JOURNAL(s)->j_persistent_trans--;
-	reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;
-    }
-    return ret;
-}
-
-static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
-  struct reiserfs_transaction_handle *cur_th = current->journal_info;
-
-  /* this keeps do_journal_end from NULLing out the current->journal_info
-  ** pointer
-  */
-  th->t_handle_save = cur_th ;
-  if (cur_th && cur_th->t_refcount > 1) {
-      BUG() ;
-  }
-  return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_JOIN) ;
-}
-
-int journal_join_abort(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
-  struct reiserfs_transaction_handle *cur_th = current->journal_info;
-
-  /* this keeps do_journal_end from NULLing out the current->journal_info
-  ** pointer
-  */
-  th->t_handle_save = cur_th ;
-  if (cur_th && cur_th->t_refcount > 1) {
-      BUG() ;
-  }
-  return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_ABORT) ;
-}
-
-int journal_begin(struct reiserfs_transaction_handle *th, struct super_block  * p_s_sb, unsigned long nblocks) {
-    struct reiserfs_transaction_handle *cur_th = current->journal_info ;
-    int ret ;
-
-    th->t_handle_save = NULL ;
-    if (cur_th) {
-	/* we are nesting into the current transaction */
-	if (cur_th->t_super == p_s_sb) {
-              BUG_ON (!cur_th->t_refcount);
-	      cur_th->t_refcount++ ;
-	      memcpy(th, cur_th, sizeof(*th));
-	      if (th->t_refcount <= 1)
-		      reiserfs_warning (p_s_sb, "BAD: refcount <= 1, but journal_info != 0");
-	      return 0;
+static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
+			      struct super_block *p_s_sb, unsigned long nblocks,
+			      int join)
+{
+	time_t now = get_seconds();
+	int old_trans_id;
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_transaction_handle myth;
+	int sched_count = 0;	/* NOTE(review): only incremented in this function, never read */
+	int retval;
+
+	reiserfs_check_lock_depth(p_s_sb, "journal_begin");
+	if (nblocks > journal->j_trans_max)
+		BUG();
+
+	PROC_INFO_INC(p_s_sb, journal.journal_being);
+	/* set here for journal_join */
+	th->t_refcount = 1;
+	th->t_super = p_s_sb;
+
+      relock:
+	lock_journal(p_s_sb);
+	if (join != JBEGIN_ABORT && reiserfs_is_journal_aborted(journal)) {
+		unlock_journal(p_s_sb);
+		retval = journal->j_errno;
+		goto out_fail;
+	}
+	journal->j_bcount++;	/* counted on every pass through relock */
+
+	if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
+		unlock_journal(p_s_sb);
+		reiserfs_wait_on_write_block(p_s_sb);
+		PROC_INFO_INC(p_s_sb, journal.journal_relock_writers);
+		goto relock;
+	}
+	now = get_seconds();	/* refresh, earlier passes may have slept */
+
+	/* Only when join is zero: if writers must wait, the batch is full, the
+	 * transaction is too old (only checked while it has writers), j_jlock
+	 * is held or cnodes are low, wait for / force its end and retry.
+	 */
+
+	if ((!join && journal->j_must_wait > 0) ||
+	    (!join
+	     && (journal->j_len_alloc + nblocks + 2) >= journal->j_max_batch)
+	    || (!join && atomic_read(&journal->j_wcount) > 0
+		&& journal->j_trans_start_time > 0
+		&& (now - journal->j_trans_start_time) >
+		journal->j_max_trans_age) || (!join
+					      && atomic_read(&journal->j_jlock))
+	    || (!join && journal->j_cnode_free < (journal->j_trans_max * 3))) {
+
+		old_trans_id = journal->j_trans_id;
+		unlock_journal(p_s_sb);	/* allow others to finish this transaction */
+
+		if (!join && (journal->j_len_alloc + nblocks + 2) >=
+		    journal->j_max_batch &&
+		    ((journal->j_len + nblocks + 2) * 100) <
+		    (journal->j_len_alloc * 75)) {
+			if (atomic_read(&journal->j_wcount) > 10) {
+				sched_count++;
+				queue_log_writer(p_s_sb);
+				goto relock;
+			}
+		}
+		/* don't mess with joining the transaction if all we have to do is
+		 * wait for someone else to do a commit
+		 */
+		if (atomic_read(&journal->j_jlock)) {
+			while (journal->j_trans_id == old_trans_id &&
+			       atomic_read(&journal->j_jlock)) {
+				queue_log_writer(p_s_sb);
+			}
+			goto relock;
+		}
+		retval = journal_join(&myth, p_s_sb, 1);	/* temporary one-block handle */
+		if (retval)
+			goto out_fail;
+
+		/* someone might have ended the transaction while we joined */
+		if (old_trans_id != journal->j_trans_id) {
+			retval = do_journal_end(&myth, p_s_sb, 1, 0);
+		} else {
+			retval = do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW);
+		}
+
+		if (retval)
+			goto out_fail;
+
+		PROC_INFO_INC(p_s_sb, journal.journal_relock_wcount);
+		goto relock;
+	}
+	/* we are the first writer, record the transaction start time */
+	if (journal->j_trans_start_time == 0) {
+		journal->j_trans_start_time = get_seconds();
+	}
+	atomic_inc(&(journal->j_wcount));
+	journal->j_len_alloc += nblocks;
+	th->t_blocks_logged = 0;
+	th->t_blocks_allocated = nblocks;
+	th->t_trans_id = journal->j_trans_id;
+	unlock_journal(p_s_sb);
+	INIT_LIST_HEAD(&th->t_list);
+	get_fs_excl();
+	return 0;
+
+      out_fail:
+	memset(th, 0, sizeof(*th));	/* also zeroes t_trans_id and t_refcount */
+	/* Re-set th->t_super, so we can properly keep track of how many
+	 * persistent transactions there are. We need to do this so if this
+	 * call is part of a failed restart_transaction, we can free it later */
+	th->t_super = p_s_sb;
+	return retval;
+}
+
+struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
+								    super_block
+								    *s,
+								    int nblocks)
+{
+	int ret;
+	struct reiserfs_transaction_handle *th;
+
+	/* If a transaction is already running, nest into the task's current
+	 * handle by taking another reference instead of allocating a new one.
+	 */
+	if (reiserfs_transaction_running(s)) {
+		th = current->journal_info;
+		th->t_refcount++;
+		if (th->t_refcount < 2) {
+			BUG();
+		}
+		return th;
+	}
+	th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle),
+			      GFP_NOFS, s);
+	if (!th)
+		return NULL;
+	ret = journal_begin(th, s, nblocks);
+	if (ret) {
+		reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle),
+			       s);
+		return NULL;	/* the error code in ret is not passed on */
+	}
+
+	SB_JOURNAL(s)->j_persistent_trans++;	/* decremented in reiserfs_end_persistent_transaction() */
+	return th;
+}
+
+int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th)
+{
+	struct super_block *s = th->t_super;
+	int ret = 0;
+	if (th->t_trans_id)
+		ret = journal_end(th, th->t_super, th->t_blocks_allocated);
+	else
+		ret = -EIO;	/* handle carries no transaction id */
+	if (th->t_refcount == 0) {	/* no references left: free the handle */
+		SB_JOURNAL(s)->j_persistent_trans--;
+		reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle),
+			       s);
+	}
+	return ret;
+}
+
+static int journal_join(struct reiserfs_transaction_handle *th,
+			struct super_block *p_s_sb, unsigned long nblocks)
+{
+	struct reiserfs_transaction_handle *cur_th = current->journal_info;
+
+	/* this keeps do_journal_end from NULLing out the current->journal_info
+	 ** pointer
+	 */
+	th->t_handle_save = cur_th;
+	if (cur_th && cur_th->t_refcount > 1) {	/* joining while the task's handle is nested is a bug */
+		BUG();
+	}
+	return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_JOIN);
+}
+
+int journal_join_abort(struct reiserfs_transaction_handle *th,
+		       struct super_block *p_s_sb, unsigned long nblocks)
+{
+	struct reiserfs_transaction_handle *cur_th = current->journal_info;
+
+	/* this keeps do_journal_end from NULLing out the current->journal_info
+	 ** pointer
+	 */
+	th->t_handle_save = cur_th;
+	if (cur_th && cur_th->t_refcount > 1) {	/* joining while the task's handle is nested is a bug */
+		BUG();
+	}
+	return do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_ABORT);	/* JBEGIN_ABORT skips the aborted-journal check */
+}
+
+int journal_begin(struct reiserfs_transaction_handle *th,
+		  struct super_block *p_s_sb, unsigned long nblocks)
+{
+	struct reiserfs_transaction_handle *cur_th = current->journal_info;
+	int ret;
+
+	th->t_handle_save = NULL;
+	if (cur_th) {
+		/* we are nesting into the current transaction */
+		if (cur_th->t_super == p_s_sb) {
+			BUG_ON(!cur_th->t_refcount);
+			cur_th->t_refcount++;
+			memcpy(th, cur_th, sizeof(*th));	/* th becomes a copy of the task's handle */
+			if (th->t_refcount <= 1)
+				reiserfs_warning(p_s_sb,
+						 "BAD: refcount <= 1, but journal_info != 0");
+			return 0;
+		} else {
+			/* we've ended up with a handle from a different filesystem.
+			 ** save it and restore on journal_end.  This should never
+			 ** really happen...
+			 */
+			reiserfs_warning(p_s_sb,
+					 "clm-2100: nesting info a different FS");
+			th->t_handle_save = current->journal_info;
+			current->journal_info = th;
+		}
 	} else {
-	    /* we've ended up with a handle from a different filesystem.
-	    ** save it and restore on journal_end.  This should never
-	    ** really happen...
-	    */
-	    reiserfs_warning(p_s_sb, "clm-2100: nesting info a different FS") ;
-	    th->t_handle_save = current->journal_info ;
-	    current->journal_info = th;
-	}
-    } else {
-	current->journal_info = th;
-    }
-    ret = do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_REG) ;
-    if (current->journal_info != th)
-        BUG() ;
+		current->journal_info = th;	/* no handle in the task yet: th becomes it */
+	}
+	ret = do_journal_begin_r(th, p_s_sb, nblocks, JBEGIN_REG);
+	if (current->journal_info != th)
+		BUG();
 
-    /* I guess this boils down to being the reciprocal of clm-2100 above.
-     * If do_journal_begin_r fails, we need to put it back, since journal_end
-     * won't be called to do it. */
-    if (ret)
-        current->journal_info = th->t_handle_save;
-    else
-        BUG_ON (!th->t_refcount);
+	/* I guess this boils down to being the reciprocal of clm-2100 above.
+	 * If do_journal_begin_r fails, we need to put it back, since journal_end
+	 * won't be called to do it. */
+	if (ret)
+		current->journal_info = th->t_handle_save;
+	else
+		BUG_ON(!th->t_refcount);
 
-    return ret ;
+	return ret;
 }
 
 /*
@@ -2861,129 +3139,140 @@ int journal_begin(struct reiserfs_transaction_handle *th, struct super_block  *
 ** 
 ** if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len.
 */
-int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, struct buffer_head *bh) {
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-  struct reiserfs_journal_cnode *cn = NULL;
-  int count_already_incd = 0 ;
-  int prepared = 0 ;
-  BUG_ON (!th->t_trans_id);
-
-  PROC_INFO_INC( p_s_sb, journal.mark_dirty );
-  if (th->t_trans_id != journal->j_trans_id) {
-    reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", 
-                   th->t_trans_id, journal->j_trans_id);
-  }
-
-  p_s_sb->s_dirt = 1;
-
-  prepared = test_clear_buffer_journal_prepared (bh);
-  clear_buffer_journal_restore_dirty (bh);
-  /* already in this transaction, we are done */
-  if (buffer_journaled(bh)) {
-    PROC_INFO_INC( p_s_sb, journal.mark_dirty_already );
-    return 0 ;
-  }
-
-  /* this must be turned into a panic instead of a warning.  We can't allow
-  ** a dirty or journal_dirty or locked buffer to be logged, as some changes
-  ** could get to disk too early.  NOT GOOD.
-  */
-  if (!prepared || buffer_dirty(bh)) {
-    reiserfs_warning (p_s_sb, "journal-1777: buffer %llu bad state "
-		      "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
-		      (unsigned long long)bh->b_blocknr, prepared ? ' ' : '!',
-			    buffer_locked(bh) ? ' ' : '!',
-			    buffer_dirty(bh) ? ' ' : '!',
-			    buffer_journal_dirty(bh) ? ' ' : '!') ;
-  }
-
-  if (atomic_read(&(journal->j_wcount)) <= 0) {
-    reiserfs_warning (p_s_sb, "journal-1409: journal_mark_dirty returning because j_wcount was %d", atomic_read(&(journal->j_wcount))) ;
-    return 1 ;
-  }
-  /* this error means I've screwed up, and we've overflowed the transaction.  
-  ** Nothing can be done here, except make the FS readonly or panic.
-  */ 
-  if (journal->j_len >= journal->j_trans_max) {
-    reiserfs_panic(th->t_super, "journal-1413: journal_mark_dirty: j_len (%lu) is too big\n", journal->j_len) ;
-  }
-
-  if (buffer_journal_dirty(bh)) {
-    count_already_incd = 1 ;
-    PROC_INFO_INC( p_s_sb, journal.mark_dirty_notjournal );
-    clear_buffer_journal_dirty (bh);
-  }
-
-  if (journal->j_len > journal->j_len_alloc) {
-    journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT ;
-  }
-
-  set_buffer_journaled (bh);
-
-  /* now put this guy on the end */
-  if (!cn) {
-    cn = get_cnode(p_s_sb) ;
-    if (!cn) {
-      reiserfs_panic(p_s_sb, "get_cnode failed!\n"); 
-    }
-
-    if (th->t_blocks_logged == th->t_blocks_allocated) {
-      th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT ;
-      journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT ;
-    }
-    th->t_blocks_logged++ ;
-    journal->j_len++ ;
-
-    cn->bh = bh ;
-    cn->blocknr = bh->b_blocknr ;
-    cn->sb = p_s_sb;
-    cn->jlist = NULL ;
-    insert_journal_hash(journal->j_hash_table, cn) ;
-    if (!count_already_incd) {
-      get_bh(bh) ;
-    }
-  }
-  cn->next = NULL ;
-  cn->prev = journal->j_last ;
-  cn->bh = bh ;
-  if (journal->j_last) {
-    journal->j_last->next = cn ;
-    journal->j_last = cn ;
-  } else {
-    journal->j_first = cn ;
-    journal->j_last = cn ;
-  }
-  return 0 ;
-}
-
-int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
-  if (!current->journal_info && th->t_refcount > 1)
-    reiserfs_warning (p_s_sb, "REISER-NESTING: th NULL, refcount %d",
-                      th->t_refcount);
-
-  if (!th->t_trans_id) {
-    WARN_ON (1);
-    return -EIO;
-  }
-
-  th->t_refcount--;
-  if (th->t_refcount > 0) {
-    struct reiserfs_transaction_handle *cur_th = current->journal_info ;
-
-    /* we aren't allowed to close a nested transaction on a different
-    ** filesystem from the one in the task struct
-    */
-    if (cur_th->t_super != th->t_super)
-      BUG() ;
-
-    if (th != cur_th) {
-      memcpy(current->journal_info, th, sizeof(*th));
-      th->t_trans_id = 0;
-    }
-    return 0;
-  } else {
-    return do_journal_end(th, p_s_sb, nblocks, 0) ;
-  }
+int journal_mark_dirty(struct reiserfs_transaction_handle *th,
+		       struct super_block *p_s_sb, struct buffer_head *bh)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal_cnode *cn = NULL;	/* NOTE(review): not assigned again before the !cn test below */
+	int count_already_incd = 0;
+	int prepared = 0;
+	BUG_ON(!th->t_trans_id);
+
+	PROC_INFO_INC(p_s_sb, journal.mark_dirty);
+	if (th->t_trans_id != journal->j_trans_id) {
+		reiserfs_panic(th->t_super,
+			       "journal-1577: handle trans id %ld != current trans id %ld\n",
+			       th->t_trans_id, journal->j_trans_id);
+	}
+
+	p_s_sb->s_dirt = 1;
+
+	prepared = test_clear_buffer_journal_prepared(bh);
+	clear_buffer_journal_restore_dirty(bh);
+	/* already in this transaction, we are done */
+	if (buffer_journaled(bh)) {
+		PROC_INFO_INC(p_s_sb, journal.mark_dirty_already);
+		return 0;
+	}
+
+	/* this must be turned into a panic instead of a warning.  We can't allow
+	 ** a dirty or journal_dirty or locked buffer to be logged, as some changes
+	 ** could get to disk too early.  NOT GOOD.
+	 */
+	if (!prepared || buffer_dirty(bh)) {
+		reiserfs_warning(p_s_sb, "journal-1777: buffer %llu bad state "
+				 "%cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT",
+				 (unsigned long long)bh->b_blocknr,
+				 prepared ? ' ' : '!',
+				 buffer_locked(bh) ? ' ' : '!',
+				 buffer_dirty(bh) ? ' ' : '!',
+				 buffer_journal_dirty(bh) ? ' ' : '!');
+	}
+
+	if (atomic_read(&(journal->j_wcount)) <= 0) {
+		reiserfs_warning(p_s_sb,
+				 "journal-1409: journal_mark_dirty returning because j_wcount was %d",
+				 atomic_read(&(journal->j_wcount)));
+		return 1;	/* the buffer was not logged */
+	}
+	/* this error means I've screwed up, and we've overflowed the transaction.
+	 ** Nothing can be done here, except make the FS readonly or panic.
+	 */
+	if (journal->j_len >= journal->j_trans_max) {
+		reiserfs_panic(th->t_super,
+			       "journal-1413: journal_mark_dirty: j_len (%lu) is too big\n",
+			       journal->j_len);
+	}
+
+	if (buffer_journal_dirty(bh)) {
+		count_already_incd = 1;	/* suppresses the get_bh() below */
+		PROC_INFO_INC(p_s_sb, journal.mark_dirty_notjournal);
+		clear_buffer_journal_dirty(bh);
+	}
+
+	if (journal->j_len > journal->j_len_alloc) {
+		journal->j_len_alloc = journal->j_len + JOURNAL_PER_BALANCE_CNT;
+	}
+
+	set_buffer_journaled(bh);
+
+	/* now put this guy on the end */
+	if (!cn) {	/* always true: cn is still NULL here */
+		cn = get_cnode(p_s_sb);
+		if (!cn) {
+			reiserfs_panic(p_s_sb, "get_cnode failed!\n");
+		}
+
+		if (th->t_blocks_logged == th->t_blocks_allocated) {	/* handle used up its reservation: grow it */
+			th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT;
+			journal->j_len_alloc += JOURNAL_PER_BALANCE_CNT;
+		}
+		th->t_blocks_logged++;
+		journal->j_len++;
+
+		cn->bh = bh;
+		cn->blocknr = bh->b_blocknr;
+		cn->sb = p_s_sb;
+		cn->jlist = NULL;
+		insert_journal_hash(journal->j_hash_table, cn);
+		if (!count_already_incd) {
+			get_bh(bh);
+		}
+	}
+	cn->next = NULL;
+	cn->prev = journal->j_last;
+	cn->bh = bh;	/* same value as already stored above */
+	if (journal->j_last) {
+		journal->j_last->next = cn;
+		journal->j_last = cn;
+	} else {	/* list was empty: cn is both first and last */
+		journal->j_first = cn;
+		journal->j_last = cn;
+	}
+	return 0;
+}
+
+int journal_end(struct reiserfs_transaction_handle *th,
+		struct super_block *p_s_sb, unsigned long nblocks)
+{
+	if (!current->journal_info && th->t_refcount > 1)
+		reiserfs_warning(p_s_sb, "REISER-NESTING: th NULL, refcount %d",
+				 th->t_refcount);
+
+	if (!th->t_trans_id) {	/* handle carries no transaction id */
+		WARN_ON(1);
+		return -EIO;
+	}
+
+	th->t_refcount--;	/* drop this caller's reference */
+	if (th->t_refcount > 0) {
+		struct reiserfs_transaction_handle *cur_th =
+		    current->journal_info;	/* NOTE(review): NULL in the case warned about above, yet dereferenced below */
+
+		/* we aren't allowed to close a nested transaction on a different
+		 ** filesystem from the one in the task struct
+		 */
+		if (cur_th->t_super != th->t_super)
+			BUG();
+
+		if (th != cur_th) {
+			memcpy(current->journal_info, th, sizeof(*th));	/* copy this handle's state into the task's handle */
+			th->t_trans_id = 0;	/* a later journal_end() on this copy returns -EIO */
+		}
+		return 0;
+	} else {
+		return do_journal_end(th, p_s_sb, nblocks, 0);	/* last reference */
+	}
 }
 
 /* removes from the current transaction, relsing and descrementing any counters.  
@@ -2993,47 +3282,51 @@ int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_
 **
 ** returns 1 if it cleaned and relsed the buffer. 0 otherwise
 */
-static int remove_from_transaction(struct super_block *p_s_sb, b_blocknr_t blocknr, int already_cleaned) {
-  struct buffer_head *bh ;
-  struct reiserfs_journal_cnode *cn ;
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-  int ret = 0;
-
-  cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr) ;
-  if (!cn || !cn->bh) {
-    return ret ;
-  }
-  bh = cn->bh ;
-  if (cn->prev) {
-    cn->prev->next = cn->next ;
-  }
-  if (cn->next) {
-    cn->next->prev = cn->prev ;
-  }
-  if (cn == journal->j_first) {
-    journal->j_first = cn->next ;
-  }
-  if (cn == journal->j_last) {
-    journal->j_last = cn->prev ;
-  }
-  if (bh)
-	remove_journal_hash(p_s_sb, journal->j_hash_table, NULL, bh->b_blocknr, 0) ;
-  clear_buffer_journaled  (bh); /* don't log this one */
-
-  if (!already_cleaned) {
-    clear_buffer_journal_dirty (bh);
-    clear_buffer_dirty(bh);
-    clear_buffer_journal_test (bh);
-    put_bh(bh) ;
-    if (atomic_read(&(bh->b_count)) < 0) {
-      reiserfs_warning (p_s_sb, "journal-1752: remove from trans, b_count < 0");
-    }
-    ret = 1 ;
-  }
-  journal->j_len-- ;
-  journal->j_len_alloc-- ;
-  free_cnode(p_s_sb, cn) ;
-  return ret ;
+static int remove_from_transaction(struct super_block *p_s_sb,
+				   b_blocknr_t blocknr, int already_cleaned)
+{
+	struct buffer_head *bh;
+	struct reiserfs_journal_cnode *cn;
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	int ret = 0;
+
+	cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr);
+	if (!cn || !cn->bh) {	/* no cnode with a buffer for this block in j_hash_table */
+		return ret;
+	}
+	bh = cn->bh;
+	if (cn->prev) {	/* unlink cn from the j_first..j_last list */
+		cn->prev->next = cn->next;
+	}
+	if (cn->next) {
+		cn->next->prev = cn->prev;
+	}
+	if (cn == journal->j_first) {
+		journal->j_first = cn->next;
+	}
+	if (cn == journal->j_last) {
+		journal->j_last = cn->prev;
+	}
+	if (bh)	/* NOTE(review): always true, cn->bh was checked above */
+		remove_journal_hash(p_s_sb, journal->j_hash_table, NULL,
+				    bh->b_blocknr, 0);
+	clear_buffer_journaled(bh);	/* don't log this one */
+
+	if (!already_cleaned) {
+		clear_buffer_journal_dirty(bh);
+		clear_buffer_dirty(bh);
+		clear_buffer_journal_test(bh);
+		put_bh(bh);
+		if (atomic_read(&(bh->b_count)) < 0) {
+			reiserfs_warning(p_s_sb,
+					 "journal-1752: remove from trans, b_count < 0");
+		}
+		ret = 1;	/* buffer was cleaned and released here */
+	}
+	journal->j_len--;
+	journal->j_len_alloc--;
+	free_cnode(p_s_sb, cn);
+	return ret;
 }
 
 /*
@@ -3046,120 +3339,129 @@ static int remove_from_transaction(struct super_block *p_s_sb, b_blocknr_t block
 ** blocks for a given transaction on disk
 **
 */
-static int can_dirty(struct reiserfs_journal_cnode *cn) {
-  struct super_block *sb = cn->sb;
-  b_blocknr_t blocknr = cn->blocknr  ;
-  struct reiserfs_journal_cnode *cur = cn->hprev ;
-  int can_dirty = 1 ;
-  
-  /* first test hprev.  These are all newer than cn, so any node here
-  ** with the same block number and dev means this node can't be sent
-  ** to disk right now.
-  */
-  while(cur && can_dirty) {
-    if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb && 
-        cur->blocknr == blocknr) {
-      can_dirty = 0 ;
-    }
-    cur = cur->hprev ;
-  }
-  /* then test hnext.  These are all older than cn.  As long as they
-  ** are committed to the log, it is safe to write cn to disk
-  */
-  cur = cn->hnext ;
-  while(cur && can_dirty) {
-    if (cur->jlist && cur->jlist->j_len > 0 && 
-        atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh && 
-        cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) {
-      can_dirty = 0 ;
-    }
-    cur = cur->hnext ;
-  }
-  return can_dirty ;
+static int can_dirty(struct reiserfs_journal_cnode *cn)
+{
+	struct super_block *sb = cn->sb;
+	b_blocknr_t blocknr = cn->blocknr;
+	struct reiserfs_journal_cnode *cur = cn->hprev;
+	int can_dirty = 1;
+
+	/* first test hprev.  These are all newer than cn, so any node here
+	 ** with the same block number and dev means this node can't be sent
+	 ** to disk right now.
+	 */
+	while (cur && can_dirty) {
+		if (cur->jlist && cur->bh && cur->blocknr && cur->sb == sb &&
+		    cur->blocknr == blocknr) {
+			can_dirty = 0;
+		}
+		cur = cur->hprev;
+	}
+	/* then test hnext.  These are all older than cn.  As long as they
+	 ** are committed to the log, it is safe to write cn to disk
+	 */
+	cur = cn->hnext;
+	while (cur && can_dirty) {
+		if (cur->jlist && cur->jlist->j_len > 0 &&
+		    atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh &&
+		    cur->blocknr && cur->sb == sb && cur->blocknr == blocknr) {
+			can_dirty = 0;
+		}
+		cur = cur->hnext;
+	}
+	return can_dirty;
 }
 
 /* syncs the commit blocks, but does not force the real buffers to disk
 ** will wait until the current transaction is done/commited before returning 
 */
-int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
+int journal_end_sync(struct reiserfs_transaction_handle *th,
+		     struct super_block *p_s_sb, unsigned long nblocks)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
 
-  BUG_ON (!th->t_trans_id);
-  /* you can sync while nested, very, very bad */
-  if (th->t_refcount > 1) {
-    BUG() ;
-  }
-  if (journal->j_len == 0) {
-    reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
-    journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
-  }
-  return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT) ;
+	BUG_ON(!th->t_trans_id);
+	/* syncing while nested is very, very bad */
+	if (th->t_refcount > 1) {
+		BUG();
+	}
+	if (journal->j_len == 0) {
+		reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb),
+					     1);
+		journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb));
+	}
+	return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT);
 }
 
 /*
 ** writeback the pending async commits to disk
 */
-static void flush_async_commits(void *p) {
-  struct super_block *p_s_sb = p;
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-  struct reiserfs_journal_list *jl;
-  struct list_head *entry;
-
-  lock_kernel();
-  if (!list_empty(&journal->j_journal_list)) {
-      /* last entry is the youngest, commit it and you get everything */
-      entry = journal->j_journal_list.prev;
-      jl = JOURNAL_LIST_ENTRY(entry);
-      flush_commit_list(p_s_sb, jl, 1);
-  }
-  unlock_kernel();
-  /*
-   * this is a little racey, but there's no harm in missing
-   * the filemap_fdata_write
-   */
-  if (!atomic_read(&journal->j_async_throttle) && !reiserfs_is_journal_aborted (journal)) {
-      atomic_inc(&journal->j_async_throttle);
-      filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping);
-      atomic_dec(&journal->j_async_throttle);
-  }
+static void flush_async_commits(void *p)
+{
+	struct super_block *p_s_sb = p;
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal_list *jl;
+	struct list_head *entry;
+
+	lock_kernel();
+	if (!list_empty(&journal->j_journal_list)) {
+		/* last entry is the youngest, commit it and you get everything */
+		entry = journal->j_journal_list.prev;
+		jl = JOURNAL_LIST_ENTRY(entry);
+		flush_commit_list(p_s_sb, jl, 1);
+	}
+	unlock_kernel();
+	/*
+	 * this is a little racy, but there's no harm in missing
+	 * the filemap_fdatawrite
+	 */
+	if (!atomic_read(&journal->j_async_throttle)
+	    && !reiserfs_is_journal_aborted(journal)) {
+		atomic_inc(&journal->j_async_throttle);
+		filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping);
+		atomic_dec(&journal->j_async_throttle);
+	}
 }
 
 /*
 ** flushes any old transactions to disk
 ** ends the current transaction if it is too old
 */
-int reiserfs_flush_old_commits(struct super_block *p_s_sb) {
-    time_t now ;
-    struct reiserfs_transaction_handle th ;
-    struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-
-    now = get_seconds();
-    /* safety check so we don't flush while we are replaying the log during
-     * mount
-     */
-    if (list_empty(&journal->j_journal_list)) {
-	return 0  ;
-    }
-
-    /* check the current transaction.  If there are no writers, and it is
-     * too old, finish it, and force the commit blocks to disk
-     */
-    if (atomic_read(&journal->j_wcount) <= 0 &&
-        journal->j_trans_start_time > 0 &&
-        journal->j_len > 0 &&
-        (now - journal->j_trans_start_time) > journal->j_max_trans_age)
-    {
-	if (!journal_join(&th, p_s_sb, 1)) {
-            reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
-            journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
-
-            /* we're only being called from kreiserfsd, it makes no sense to do
-            ** an async commit so that kreiserfsd can do it later
-            */
-            do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ;
-        }
-    }
-    return p_s_sb->s_dirt;
+int reiserfs_flush_old_commits(struct super_block *p_s_sb)
+{
+	time_t now;
+	struct reiserfs_transaction_handle th;
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+
+	now = get_seconds();
+	/* safety check so we don't flush while we are replaying the log during
+	 * mount
+	 */
+	if (list_empty(&journal->j_journal_list)) {
+		return 0;
+	}
+
+	/* check the current transaction.  If there are no writers, and it is
+	 * too old, finish it, and force the commit blocks to disk
+	 */
+	if (atomic_read(&journal->j_wcount) <= 0 &&
+	    journal->j_trans_start_time > 0 &&
+	    journal->j_len > 0 &&
+	    (now - journal->j_trans_start_time) > journal->j_max_trans_age) {
+		if (!journal_join(&th, p_s_sb, 1)) {
+			reiserfs_prepare_for_journal(p_s_sb,
+						     SB_BUFFER_WITH_SB(p_s_sb),
+						     1);
+			journal_mark_dirty(&th, p_s_sb,
+					   SB_BUFFER_WITH_SB(p_s_sb));
+
+			/* we're only being called from kreiserfsd, it makes no sense to do
+			 ** an async commit so that kreiserfsd can do it later
+			 */
+			do_journal_end(&th, p_s_sb, 1, COMMIT_NOW | WAIT);
+		}
+	}
+	return p_s_sb->s_dirt;
 }
 
 /*
@@ -3173,101 +3475,108 @@ int reiserfs_flush_old_commits(struct super_block *p_s_sb) {
 ** 
 ** Note, we can't allow the journal_end to proceed while there are still writers in the log.
 */
-static int check_journal_end(struct reiserfs_transaction_handle *th, struct super_block  * p_s_sb, 
-                             unsigned long nblocks, int flags) {
-
-  time_t now ;
-  int flush = flags & FLUSH_ALL ;
-  int commit_now = flags & COMMIT_NOW ;
-  int wait_on_commit = flags & WAIT ;
-  struct reiserfs_journal_list *jl;
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-
-  BUG_ON (!th->t_trans_id);
-
-  if (th->t_trans_id != journal->j_trans_id) {
-    reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", 
-                   th->t_trans_id, journal->j_trans_id);
-  }
-
-  journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged) ;
-  if (atomic_read(&(journal->j_wcount)) > 0) { /* <= 0 is allowed.  unmounting might not call begin */
-    atomic_dec(&(journal->j_wcount)) ;
-  }
-
-  /* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released 
-  ** will be dealt with by next transaction that actually writes something, but should be taken
-  ** care of in this trans
-  */
-  if (journal->j_len == 0) {
-    BUG();
-  }
-  /* if wcount > 0, and we are called to with flush or commit_now,
-  ** we wait on j_join_wait.  We will wake up when the last writer has
-  ** finished the transaction, and started it on its way to the disk.
-  ** Then, we flush the commit or journal list, and just return 0 
-  ** because the rest of journal end was already done for this transaction.
-  */
-  if (atomic_read(&(journal->j_wcount)) > 0) {
-    if (flush || commit_now) {
-      unsigned trans_id ;
-
-      jl = journal->j_current_jl;
-      trans_id = jl->j_trans_id;
-      if (wait_on_commit)
-        jl->j_state |= LIST_COMMIT_PENDING;
-      atomic_set(&(journal->j_jlock), 1) ;
-      if (flush) {
-        journal->j_next_full_flush = 1 ;
-      }
-      unlock_journal(p_s_sb) ;
-
-      /* sleep while the current transaction is still j_jlocked */
-      while(journal->j_trans_id == trans_id) {
-	if (atomic_read(&journal->j_jlock)) {
-	    queue_log_writer(p_s_sb);
-        } else {
-	    lock_journal(p_s_sb);
-	    if (journal->j_trans_id == trans_id) {
-	        atomic_set(&(journal->j_jlock), 1) ;
-	    }
-	    unlock_journal(p_s_sb);
-	}
-      }
-      if (journal->j_trans_id == trans_id) {
-          BUG();
-      }
-      if (commit_now && journal_list_still_alive(p_s_sb, trans_id) &&
-          wait_on_commit)
-      {
-	  flush_commit_list(p_s_sb, jl, 1) ;
-      }
-      return 0 ;
-    } 
-    unlock_journal(p_s_sb) ;
-    return 0 ;
-  }
-
-  /* deal with old transactions where we are the last writers */
-  now = get_seconds();
-  if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
-    commit_now = 1 ;
-    journal->j_next_async_flush = 1 ;
-  }
-  /* don't batch when someone is waiting on j_join_wait */
-  /* don't batch when syncing the commit or flushing the whole trans */
-  if (!(journal->j_must_wait > 0) && !(atomic_read(&(journal->j_jlock))) && !flush && !commit_now &&
-      (journal->j_len < journal->j_max_batch)  &&
-      journal->j_len_alloc < journal->j_max_batch && journal->j_cnode_free > (journal->j_trans_max * 3)) {
-    journal->j_bcount++ ;
-    unlock_journal(p_s_sb) ;
-    return 0 ;
-  }
-
-  if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
-    reiserfs_panic(p_s_sb, "journal-003: journal_end: j_start (%ld) is too high\n", journal->j_start) ;
-  }
-  return 1 ;
+static int check_journal_end(struct reiserfs_transaction_handle *th,
+			     struct super_block *p_s_sb, unsigned long nblocks,
+			     int flags)
+{
+
+	time_t now;
+	int flush = flags & FLUSH_ALL;
+	int commit_now = flags & COMMIT_NOW;
+	int wait_on_commit = flags & WAIT;
+	struct reiserfs_journal_list *jl;
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+
+	BUG_ON(!th->t_trans_id);
+
+	if (th->t_trans_id != journal->j_trans_id) {
+		reiserfs_panic(th->t_super,
+			       "journal-1577: handle trans id %ld != current trans id %ld\n",
+			       th->t_trans_id, journal->j_trans_id);
+	}
+
+	journal->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged);
+	if (atomic_read(&(journal->j_wcount)) > 0) {	/* <= 0 is allowed.  unmounting might not call begin */
+		atomic_dec(&(journal->j_wcount));
+	}
+
+	/* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released 
+	 ** will be dealt with by next transaction that actually writes something, but should be taken
+	 ** care of in this trans
+	 */
+	if (journal->j_len == 0) {
+		BUG();
+	}
+	/* if wcount > 0, and we are called with flush or commit_now,
+	 ** we wait on j_join_wait.  We will wake up when the last writer has
+	 ** finished the transaction, and started it on its way to the disk.
+	 ** Then, we flush the commit or journal list, and just return 0 
+	 ** because the rest of journal end was already done for this transaction.
+	 */
+	if (atomic_read(&(journal->j_wcount)) > 0) {
+		if (flush || commit_now) {
+			unsigned trans_id;
+
+			jl = journal->j_current_jl;
+			trans_id = jl->j_trans_id;
+			if (wait_on_commit)
+				jl->j_state |= LIST_COMMIT_PENDING;
+			atomic_set(&(journal->j_jlock), 1);
+			if (flush) {
+				journal->j_next_full_flush = 1;
+			}
+			unlock_journal(p_s_sb);
+
+			/* sleep while the current transaction is still j_jlocked */
+			while (journal->j_trans_id == trans_id) {
+				if (atomic_read(&journal->j_jlock)) {
+					queue_log_writer(p_s_sb);
+				} else {
+					lock_journal(p_s_sb);
+					if (journal->j_trans_id == trans_id) {
+						atomic_set(&(journal->j_jlock),
+							   1);
+					}
+					unlock_journal(p_s_sb);
+				}
+			}
+			if (journal->j_trans_id == trans_id) {
+				BUG();
+			}
+			if (commit_now
+			    && journal_list_still_alive(p_s_sb, trans_id)
+			    && wait_on_commit) {
+				flush_commit_list(p_s_sb, jl, 1);
+			}
+			return 0;
+		}
+		unlock_journal(p_s_sb);
+		return 0;
+	}
+
+	/* deal with old transactions where we are the last writers */
+	now = get_seconds();
+	if ((now - journal->j_trans_start_time) > journal->j_max_trans_age) {
+		commit_now = 1;
+		journal->j_next_async_flush = 1;
+	}
+	/* don't batch when someone is waiting on j_join_wait */
+	/* don't batch when syncing the commit or flushing the whole trans */
+	if (!(journal->j_must_wait > 0) && !(atomic_read(&(journal->j_jlock)))
+	    && !flush && !commit_now && (journal->j_len < journal->j_max_batch)
+	    && journal->j_len_alloc < journal->j_max_batch
+	    && journal->j_cnode_free > (journal->j_trans_max * 3)) {
+		journal->j_bcount++;
+		unlock_journal(p_s_sb);
+		return 0;
+	}
+
+	if (journal->j_start > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
+		reiserfs_panic(p_s_sb,
+			       "journal-003: journal_end: j_start (%ld) is too high\n",
+			       journal->j_start);
+	}
+	return 1;
 }
 
 /*
@@ -3284,83 +3593,95 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, struct supe
 **
 ** Then remove it from the current transaction, decrementing any counters and filing it on the clean list.
 */
-int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, b_blocknr_t blocknr) {
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-  struct reiserfs_journal_cnode *cn = NULL ;
-  struct buffer_head *bh = NULL ;
-  struct reiserfs_list_bitmap *jb = NULL ;
-  int cleaned = 0 ;
-  BUG_ON (!th->t_trans_id);
-
-  cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr);
-  if (cn && cn->bh) {
-      bh = cn->bh ;
-      get_bh(bh) ;
-  }
-  /* if it is journal new, we just remove it from this transaction */
-  if (bh && buffer_journal_new(bh)) {
-    clear_buffer_journal_new (bh);
-    clear_prepared_bits(bh) ;
-    reiserfs_clean_and_file_buffer(bh) ;
-    cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ;
-  } else {
-    /* set the bit for this block in the journal bitmap for this transaction */
-    jb = journal->j_current_jl->j_list_bitmap;
-    if (!jb) {
-      reiserfs_panic(p_s_sb, "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n") ;
-    }
-    set_bit_in_list_bitmap(p_s_sb, blocknr, jb) ;
-
-    /* Note, the entire while loop is not allowed to schedule.  */
-
-    if (bh) {
-      clear_prepared_bits(bh) ;
-      reiserfs_clean_and_file_buffer(bh) ;
-    }
-    cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ;
-
-    /* find all older transactions with this block, make sure they don't try to write it out */
-    cn = get_journal_hash_dev(p_s_sb,journal->j_list_hash_table,  blocknr) ;
-    while (cn) {
-      if (p_s_sb == cn->sb && blocknr == cn->blocknr) {
-	set_bit(BLOCK_FREED, &cn->state) ;
-	if (cn->bh) {
-	  if (!cleaned) {
-	    /* remove_from_transaction will brelse the buffer if it was 
-	    ** in the current trans
-	    */
-            clear_buffer_journal_dirty (cn->bh);
-	    clear_buffer_dirty(cn->bh);
-	    clear_buffer_journal_test(cn->bh);
-	    cleaned = 1 ;
-	    put_bh(cn->bh) ;
-	    if (atomic_read(&(cn->bh->b_count)) < 0) {
-	      reiserfs_warning (p_s_sb, "journal-2138: cn->bh->b_count < 0");
-	    }
-	  }
-	  if (cn->jlist) { /* since we are clearing the bh, we MUST dec nonzerolen */
-	    atomic_dec(&(cn->jlist->j_nonzerolen)) ;
-	  }
-	  cn->bh = NULL ; 
-	} 
-      }
-      cn = cn->hnext ;
-    }
-  }
-
-  if (bh) {
-    put_bh(bh) ; /* get_hash grabs the buffer */
-    if (atomic_read(&(bh->b_count)) < 0) {
-      reiserfs_warning (p_s_sb, "journal-2165: bh->b_count < 0");
-    }
-  }
-  return 0 ;
-}
-
-void reiserfs_update_inode_transaction(struct inode *inode) {
-  struct reiserfs_journal *journal = SB_JOURNAL (inode->i_sb);
-  REISERFS_I(inode)->i_jl = journal->j_current_jl;
-  REISERFS_I(inode)->i_trans_id = journal->j_trans_id ;
+int journal_mark_freed(struct reiserfs_transaction_handle *th,
+		       struct super_block *p_s_sb, b_blocknr_t blocknr)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal_cnode *cn = NULL;
+	struct buffer_head *bh = NULL;
+	struct reiserfs_list_bitmap *jb = NULL;
+	int cleaned = 0;
+	BUG_ON(!th->t_trans_id);
+
+	cn = get_journal_hash_dev(p_s_sb, journal->j_hash_table, blocknr);
+	if (cn && cn->bh) {
+		bh = cn->bh;
+		get_bh(bh);
+	}
+	/* if it is journal new, we just remove it from this transaction */
+	if (bh && buffer_journal_new(bh)) {
+		clear_buffer_journal_new(bh);
+		clear_prepared_bits(bh);
+		reiserfs_clean_and_file_buffer(bh);
+		cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned);
+	} else {
+		/* set the bit for this block in the journal bitmap for this transaction */
+		jb = journal->j_current_jl->j_list_bitmap;
+		if (!jb) {
+			reiserfs_panic(p_s_sb,
+				       "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n");
+		}
+		set_bit_in_list_bitmap(p_s_sb, blocknr, jb);
+
+		/* Note, the entire while loop is not allowed to schedule.  */
+
+		if (bh) {
+			clear_prepared_bits(bh);
+			reiserfs_clean_and_file_buffer(bh);
+		}
+		cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned);
+
+		/* find all older transactions with this block, make sure they don't try to write it out */
+		cn = get_journal_hash_dev(p_s_sb, journal->j_list_hash_table,
+					  blocknr);
+		while (cn) {
+			if (p_s_sb == cn->sb && blocknr == cn->blocknr) {
+				set_bit(BLOCK_FREED, &cn->state);
+				if (cn->bh) {
+					if (!cleaned) {
+						/* remove_from_transaction will brelse the buffer if it was 
+						 ** in the current trans
+						 */
+						clear_buffer_journal_dirty(cn->
+									   bh);
+						clear_buffer_dirty(cn->bh);
+						clear_buffer_journal_test(cn->
+									  bh);
+						cleaned = 1;
+						put_bh(cn->bh);
+						if (atomic_read
+						    (&(cn->bh->b_count)) < 0) {
+							reiserfs_warning(p_s_sb,
+									 "journal-2138: cn->bh->b_count < 0");
+						}
+					}
+					if (cn->jlist) {	/* since we are clearing the bh, we MUST dec nonzerolen */
+						atomic_dec(&
+							   (cn->jlist->
+							    j_nonzerolen));
+					}
+					cn->bh = NULL;
+				}
+			}
+			cn = cn->hnext;
+		}
+	}
+
+	if (bh) {
+		put_bh(bh);	/* get_hash grabs the buffer */
+		if (atomic_read(&(bh->b_count)) < 0) {
+			reiserfs_warning(p_s_sb,
+					 "journal-2165: bh->b_count < 0");
+		}
+	}
+	return 0;
+}
+
+void reiserfs_update_inode_transaction(struct inode *inode)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(inode->i_sb);
+	REISERFS_I(inode)->i_jl = journal->j_current_jl;
+	REISERFS_I(inode)->i_trans_id = journal->j_trans_id;
 }
 
 /*
@@ -3368,99 +3689,102 @@ void reiserfs_update_inode_transaction(struct inode *inode) {
  * if a transaction was actually committed and the barrier was done
  */
 static int __commit_trans_jl(struct inode *inode, unsigned long id,
-                                 struct reiserfs_journal_list *jl)
+			     struct reiserfs_journal_list *jl)
 {
-    struct reiserfs_transaction_handle th ;
-    struct super_block *sb = inode->i_sb ;
-    struct reiserfs_journal *journal = SB_JOURNAL (sb);
-    int ret = 0;
+	struct reiserfs_transaction_handle th;
+	struct super_block *sb = inode->i_sb;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	int ret = 0;
+
+	/* is it from the current transaction, or from an unknown transaction? */
+	if (id == journal->j_trans_id) {
+		jl = journal->j_current_jl;
+		/* try to let other writers come in and grow this transaction */
+		let_transaction_grow(sb, id);
+		if (journal->j_trans_id != id) {
+			goto flush_commit_only;
+		}
 
-    /* is it from the current transaction, or from an unknown transaction? */
-    if (id == journal->j_trans_id) {
-	jl = journal->j_current_jl;
-	/* try to let other writers come in and grow this transaction */
-	let_transaction_grow(sb, id);
-	if (journal->j_trans_id != id) {
-	    goto flush_commit_only;
-	}
+		ret = journal_begin(&th, sb, 1);
+		if (ret)
+			return ret;
+
+		/* someone might have ended this transaction while we joined */
+		if (journal->j_trans_id != id) {
+			reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb),
+						     1);
+			journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb));
+			ret = journal_end(&th, sb, 1);
+			goto flush_commit_only;
+		}
 
-	ret = journal_begin(&th, sb, 1) ;
-	if (ret)
-	    return ret;
+		ret = journal_end_sync(&th, sb, 1);
+		if (!ret)
+			ret = 1;
 
-	/* someone might have ended this transaction while we joined */
-	if (journal->j_trans_id != id) {
-	    reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1) ;
-	    journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)) ;
-	    ret = journal_end(&th, sb, 1) ;
-	    goto flush_commit_only;
+	} else {
+		/* this gets tricky, we have to make sure the journal list in
+		 * the inode still exists.  We know the list is still around
+		 * if we've got a larger transaction id than the oldest list
+		 */
+	      flush_commit_only:
+		if (journal_list_still_alive(inode->i_sb, id)) {
+			/*
+			 * we only set ret to 1 when we know for sure
+			 * the barrier hasn't been started yet on the commit
+			 * block.
+			 */
+			if (atomic_read(&jl->j_commit_left) > 1)
+				ret = 1;
+			flush_commit_list(sb, jl, 1);
+			if (journal->j_errno)
+				ret = journal->j_errno;
+		}
 	}
+	/* otherwise the list is gone, and long since committed */
+	return ret;
+}
 
-	ret = journal_end_sync(&th, sb, 1) ;
-	if (!ret)
-	    ret = 1;
+int reiserfs_commit_for_inode(struct inode *inode)
+{
+	unsigned long id = REISERFS_I(inode)->i_trans_id;
+	struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
 
-    } else {
-	/* this gets tricky, we have to make sure the journal list in
-	 * the inode still exists.  We know the list is still around
-	 * if we've got a larger transaction id than the oldest list
+	/* for the whole inode, assume unset id means it was
+	 * changed in the current transaction.  More conservative
 	 */
-flush_commit_only:
-	if (journal_list_still_alive(inode->i_sb, id)) {
-	    /*
-	     * we only set ret to 1 when we know for sure
-	     * the barrier hasn't been started yet on the commit
-	     * block.
-	     */
-	    if (atomic_read(&jl->j_commit_left) > 1)
-	        ret = 1;
-	    flush_commit_list(sb, jl, 1) ;
-	    if (journal->j_errno)
-		ret = journal->j_errno;
-	}
-    }
-    /* otherwise the list is gone, and long since committed */
-    return ret;
-}
-
-int reiserfs_commit_for_inode(struct inode *inode) {
-    unsigned long id = REISERFS_I(inode)->i_trans_id;
-    struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
-
-    /* for the whole inode, assume unset id means it was
-     * changed in the current transaction.  More conservative
-     */
-    if (!id || !jl) {
-	reiserfs_update_inode_transaction(inode) ;
-	id = REISERFS_I(inode)->i_trans_id;
-	/* jl will be updated in __commit_trans_jl */
-    }
-
-   return __commit_trans_jl(inode, id, jl);
-}
-
-void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, 
-                                      struct buffer_head *bh) {
-    struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-    PROC_INFO_INC( p_s_sb, journal.restore_prepared );
-    if (!bh) {
-	return ;
-    }
-    if (test_clear_buffer_journal_restore_dirty (bh) &&
-	buffer_journal_dirty(bh)) {
-	struct reiserfs_journal_cnode *cn;
-	cn = get_journal_hash_dev(p_s_sb,
-	                          journal->j_list_hash_table,
-				  bh->b_blocknr);
-	if (cn && can_dirty(cn)) {
-            set_buffer_journal_test (bh);
-	    mark_buffer_dirty(bh);
-        }
-    }
-    clear_buffer_journal_prepared (bh);
-}
-
-extern struct tree_balance *cur_tb ;
+	if (!id || !jl) {
+		reiserfs_update_inode_transaction(inode);
+		id = REISERFS_I(inode)->i_trans_id;
+		/* jl will be updated in __commit_trans_jl */
+	}
+
+	return __commit_trans_jl(inode, id, jl);
+}
+
+void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb,
+				      struct buffer_head *bh)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	PROC_INFO_INC(p_s_sb, journal.restore_prepared);
+	if (!bh) {
+		return;
+	}
+	if (test_clear_buffer_journal_restore_dirty(bh) &&
+	    buffer_journal_dirty(bh)) {
+		struct reiserfs_journal_cnode *cn;
+		cn = get_journal_hash_dev(p_s_sb,
+					  journal->j_list_hash_table,
+					  bh->b_blocknr);
+		if (cn && can_dirty(cn)) {
+			set_buffer_journal_test(bh);
+			mark_buffer_dirty(bh);
+		}
+	}
+	clear_buffer_journal_prepared(bh);
+}
+
+extern struct tree_balance *cur_tb;
 /*
 ** before we can change a metadata block, we have to make sure it won't
 ** be written to disk while we are altering it.  So, we must:
@@ -3469,39 +3793,41 @@ extern struct tree_balance *cur_tb ;
 ** 
 */
 int reiserfs_prepare_for_journal(struct super_block *p_s_sb,
-                                  struct buffer_head *bh, int wait) {
-  PROC_INFO_INC( p_s_sb, journal.prepare );
-
-    if (test_set_buffer_locked(bh)) {
-	if (!wait)
-	    return 0;
-	lock_buffer(bh);
-    }
-    set_buffer_journal_prepared (bh);
-    if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh))  {
-        clear_buffer_journal_test (bh);
-        set_buffer_journal_restore_dirty (bh);
-    }
-    unlock_buffer(bh);
-    return 1;
-}
-
-static void flush_old_journal_lists(struct super_block *s) {
-    struct reiserfs_journal *journal = SB_JOURNAL (s);
-    struct reiserfs_journal_list *jl;
-    struct list_head *entry;
-    time_t now = get_seconds();
-
-    while(!list_empty(&journal->j_journal_list)) {
-        entry = journal->j_journal_list.next;
-	jl = JOURNAL_LIST_ENTRY(entry);
-	/* this check should always be run, to send old lists to disk */
-	if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) {
-	    flush_used_journal_lists(s, jl);
-	} else {
-	    break;
+				 struct buffer_head *bh, int wait)
+{
+	PROC_INFO_INC(p_s_sb, journal.prepare);
+
+	if (test_set_buffer_locked(bh)) {
+		if (!wait)
+			return 0;
+		lock_buffer(bh);
+	}
+	set_buffer_journal_prepared(bh);
+	if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) {
+		clear_buffer_journal_test(bh);
+		set_buffer_journal_restore_dirty(bh);
+	}
+	unlock_buffer(bh);
+	return 1;
+}
+
+static void flush_old_journal_lists(struct super_block *s)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	struct reiserfs_journal_list *jl;
+	struct list_head *entry;
+	time_t now = get_seconds();
+
+	while (!list_empty(&journal->j_journal_list)) {
+		entry = journal->j_journal_list.next;
+		jl = JOURNAL_LIST_ENTRY(entry);
+		/* this check should always be run, to send old lists to disk */
+		if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) {
+			flush_used_journal_lists(s, jl);
+		} else {
+			break;
+		}
 	}
-    }
 }
 
 /* 
@@ -3514,375 +3840,390 @@ static void flush_old_journal_lists(struct super_block *s) {
 ** If the journal is aborted, we just clean up. Things like flushing
 ** journal lists, etc just won't happen.
 */
-static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_block  * p_s_sb, unsigned long nblocks, 
-		          int flags) {
-  struct reiserfs_journal *journal = SB_JOURNAL (p_s_sb);
-  struct reiserfs_journal_cnode *cn, *next, *jl_cn; 
-  struct reiserfs_journal_cnode *last_cn = NULL;
-  struct reiserfs_journal_desc *desc ; 
-  struct reiserfs_journal_commit *commit ; 
-  struct buffer_head *c_bh ; /* commit bh */
-  struct buffer_head *d_bh ; /* desc bh */
-  int cur_write_start = 0 ; /* start index of current log write */
-  int old_start ;
-  int i ;
-  int flush = flags & FLUSH_ALL ;
-  int wait_on_commit = flags & WAIT ;
-  struct reiserfs_journal_list *jl, *temp_jl;
-  struct list_head *entry, *safe;
-  unsigned long jindex;
-  unsigned long commit_trans_id;
-  int trans_half;
-
-  BUG_ON (th->t_refcount > 1);
-  BUG_ON (!th->t_trans_id);
-
-  put_fs_excl();
-  current->journal_info = th->t_handle_save;
-  reiserfs_check_lock_depth(p_s_sb, "journal end");
-  if (journal->j_len == 0) {
-      reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
-      journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
-  }
-
-  lock_journal(p_s_sb) ;
-  if (journal->j_next_full_flush) {
-    flags |= FLUSH_ALL ;
-    flush = 1 ;
-  }
-  if (journal->j_next_async_flush) {
-    flags |= COMMIT_NOW | WAIT;
-    wait_on_commit = 1;
-  }
-
-  /* check_journal_end locks the journal, and unlocks if it does not return 1 
-  ** it tells us if we should continue with the journal_end, or just return
-  */
-  if (!check_journal_end(th, p_s_sb, nblocks, flags)) {
-    p_s_sb->s_dirt = 1;
-    wake_queued_writers(p_s_sb);
-    reiserfs_async_progress_wait(p_s_sb);
-    goto out ;
-  }
-
-  /* check_journal_end might set these, check again */
-  if (journal->j_next_full_flush) {
-    flush = 1 ;
-  }
-
-  /*
-  ** j must wait means we have to flush the log blocks, and the real blocks for
-  ** this transaction
-  */
-  if (journal->j_must_wait > 0) {
-    flush = 1 ;
-  }
+static int do_journal_end(struct reiserfs_transaction_handle *th,
+			  struct super_block *p_s_sb, unsigned long nblocks,
+			  int flags)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+	struct reiserfs_journal_cnode *cn, *next, *jl_cn;
+	struct reiserfs_journal_cnode *last_cn = NULL;
+	struct reiserfs_journal_desc *desc;
+	struct reiserfs_journal_commit *commit;
+	struct buffer_head *c_bh;	/* commit bh */
+	struct buffer_head *d_bh;	/* desc bh */
+	int cur_write_start = 0;	/* start index of current log write */
+	int old_start;
+	int i;
+	int flush = flags & FLUSH_ALL;
+	int wait_on_commit = flags & WAIT;
+	struct reiserfs_journal_list *jl, *temp_jl;
+	struct list_head *entry, *safe;
+	unsigned long jindex;
+	unsigned long commit_trans_id;
+	int trans_half;
+
+	BUG_ON(th->t_refcount > 1);
+	BUG_ON(!th->t_trans_id);
+
+	put_fs_excl();
+	current->journal_info = th->t_handle_save;
+	reiserfs_check_lock_depth(p_s_sb, "journal end");
+	if (journal->j_len == 0) {
+		reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb),
+					     1);
+		journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb));
+	}
 
+	lock_journal(p_s_sb);
+	if (journal->j_next_full_flush) {
+		flags |= FLUSH_ALL;
+		flush = 1;
+	}
+	if (journal->j_next_async_flush) {
+		flags |= COMMIT_NOW | WAIT;
+		wait_on_commit = 1;
+	}
+
+	/* check_journal_end locks the journal, and unlocks if it does not return 1 
+	 ** it tells us if we should continue with the journal_end, or just return
+	 */
+	if (!check_journal_end(th, p_s_sb, nblocks, flags)) {
+		p_s_sb->s_dirt = 1;
+		wake_queued_writers(p_s_sb);
+		reiserfs_async_progress_wait(p_s_sb);
+		goto out;
+	}
+
+	/* check_journal_end might set these, check again */
+	if (journal->j_next_full_flush) {
+		flush = 1;
+	}
+
+	/*
+	 ** j must wait means we have to flush the log blocks, and the real blocks for
+	 ** this transaction
+	 */
+	if (journal->j_must_wait > 0) {
+		flush = 1;
+	}
 #ifdef REISERFS_PREALLOCATE
-  /* quota ops might need to nest, setup the journal_info pointer for them */
-  current->journal_info = th ;
-  reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into
-				      * the transaction */
-  current->journal_info = th->t_handle_save ;
+	/* quota ops might need to nest, setup the journal_info pointer for them */
+	current->journal_info = th;
+	reiserfs_discard_all_prealloc(th);	/* it should not involve new blocks into
+						 * the transaction */
+	current->journal_info = th->t_handle_save;
 #endif
-  
-  /* setup description block */
-  d_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + journal->j_start) ;
-  set_buffer_uptodate(d_bh);
-  desc = (struct reiserfs_journal_desc *)(d_bh)->b_data ;
-  memset(d_bh->b_data, 0, d_bh->b_size) ;
-  memcpy(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8) ;
-  set_desc_trans_id(desc, journal->j_trans_id) ;
-
-  /* setup commit block.  Don't write (keep it clean too) this one until after everyone else is written */
-  c_bh =  journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 
-		 ((journal->j_start + journal->j_len + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
-  commit = (struct reiserfs_journal_commit *)c_bh->b_data ;
-  memset(c_bh->b_data, 0, c_bh->b_size) ;
-  set_commit_trans_id(commit, journal->j_trans_id) ;
-  set_buffer_uptodate(c_bh) ;
-
-  /* init this journal list */
-  jl = journal->j_current_jl;
-
-  /* we lock the commit before doing anything because
-   * we want to make sure nobody tries to run flush_commit_list until
-   * the new transaction is fully setup, and we've already flushed the
-   * ordered bh list
-   */
-  down(&jl->j_commit_lock);
-
-  /* save the transaction id in case we need to commit it later */
-  commit_trans_id = jl->j_trans_id;
-
-  atomic_set(&jl->j_older_commits_done, 0) ;
-  jl->j_trans_id = journal->j_trans_id ;
-  jl->j_timestamp = journal->j_trans_start_time ;
-  jl->j_commit_bh = c_bh ;
-  jl->j_start = journal->j_start ;
-  jl->j_len = journal->j_len ;
-  atomic_set(&jl->j_nonzerolen, journal->j_len) ;
-  atomic_set(&jl->j_commit_left, journal->j_len + 2);
-  jl->j_realblock = NULL ;
-
-  /* The ENTIRE FOR LOOP MUST not cause schedule to occur.
-  **  for each real block, add it to the journal list hash,
-  ** copy into real block index array in the commit or desc block
-  */
-  trans_half = journal_trans_half(p_s_sb->s_blocksize);
-  for (i = 0, cn = journal->j_first ; cn ; cn = cn->next, i++) {
-    if (buffer_journaled (cn->bh)) {
-      jl_cn = get_cnode(p_s_sb) ;
-      if (!jl_cn) {
-        reiserfs_panic(p_s_sb, "journal-1676, get_cnode returned NULL\n") ;
-      }
-      if (i == 0) {
-        jl->j_realblock = jl_cn ;
-      }
-      jl_cn->prev = last_cn ;
-      jl_cn->next = NULL ;
-      if (last_cn) {
-        last_cn->next = jl_cn ;
-      }
-      last_cn = jl_cn ;
-      /* make sure the block we are trying to log is not a block 
-         of journal or reserved area */
-
-      if (is_block_in_log_or_reserved_area(p_s_sb, cn->bh->b_blocknr)) {
-        reiserfs_panic(p_s_sb, "journal-2332: Trying to log block %lu, which is a log block\n", cn->bh->b_blocknr) ;
-      }
-      jl_cn->blocknr = cn->bh->b_blocknr ; 
-      jl_cn->state = 0 ;
-      jl_cn->sb = p_s_sb;
-      jl_cn->bh = cn->bh ;
-      jl_cn->jlist = jl;
-      insert_journal_hash(journal->j_list_hash_table, jl_cn) ;
-      if (i < trans_half) {
-	desc->j_realblock[i] = cpu_to_le32(cn->bh->b_blocknr) ;
-      } else {
-	commit->j_realblock[i - trans_half] = cpu_to_le32(cn->bh->b_blocknr) ;
-      }
-    } else {
-      i-- ;
-    }
-  }
-  set_desc_trans_len(desc, journal->j_len) ;
-  set_desc_mount_id(desc, journal->j_mount_id) ;
-  set_desc_trans_id(desc, journal->j_trans_id) ;
-  set_commit_trans_len(commit, journal->j_len);
-
-  /* special check in case all buffers in the journal were marked for not logging */
-  if (journal->j_len == 0) {
-    BUG();
-  }
-
-  /* we're about to dirty all the log blocks, mark the description block
-   * dirty now too.  Don't mark the commit block dirty until all the
-   * others are on disk
-   */
-  mark_buffer_dirty(d_bh);
-
-  /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */
-  cur_write_start = journal->j_start ;
-  cn = journal->j_first ;
-  jindex = 1 ; /* start at one so we don't get the desc again */
-  while(cn) {
-    clear_buffer_journal_new (cn->bh);
-    /* copy all the real blocks into log area.  dirty log blocks */
-    if (buffer_journaled (cn->bh)) {
-      struct buffer_head *tmp_bh ;
-      char *addr;
-      struct page *page;
-      tmp_bh =  journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 
-		       ((cur_write_start + jindex) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
-      set_buffer_uptodate(tmp_bh);
-      page = cn->bh->b_page;
-      addr = kmap(page);
-      memcpy(tmp_bh->b_data, addr + offset_in_page(cn->bh->b_data),
-             cn->bh->b_size);
-      kunmap(page);
-      mark_buffer_dirty(tmp_bh);
-      jindex++ ;
-      set_buffer_journal_dirty (cn->bh);
-      clear_buffer_journaled (cn->bh);
-    } else {
-      /* JDirty cleared sometime during transaction.  don't log this one */
-      reiserfs_warning(p_s_sb, "journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!") ;
-      brelse(cn->bh) ;
-    }
-    next = cn->next ;
-    free_cnode(p_s_sb, cn) ;
-    cn = next ;
-    cond_resched();
-  }
-
-  /* we are done  with both the c_bh and d_bh, but
-  ** c_bh must be written after all other commit blocks,
-  ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
-  */
-
-  journal->j_current_jl = alloc_journal_list(p_s_sb);
-
-  /* now it is safe to insert this transaction on the main list */
-  list_add_tail(&jl->j_list, &journal->j_journal_list);
-  list_add_tail(&jl->j_working_list, &journal->j_working_list);
-  journal->j_num_work_lists++;
-
-  /* reset journal values for the next transaction */
-  old_start = journal->j_start ;
-  journal->j_start = (journal->j_start + journal->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb);
-  atomic_set(&(journal->j_wcount), 0) ;
-  journal->j_bcount = 0 ;
-  journal->j_last = NULL ;
-  journal->j_first = NULL ;
-  journal->j_len = 0 ;
-  journal->j_trans_start_time = 0 ;
-  journal->j_trans_id++ ;
-  journal->j_current_jl->j_trans_id = journal->j_trans_id;
-  journal->j_must_wait = 0 ;
-  journal->j_len_alloc = 0 ;
-  journal->j_next_full_flush = 0 ;
-  journal->j_next_async_flush = 0 ;
-  init_journal_hash(p_s_sb) ; 
-
-  // make sure reiserfs_add_jh sees the new current_jl before we
-  // write out the tails
-  smp_mb();
-
-  /* tail conversion targets have to hit the disk before we end the
-   * transaction.  Otherwise a later transaction might repack the tail
-   * before this transaction commits, leaving the data block unflushed and
-   * clean, if we crash before the later transaction commits, the data block
-   * is lost.
-   */
-  if (!list_empty(&jl->j_tail_bh_list)) {
-      unlock_kernel();
-      write_ordered_buffers(&journal->j_dirty_buffers_lock,
-			    journal, jl, &jl->j_tail_bh_list);
-      lock_kernel();
-  }
-  if (!list_empty(&jl->j_tail_bh_list))
-      BUG();
-  up(&jl->j_commit_lock);
-
-  /* honor the flush wishes from the caller, simple commits can
-  ** be done outside the journal lock, they are done below
-  **
-  ** if we don't flush the commit list right now, we put it into
-  ** the work queue so the people waiting on the async progress work
-  ** queue don't wait for this proc to flush journal lists and such.
-  */
-  if (flush) {
-    flush_commit_list(p_s_sb, jl, 1) ;
-    flush_journal_list(p_s_sb, jl, 1) ;
-  } else if (!(jl->j_state & LIST_COMMIT_PENDING))
-    queue_delayed_work(commit_wq, &journal->j_work, HZ/10);
-
-
-  /* if the next transaction has any chance of wrapping, flush 
-  ** transactions that might get overwritten.  If any journal lists are very 
-  ** old flush them as well.  
-  */
-first_jl:
-  list_for_each_safe(entry, safe, &journal->j_journal_list) {
-    temp_jl = JOURNAL_LIST_ENTRY(entry);
-    if (journal->j_start <= temp_jl->j_start) {
-      if ((journal->j_start + journal->j_trans_max + 1) >=
-          temp_jl->j_start)
-      {
-	flush_used_journal_lists(p_s_sb, temp_jl);
-	goto first_jl;
-      } else if ((journal->j_start +
-                  journal->j_trans_max + 1) <
-		  SB_ONDISK_JOURNAL_SIZE(p_s_sb))
-      {
-          /* if we don't cross into the next transaction and we don't
-	   * wrap, there is no way we can overlap any later transactions
-	   * break now
-	   */
-	  break;
-      }
-    } else if ((journal->j_start +
-                journal->j_trans_max + 1) >
-		SB_ONDISK_JOURNAL_SIZE(p_s_sb))
-    {
-      if (((journal->j_start + journal->j_trans_max + 1) %
-            SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= temp_jl->j_start)
-      {
-	flush_used_journal_lists(p_s_sb, temp_jl);
-	goto first_jl;
-      } else {
-	  /* we don't overlap anything from out start to the end of the
-	   * log, and our wrapped portion doesn't overlap anything at
-	   * the start of the log.  We can break
-	   */
-	  break;
-      }
-    }
-  }
-  flush_old_journal_lists(p_s_sb);
-
-  journal->j_current_jl->j_list_bitmap = get_list_bitmap(p_s_sb, journal->j_current_jl) ;
-
-  if (!(journal->j_current_jl->j_list_bitmap)) {
-    reiserfs_panic(p_s_sb, "journal-1996: do_journal_end, could not get a list bitmap\n") ;
-  }
-
-  atomic_set(&(journal->j_jlock), 0) ;
-  unlock_journal(p_s_sb) ;
-  /* wake up any body waiting to join. */
-  clear_bit(J_WRITERS_QUEUED, &journal->j_state);
-  wake_up(&(journal->j_join_wait)) ;
-
-  if (!flush && wait_on_commit &&
-      journal_list_still_alive(p_s_sb, commit_trans_id)) {
-	  flush_commit_list(p_s_sb, jl, 1) ;
-  }
-out:
-  reiserfs_check_lock_depth(p_s_sb, "journal end2");
-
-  memset (th, 0, sizeof (*th));
-  /* Re-set th->t_super, so we can properly keep track of how many
-   * persistent transactions there are. We need to do this so if this
-   * call is part of a failed restart_transaction, we can free it later */
-  th->t_super = p_s_sb;
-
-  return journal->j_errno;
-}
-
-static void
-__reiserfs_journal_abort_hard (struct super_block *sb)
-{
-    struct reiserfs_journal *journal = SB_JOURNAL (sb);
-    if (test_bit (J_ABORTED, &journal->j_state))
-        return;
-
-    printk (KERN_CRIT "REISERFS: Aborting journal for filesystem on %s\n",
-                      reiserfs_bdevname (sb));
-
-    sb->s_flags |= MS_RDONLY;
-    set_bit (J_ABORTED, &journal->j_state);
+
+	/* setup description block */
+	d_bh =
+	    journal_getblk(p_s_sb,
+			   SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+			   journal->j_start);
+	set_buffer_uptodate(d_bh);
+	desc = (struct reiserfs_journal_desc *)(d_bh)->b_data;
+	memset(d_bh->b_data, 0, d_bh->b_size);
+	memcpy(get_journal_desc_magic(d_bh), JOURNAL_DESC_MAGIC, 8);
+	set_desc_trans_id(desc, journal->j_trans_id);
+
+	/* setup commit block.  Don't write (keep it clean too) this one until after everyone else is written */
+	c_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+			      ((journal->j_start + journal->j_len +
+				1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)));
+	commit = (struct reiserfs_journal_commit *)c_bh->b_data;
+	memset(c_bh->b_data, 0, c_bh->b_size);
+	set_commit_trans_id(commit, journal->j_trans_id);
+	set_buffer_uptodate(c_bh);
+
+	/* init this journal list */
+	jl = journal->j_current_jl;
+
+	/* we lock the commit before doing anything because
+	 * we want to make sure nobody tries to run flush_commit_list until
+	 * the new transaction is fully setup, and we've already flushed the
+	 * ordered bh list
+	 */
+	down(&jl->j_commit_lock);
+
+	/* save the transaction id in case we need to commit it later */
+	commit_trans_id = jl->j_trans_id;
+
+	atomic_set(&jl->j_older_commits_done, 0);
+	jl->j_trans_id = journal->j_trans_id;
+	jl->j_timestamp = journal->j_trans_start_time;
+	jl->j_commit_bh = c_bh;
+	jl->j_start = journal->j_start;
+	jl->j_len = journal->j_len;
+	atomic_set(&jl->j_nonzerolen, journal->j_len);
+	atomic_set(&jl->j_commit_left, journal->j_len + 2);
+	jl->j_realblock = NULL;
+
+	/* The ENTIRE FOR LOOP MUST not cause schedule to occur.
+	 **  for each real block, add it to the journal list hash,
+	 ** copy into real block index array in the commit or desc block
+	 */
+	trans_half = journal_trans_half(p_s_sb->s_blocksize);
+	for (i = 0, cn = journal->j_first; cn; cn = cn->next, i++) {
+		if (buffer_journaled(cn->bh)) {
+			jl_cn = get_cnode(p_s_sb);
+			if (!jl_cn) {
+				reiserfs_panic(p_s_sb,
+					       "journal-1676, get_cnode returned NULL\n");
+			}
+			if (i == 0) {
+				jl->j_realblock = jl_cn;
+			}
+			jl_cn->prev = last_cn;
+			jl_cn->next = NULL;
+			if (last_cn) {
+				last_cn->next = jl_cn;
+			}
+			last_cn = jl_cn;
+			/* make sure the block we are trying to log is not a block 
+			   of journal or reserved area */
+
+			if (is_block_in_log_or_reserved_area
+			    (p_s_sb, cn->bh->b_blocknr)) {
+				reiserfs_panic(p_s_sb,
+					       "journal-2332: Trying to log block %lu, which is a log block\n",
+					       cn->bh->b_blocknr);
+			}
+			jl_cn->blocknr = cn->bh->b_blocknr;
+			jl_cn->state = 0;
+			jl_cn->sb = p_s_sb;
+			jl_cn->bh = cn->bh;
+			jl_cn->jlist = jl;
+			insert_journal_hash(journal->j_list_hash_table, jl_cn);
+			if (i < trans_half) {
+				desc->j_realblock[i] =
+				    cpu_to_le32(cn->bh->b_blocknr);
+			} else {
+				commit->j_realblock[i - trans_half] =
+				    cpu_to_le32(cn->bh->b_blocknr);
+			}
+		} else {
+			i--;
+		}
+	}
+	set_desc_trans_len(desc, journal->j_len);
+	set_desc_mount_id(desc, journal->j_mount_id);
+	set_desc_trans_id(desc, journal->j_trans_id);
+	set_commit_trans_len(commit, journal->j_len);
+
+	/* special check in case all buffers in the journal were marked for not logging */
+	if (journal->j_len == 0) {
+		BUG();
+	}
+
+	/* we're about to dirty all the log blocks, mark the description block
+	 * dirty now too.  Don't mark the commit block dirty until all the
+	 * others are on disk
+	 */
+	mark_buffer_dirty(d_bh);
+
+	/* first data block is j_start + 1, so add one to cur_write_start wherever you use it */
+	cur_write_start = journal->j_start;
+	cn = journal->j_first;
+	jindex = 1;		/* start at one so we don't get the desc again */
+	while (cn) {
+		clear_buffer_journal_new(cn->bh);
+		/* copy all the real blocks into log area.  dirty log blocks */
+		if (buffer_journaled(cn->bh)) {
+			struct buffer_head *tmp_bh;
+			char *addr;
+			struct page *page;
+			tmp_bh =
+			    journal_getblk(p_s_sb,
+					   SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) +
+					   ((cur_write_start +
+					     jindex) %
+					    SB_ONDISK_JOURNAL_SIZE(p_s_sb)));
+			set_buffer_uptodate(tmp_bh);
+			page = cn->bh->b_page;
+			addr = kmap(page);
+			memcpy(tmp_bh->b_data,
+			       addr + offset_in_page(cn->bh->b_data),
+			       cn->bh->b_size);
+			kunmap(page);
+			mark_buffer_dirty(tmp_bh);
+			jindex++;
+			set_buffer_journal_dirty(cn->bh);
+			clear_buffer_journaled(cn->bh);
+		} else {
+			/* JDirty cleared sometime during transaction.  don't log this one */
+			reiserfs_warning(p_s_sb,
+					 "journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!");
+			brelse(cn->bh);
+		}
+		next = cn->next;
+		free_cnode(p_s_sb, cn);
+		cn = next;
+		cond_resched();
+	}
+
+	/* we are done  with both the c_bh and d_bh, but
+	 ** c_bh must be written after all other commit blocks,
+	 ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
+	 */
+
+	journal->j_current_jl = alloc_journal_list(p_s_sb);
+
+	/* now it is safe to insert this transaction on the main list */
+	list_add_tail(&jl->j_list, &journal->j_journal_list);
+	list_add_tail(&jl->j_working_list, &journal->j_working_list);
+	journal->j_num_work_lists++;
+
+	/* reset journal values for the next transaction */
+	old_start = journal->j_start;
+	journal->j_start =
+	    (journal->j_start + journal->j_len +
+	     2) % SB_ONDISK_JOURNAL_SIZE(p_s_sb);
+	atomic_set(&(journal->j_wcount), 0);
+	journal->j_bcount = 0;
+	journal->j_last = NULL;
+	journal->j_first = NULL;
+	journal->j_len = 0;
+	journal->j_trans_start_time = 0;
+	journal->j_trans_id++;
+	journal->j_current_jl->j_trans_id = journal->j_trans_id;
+	journal->j_must_wait = 0;
+	journal->j_len_alloc = 0;
+	journal->j_next_full_flush = 0;
+	journal->j_next_async_flush = 0;
+	init_journal_hash(p_s_sb);
+
+	// make sure reiserfs_add_jh sees the new current_jl before we
+	// write out the tails
+	smp_mb();
+
+	/* tail conversion targets have to hit the disk before we end the
+	 * transaction.  Otherwise a later transaction might repack the tail
+	 * before this transaction commits, leaving the data block unflushed and
+	 * clean, if we crash before the later transaction commits, the data block
+	 * is lost.
+	 */
+	if (!list_empty(&jl->j_tail_bh_list)) {
+		unlock_kernel();
+		write_ordered_buffers(&journal->j_dirty_buffers_lock,
+				      journal, jl, &jl->j_tail_bh_list);
+		lock_kernel();
+	}
+	if (!list_empty(&jl->j_tail_bh_list))
+		BUG();
+	up(&jl->j_commit_lock);
+
+	/* honor the flush wishes from the caller, simple commits can
+	 ** be done outside the journal lock, they are done below
+	 **
+	 ** if we don't flush the commit list right now, we put it into
+	 ** the work queue so the people waiting on the async progress work
+	 ** queue don't wait for this proc to flush journal lists and such.
+	 */
+	if (flush) {
+		flush_commit_list(p_s_sb, jl, 1);
+		flush_journal_list(p_s_sb, jl, 1);
+	} else if (!(jl->j_state & LIST_COMMIT_PENDING))
+		queue_delayed_work(commit_wq, &journal->j_work, HZ / 10);
+
+	/* if the next transaction has any chance of wrapping, flush 
+	 ** transactions that might get overwritten.  If any journal lists are very 
+	 ** old flush them as well.  
+	 */
+      first_jl:
+	list_for_each_safe(entry, safe, &journal->j_journal_list) {
+		temp_jl = JOURNAL_LIST_ENTRY(entry);
+		if (journal->j_start <= temp_jl->j_start) {
+			if ((journal->j_start + journal->j_trans_max + 1) >=
+			    temp_jl->j_start) {
+				flush_used_journal_lists(p_s_sb, temp_jl);
+				goto first_jl;
+			} else if ((journal->j_start +
+				    journal->j_trans_max + 1) <
+				   SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
+				/* if we don't cross into the next transaction and we don't
+				 * wrap, there is no way we can overlap any later transactions
+				 * break now
+				 */
+				break;
+			}
+		} else if ((journal->j_start +
+			    journal->j_trans_max + 1) >
+			   SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
+			if (((journal->j_start + journal->j_trans_max + 1) %
+			     SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >=
+			    temp_jl->j_start) {
+				flush_used_journal_lists(p_s_sb, temp_jl);
+				goto first_jl;
+			} else {
+				/* we don't overlap anything from out start to the end of the
+				 * log, and our wrapped portion doesn't overlap anything at
+				 * the start of the log.  We can break
+				 */
+				break;
+			}
+		}
+	}
+	flush_old_journal_lists(p_s_sb);
+
+	journal->j_current_jl->j_list_bitmap =
+	    get_list_bitmap(p_s_sb, journal->j_current_jl);
+
+	if (!(journal->j_current_jl->j_list_bitmap)) {
+		reiserfs_panic(p_s_sb,
+			       "journal-1996: do_journal_end, could not get a list bitmap\n");
+	}
+
+	atomic_set(&(journal->j_jlock), 0);
+	unlock_journal(p_s_sb);
+	/* wake up any body waiting to join. */
+	clear_bit(J_WRITERS_QUEUED, &journal->j_state);
+	wake_up(&(journal->j_join_wait));
+
+	if (!flush && wait_on_commit &&
+	    journal_list_still_alive(p_s_sb, commit_trans_id)) {
+		flush_commit_list(p_s_sb, jl, 1);
+	}
+      out:
+	reiserfs_check_lock_depth(p_s_sb, "journal end2");
+
+	memset(th, 0, sizeof(*th));
+	/* Re-set th->t_super, so we can properly keep track of how many
+	 * persistent transactions there are. We need to do this so if this
+	 * call is part of a failed restart_transaction, we can free it later */
+	th->t_super = p_s_sb;
+
+	return journal->j_errno;
+}
+
+static void __reiserfs_journal_abort_hard(struct super_block *sb)
+{
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	if (test_bit(J_ABORTED, &journal->j_state))
+		return;
+
+	printk(KERN_CRIT "REISERFS: Aborting journal for filesystem on %s\n",
+	       reiserfs_bdevname(sb));
+
+	sb->s_flags |= MS_RDONLY;
+	set_bit(J_ABORTED, &journal->j_state);
 
 #ifdef CONFIG_REISERFS_CHECK
-    dump_stack();
+	dump_stack();
 #endif
 }
 
-static void
-__reiserfs_journal_abort_soft (struct super_block *sb, int errno)
+static void __reiserfs_journal_abort_soft(struct super_block *sb, int errno)
 {
-    struct reiserfs_journal *journal = SB_JOURNAL (sb);
-    if (test_bit (J_ABORTED, &journal->j_state))
-        return;
+	struct reiserfs_journal *journal = SB_JOURNAL(sb);
+	if (test_bit(J_ABORTED, &journal->j_state))
+		return;
 
-    if (!journal->j_errno)
-        journal->j_errno = errno;
+	if (!journal->j_errno)
+		journal->j_errno = errno;
 
-    __reiserfs_journal_abort_hard (sb);
+	__reiserfs_journal_abort_hard(sb);
 }
 
-void
-reiserfs_journal_abort (struct super_block *sb, int errno)
+void reiserfs_journal_abort(struct super_block *sb, int errno)
 {
-    return __reiserfs_journal_abort_soft (sb, errno);
+	return __reiserfs_journal_abort_soft(sb, errno);
 }
diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c
index 2406608fc5cd..2533c1f64aba 100644
--- a/fs/reiserfs/lbalance.c
+++ b/fs/reiserfs/lbalance.c
@@ -21,648 +21,709 @@
    leaf_paste_entries
    */
 
-
 /* copy copy_count entries from source directory item to dest buffer (creating new item if needed) */
-static void leaf_copy_dir_entries (struct buffer_info * dest_bi, struct buffer_head * source, 
-				   int last_first, int item_num, int from, int copy_count)
+static void leaf_copy_dir_entries(struct buffer_info *dest_bi,
+				  struct buffer_head *source, int last_first,
+				  int item_num, int from, int copy_count)
 {
-    struct buffer_head * dest = dest_bi->bi_bh;
-    int item_num_in_dest;		/* either the number of target item,
-					   or if we must create a new item,
-					   the number of the item we will
-					   create it next to */
-    struct item_head * ih;
-    struct reiserfs_de_head * deh;
-    int copy_records_len;			/* length of all records in item to be copied */
-    char * records;
-
-    ih = B_N_PITEM_HEAD (source, item_num);
-
-    RFALSE( !is_direntry_le_ih (ih), "vs-10000: item must be directory item");
-
-    /* length of all record to be copied and first byte of the last of them */
-    deh = B_I_DEH (source, ih);
-    if (copy_count) {
-	copy_records_len = (from ? deh_location( &(deh[from - 1]) ) :
-            ih_item_len(ih)) - deh_location( &(deh[from + copy_count - 1]));
-	records = source->b_data + ih_location(ih) +
-                                deh_location( &(deh[from + copy_count - 1]));
-    } else {
-	copy_records_len = 0;
-	records = NULL;
-    }
-
-    /* when copy last to first, dest buffer can contain 0 items */
-    item_num_in_dest = (last_first == LAST_TO_FIRST) ? (( B_NR_ITEMS(dest) ) ? 0 : -1) : (B_NR_ITEMS(dest) - 1);
-
-    /* if there are no items in dest or the first/last item in dest is not item of the same directory */
-    if ( (item_num_in_dest == - 1) ||
-	(last_first == FIRST_TO_LAST && le_ih_k_offset (ih) == DOT_OFFSET) ||
-	    (last_first == LAST_TO_FIRST && comp_short_le_keys/*COMP_SHORT_KEYS*/ (&ih->ih_key, B_N_PKEY (dest, item_num_in_dest)))) {
-	/* create new item in dest */
-	struct item_head new_ih;
-
-	/* form item header */
-	memcpy (&new_ih.ih_key, &ih->ih_key, KEY_SIZE);
-	put_ih_version( &new_ih, KEY_FORMAT_3_5 );
-	/* calculate item len */
-	put_ih_item_len( &new_ih, DEH_SIZE * copy_count + copy_records_len );
-	put_ih_entry_count( &new_ih, 0 );
-    
-	if (last_first == LAST_TO_FIRST) {
-	    /* form key by the following way */
-	    if (from < I_ENTRY_COUNT(ih)) {
-		set_le_ih_k_offset( &new_ih, deh_offset( &(deh[from]) ) );
-		/*memcpy (&new_ih.ih_key.k_offset, &deh[from].deh_offset, SHORT_KEY_SIZE);*/
-	    } else {
-		/* no entries will be copied to this item in this function */
-		set_le_ih_k_offset (&new_ih, U32_MAX);
-		/* this item is not yet valid, but we want I_IS_DIRECTORY_ITEM to return 1 for it, so we -1 */
-	    }
-	    set_le_key_k_type (KEY_FORMAT_3_5, &(new_ih.ih_key), TYPE_DIRENTRY);
+	struct buffer_head *dest = dest_bi->bi_bh;
+	int item_num_in_dest;	/* either the number of target item,
+				   or if we must create a new item,
+				   the number of the item we will
+				   create it next to */
+	struct item_head *ih;
+	struct reiserfs_de_head *deh;
+	int copy_records_len;	/* length of all records in item to be copied */
+	char *records;
+
+	ih = B_N_PITEM_HEAD(source, item_num);
+
+	RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item");
+
+	/* length of all records to be copied and the first byte of the last of them */
+	deh = B_I_DEH(source, ih);
+	if (copy_count) {
+		copy_records_len = (from ? deh_location(&(deh[from - 1])) :
+				    ih_item_len(ih)) -
+		    deh_location(&(deh[from + copy_count - 1]));
+		records =
+		    source->b_data + ih_location(ih) +
+		    deh_location(&(deh[from + copy_count - 1]));
+	} else {
+		copy_records_len = 0;
+		records = NULL;
+	}
+
+	/* when copying last to first, the dest buffer can contain 0 items */
+	item_num_in_dest =
+	    (last_first ==
+	     LAST_TO_FIRST) ? ((B_NR_ITEMS(dest)) ? 0 : -1) : (B_NR_ITEMS(dest)
+							       - 1);
+
+	/* if there are no items in dest or the first/last item in dest is not an item of the same directory */
+	if ((item_num_in_dest == -1) ||
+	    (last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) ||
+	    (last_first == LAST_TO_FIRST
+	     && comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key,
+							 B_N_PKEY(dest,
+								  item_num_in_dest))))
+	{
+		/* create new item in dest */
+		struct item_head new_ih;
+
+		/* form item header */
+		memcpy(&new_ih.ih_key, &ih->ih_key, KEY_SIZE);
+		put_ih_version(&new_ih, KEY_FORMAT_3_5);
+		/* calculate item len */
+		put_ih_item_len(&new_ih,
+				DEH_SIZE * copy_count + copy_records_len);
+		put_ih_entry_count(&new_ih, 0);
+
+		if (last_first == LAST_TO_FIRST) {
+			/* form the key in the following way */
+			if (from < I_ENTRY_COUNT(ih)) {
+				set_le_ih_k_offset(&new_ih,
+						   deh_offset(&(deh[from])));
+				/*memcpy (&new_ih.ih_key.k_offset, &deh[from].deh_offset, SHORT_KEY_SIZE); */
+			} else {
+				/* no entries will be copied to this item in this function */
+				set_le_ih_k_offset(&new_ih, U32_MAX);
+				/* this item is not yet valid, but we want I_IS_DIRECTORY_ITEM to return 1 for it, so the offset is set to U32_MAX (i.e. -1) */
+			}
+			set_le_key_k_type(KEY_FORMAT_3_5, &(new_ih.ih_key),
+					  TYPE_DIRENTRY);
+		}
+
+		/* insert item into dest buffer */
+		leaf_insert_into_buf(dest_bi,
+				     (last_first ==
+				      LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest),
+				     &new_ih, NULL, 0);
+	} else {
+		/* prepare space for entries */
+		leaf_paste_in_buffer(dest_bi,
+				     (last_first ==
+				      FIRST_TO_LAST) ? (B_NR_ITEMS(dest) -
+							1) : 0, MAX_US_INT,
+				     DEH_SIZE * copy_count + copy_records_len,
+				     records, 0);
 	}
-    
-	/* insert item into dest buffer */
-	leaf_insert_into_buf (dest_bi, (last_first == LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest), &new_ih, NULL, 0);
-    } else {
-	/* prepare space for entries */
-	leaf_paste_in_buffer (dest_bi, (last_first==FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0, MAX_US_INT,
-			      DEH_SIZE * copy_count + copy_records_len, records, 0
-	    );
-    }
-  
-    item_num_in_dest = (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest)-1) : 0;
-    
-    leaf_paste_entries (dest_bi->bi_bh, item_num_in_dest,
-			(last_first == FIRST_TO_LAST) ? I_ENTRY_COUNT(B_N_PITEM_HEAD (dest, item_num_in_dest)) : 0,
-			copy_count, deh + from, records,
-			DEH_SIZE * copy_count + copy_records_len
-	);
-}
 
+	item_num_in_dest =
+	    (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0;
+
+	leaf_paste_entries(dest_bi->bi_bh, item_num_in_dest,
+			   (last_first ==
+			    FIRST_TO_LAST) ? I_ENTRY_COUNT(B_N_PITEM_HEAD(dest,
+									  item_num_in_dest))
+			   : 0, copy_count, deh + from, records,
+			   DEH_SIZE * copy_count + copy_records_len);
+}
 
 /* Copy the first (if last_first == FIRST_TO_LAST) or last (last_first == LAST_TO_FIRST) item or 
    part of it or nothing (see the return 0 below) from SOURCE to the end 
    (if last_first) or beginning (!last_first) of the DEST */
 /* returns 1 if anything was copied, else 0 */
-static int leaf_copy_boundary_item (struct buffer_info * dest_bi, struct buffer_head * src, int last_first,
-				    int bytes_or_entries)
+static int leaf_copy_boundary_item(struct buffer_info *dest_bi,
+				   struct buffer_head *src, int last_first,
+				   int bytes_or_entries)
 {
-  struct buffer_head * dest = dest_bi->bi_bh;
-  int dest_nr_item, src_nr_item; /* number of items in the source and destination buffers */
-  struct item_head * ih;
-  struct item_head * dih;
-  
-  dest_nr_item = B_NR_ITEMS(dest);
-  
-  if ( last_first == FIRST_TO_LAST ) {
-    /* if ( DEST is empty or first item of SOURCE and last item of DEST are the items of different objects
-       or of different types ) then there is no need to treat this item differently from the other items
-       that we copy, so we return */
-    ih = B_N_PITEM_HEAD (src, 0);
-    dih = B_N_PITEM_HEAD (dest, dest_nr_item - 1);
-    if (!dest_nr_item || (!op_is_left_mergeable (&(ih->ih_key), src->b_size)))
-      /* there is nothing to merge */
-      return 0;
-      
-    RFALSE( ! ih_item_len(ih), "vs-10010: item can not have empty length");
-      
-    if ( is_direntry_le_ih (ih) ) {
-      if ( bytes_or_entries == -1 )
-	/* copy all entries to dest */
-	bytes_or_entries = ih_entry_count(ih);
-      leaf_copy_dir_entries (dest_bi, src, FIRST_TO_LAST, 0, 0, bytes_or_entries);
-      return 1;
-    }
-      
-    /* copy part of the body of the first item of SOURCE to the end of the body of the last item of the DEST
-       part defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body; don't create new item header
-       */
-    if ( bytes_or_entries == -1 )
-      bytes_or_entries = ih_item_len(ih);
+	struct buffer_head *dest = dest_bi->bi_bh;
+	int dest_nr_item, src_nr_item;	/* number of items in the source and destination buffers */
+	struct item_head *ih;
+	struct item_head *dih;
+
+	dest_nr_item = B_NR_ITEMS(dest);
+
+	if (last_first == FIRST_TO_LAST) {
+		/* if ( DEST is empty or first item of SOURCE and last item of DEST are the items of different objects
+		   or of different types ) then there is no need to treat this item differently from the other items
+		   that we copy, so we return */
+		ih = B_N_PITEM_HEAD(src, 0);
+		dih = B_N_PITEM_HEAD(dest, dest_nr_item - 1);
+		if (!dest_nr_item
+		    || (!op_is_left_mergeable(&(ih->ih_key), src->b_size)))
+			/* there is nothing to merge */
+			return 0;
+
+		RFALSE(!ih_item_len(ih),
+		       "vs-10010: item can not have empty length");
+
+		if (is_direntry_le_ih(ih)) {
+			if (bytes_or_entries == -1)
+				/* copy all entries to dest */
+				bytes_or_entries = ih_entry_count(ih);
+			leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, 0, 0,
+					      bytes_or_entries);
+			return 1;
+		}
+
+		/* copy part of the body of the first item of SOURCE to the end of the body of the last item of the DEST;
+		   the part is defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy the whole body; don't create a new item header
+		 */
+		if (bytes_or_entries == -1)
+			bytes_or_entries = ih_item_len(ih);
 
 #ifdef CONFIG_REISERFS_CHECK
-    else {
-      if (bytes_or_entries == ih_item_len(ih) && is_indirect_le_ih(ih))
-	if (get_ih_free_space (ih))
-	  reiserfs_panic (NULL, "vs-10020: leaf_copy_boundary_item: "
-			  "last unformatted node must be filled entirely (%h)",
-			  ih);
-    }
+		else {
+			if (bytes_or_entries == ih_item_len(ih)
+			    && is_indirect_le_ih(ih))
+				if (get_ih_free_space(ih))
+					reiserfs_panic(NULL,
+						       "vs-10020: leaf_copy_boundary_item: "
+						       "last unformatted node must be filled entirely (%h)",
+						       ih);
+		}
 #endif
-      
-    /* merge first item (or its part) of src buffer with the last
-       item of dest buffer. Both are of the same file */
-    leaf_paste_in_buffer (dest_bi,
-			  dest_nr_item - 1, ih_item_len(dih), bytes_or_entries, B_I_PITEM(src,ih), 0
-			  );
-      
-    if (is_indirect_le_ih (dih)) {
-      RFALSE( get_ih_free_space (dih),
-              "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space",
-              ih);
-      if (bytes_or_entries == ih_item_len(ih))
-	set_ih_free_space (dih, get_ih_free_space (ih));
-    }
-    
-    return 1;
-  }
-  
-
-  /* copy boundary item to right (last_first == LAST_TO_FIRST) */
-
-  /* ( DEST is empty or last item of SOURCE and first item of DEST
-     are the items of different object or of different types )
-     */
-  src_nr_item = B_NR_ITEMS (src);
-  ih = B_N_PITEM_HEAD (src, src_nr_item - 1);
-  dih = B_N_PITEM_HEAD (dest, 0);
-
-  if (!dest_nr_item || !op_is_left_mergeable (&(dih->ih_key), src->b_size))
-    return 0;
-  
-  if ( is_direntry_le_ih (ih)) {
-    if ( bytes_or_entries == -1 )
-      /* bytes_or_entries = entries number in last item body of SOURCE */
-      bytes_or_entries = ih_entry_count(ih);
-    
-    leaf_copy_dir_entries (dest_bi, src, LAST_TO_FIRST, src_nr_item - 1, ih_entry_count(ih) - bytes_or_entries, bytes_or_entries);
-    return 1;
-  }
-
-  /* copy part of the body of the last item of SOURCE to the begin of the body of the first item of the DEST;
-     part defined by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body; change first item key of the DEST;
-     don't create new item header
-     */
-  
-  RFALSE( is_indirect_le_ih(ih) && get_ih_free_space (ih),
-          "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)",
-		    ih);
-
-  if ( bytes_or_entries == -1 ) {
-    /* bytes_or_entries = length of last item body of SOURCE */
-    bytes_or_entries = ih_item_len(ih);
-
-    RFALSE( le_ih_k_offset (dih) !=
-            le_ih_k_offset (ih) + op_bytes_number (ih, src->b_size),
-            "vs-10050: items %h and %h do not match", ih, dih);
-
-    /* change first item key of the DEST */
-    set_le_ih_k_offset (dih, le_ih_k_offset (ih));
-
-    /* item becomes non-mergeable */
-    /* or mergeable if left item was */
-    set_le_ih_k_type (dih, le_ih_k_type (ih));
-  } else {
-    /* merge to right only part of item */
-    RFALSE( ih_item_len(ih) <= bytes_or_entries,
-            "vs-10060: no so much bytes %lu (needed %lu)",
-            ( unsigned long )ih_item_len(ih), ( unsigned long )bytes_or_entries);
-    
-    /* change first item key of the DEST */
-    if ( is_direct_le_ih (dih) ) {
-      RFALSE( le_ih_k_offset (dih) <= (unsigned long)bytes_or_entries,
-	      "vs-10070: dih %h, bytes_or_entries(%d)", dih, bytes_or_entries);
-      set_le_ih_k_offset (dih, le_ih_k_offset (dih) - bytes_or_entries);
-    } else {
-      RFALSE( le_ih_k_offset (dih) <=
-              (bytes_or_entries / UNFM_P_SIZE) * dest->b_size,
-              "vs-10080: dih %h, bytes_or_entries(%d)",
-              dih, (bytes_or_entries/UNFM_P_SIZE)*dest->b_size);
-      set_le_ih_k_offset (dih, le_ih_k_offset (dih) - ((bytes_or_entries / UNFM_P_SIZE) * dest->b_size));
-    }
-  }
-  
-  leaf_paste_in_buffer (dest_bi, 0, 0, bytes_or_entries, B_I_PITEM(src,ih) + ih_item_len(ih) - bytes_or_entries, 0);
-  return 1;
-}
 
+		/* merge first item (or its part) of src buffer with the last
+		   item of dest buffer. Both are of the same file */
+		leaf_paste_in_buffer(dest_bi,
+				     dest_nr_item - 1, ih_item_len(dih),
+				     bytes_or_entries, B_I_PITEM(src, ih), 0);
+
+		if (is_indirect_le_ih(dih)) {
+			RFALSE(get_ih_free_space(dih),
+			       "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space",
+			       ih);
+			if (bytes_or_entries == ih_item_len(ih))
+				set_ih_free_space(dih, get_ih_free_space(ih));
+		}
+
+		return 1;
+	}
+
+	/* copy boundary item to right (last_first == LAST_TO_FIRST) */
+
+	/* ( DEST is empty or last item of SOURCE and first item of DEST
+	   are the items of different object or of different types )
+	 */
+	src_nr_item = B_NR_ITEMS(src);
+	ih = B_N_PITEM_HEAD(src, src_nr_item - 1);
+	dih = B_N_PITEM_HEAD(dest, 0);
+
+	if (!dest_nr_item || !op_is_left_mergeable(&(dih->ih_key), src->b_size))
+		return 0;
+
+	if (is_direntry_le_ih(ih)) {
+		if (bytes_or_entries == -1)
+			/* bytes_or_entries = entries number in last item body of SOURCE */
+			bytes_or_entries = ih_entry_count(ih);
+
+		leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
+				      src_nr_item - 1,
+				      ih_entry_count(ih) - bytes_or_entries,
+				      bytes_or_entries);
+		return 1;
+	}
+
+	/* copy part of the body of the last item of SOURCE to the beginning of the body of the first item of the DEST;
+	   the part is defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy the whole body; change the first item key of the DEST;
+	   don't create a new item header
+	 */
+
+	RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih),
+	       "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)",
+	       ih);
+
+	if (bytes_or_entries == -1) {
+		/* bytes_or_entries = length of last item body of SOURCE */
+		bytes_or_entries = ih_item_len(ih);
+
+		RFALSE(le_ih_k_offset(dih) !=
+		       le_ih_k_offset(ih) + op_bytes_number(ih, src->b_size),
+		       "vs-10050: items %h and %h do not match", ih, dih);
+
+		/* change first item key of the DEST */
+		set_le_ih_k_offset(dih, le_ih_k_offset(ih));
+
+		/* item becomes non-mergeable */
+		/* or mergeable if left item was */
+		set_le_ih_k_type(dih, le_ih_k_type(ih));
+	} else {
+		/* merge to right only part of item */
+		RFALSE(ih_item_len(ih) <= bytes_or_entries,
+		       "vs-10060: no so much bytes %lu (needed %lu)",
+		       (unsigned long)ih_item_len(ih),
+		       (unsigned long)bytes_or_entries);
+
+		/* change first item key of the DEST */
+		if (is_direct_le_ih(dih)) {
+			RFALSE(le_ih_k_offset(dih) <=
+			       (unsigned long)bytes_or_entries,
+			       "vs-10070: dih %h, bytes_or_entries(%d)", dih,
+			       bytes_or_entries);
+			set_le_ih_k_offset(dih,
+					   le_ih_k_offset(dih) -
+					   bytes_or_entries);
+		} else {
+			RFALSE(le_ih_k_offset(dih) <=
+			       (bytes_or_entries / UNFM_P_SIZE) * dest->b_size,
+			       "vs-10080: dih %h, bytes_or_entries(%d)",
+			       dih,
+			       (bytes_or_entries / UNFM_P_SIZE) * dest->b_size);
+			set_le_ih_k_offset(dih,
+					   le_ih_k_offset(dih) -
+					   ((bytes_or_entries / UNFM_P_SIZE) *
+					    dest->b_size));
+		}
+	}
+
+	leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries,
+			     B_I_PITEM(src,
+				       ih) + ih_item_len(ih) - bytes_or_entries,
+			     0);
+	return 1;
+}
 
 /* copy cpy_mun items from buffer src to buffer dest
  * last_first == FIRST_TO_LAST means, that we copy cpy_num  items beginning from first-th item in src to tail of dest
  * last_first == LAST_TO_FIRST means, that we copy cpy_num  items beginning from first-th item in src to head of dest
  */
-static void leaf_copy_items_entirely (struct buffer_info * dest_bi, struct buffer_head * src, int last_first,
-				      int first, int cpy_num)
+static void leaf_copy_items_entirely(struct buffer_info *dest_bi,
+				     struct buffer_head *src, int last_first,
+				     int first, int cpy_num)
 {
-    struct buffer_head * dest;
-    int nr, free_space;
-    int dest_before;
-    int last_loc, last_inserted_loc, location;
-    int i, j;
-    struct block_head * blkh;
-    struct item_head * ih;
-
-    RFALSE( last_first != LAST_TO_FIRST  && last_first != FIRST_TO_LAST,
-	    "vs-10090: bad last_first parameter %d", last_first);
-    RFALSE( B_NR_ITEMS (src) - first < cpy_num,
-	    "vs-10100: too few items in source %d, required %d from %d",
-	    B_NR_ITEMS(src), cpy_num, first);
-    RFALSE( cpy_num < 0, "vs-10110: can not copy negative amount of items");
-    RFALSE( ! dest_bi, "vs-10120: can not copy negative amount of items");
-
-    dest = dest_bi->bi_bh;
-
-    RFALSE( ! dest, "vs-10130: can not copy negative amount of items");
-
-    if (cpy_num == 0)
-	return;
-
-    blkh = B_BLK_HEAD(dest);
-    nr = blkh_nr_item( blkh );
-    free_space = blkh_free_space(blkh);
-  
-    /* we will insert items before 0-th or nr-th item in dest buffer. It depends of last_first parameter */
-    dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr;
-
-    /* location of head of first new item */
-    ih = B_N_PITEM_HEAD (dest, dest_before);
-
-    RFALSE( blkh_free_space(blkh) < cpy_num * IH_SIZE,
-            "vs-10140: not enough free space for headers %d (needed %d)",
-            B_FREE_SPACE (dest), cpy_num * IH_SIZE);
-
-    /* prepare space for headers */
-    memmove (ih + cpy_num, ih, (nr-dest_before) * IH_SIZE);
-
-    /* copy item headers */
-    memcpy (ih, B_N_PITEM_HEAD (src, first), cpy_num * IH_SIZE);
-
-    free_space -= (IH_SIZE * cpy_num);
-    set_blkh_free_space( blkh, free_space );
-
-    /* location of unmovable item */
-    j = location = (dest_before == 0) ? dest->b_size : ih_location(ih-1);
-    for (i = dest_before; i < nr + cpy_num; i ++) {
-        location -= ih_item_len( ih + i - dest_before );
-        put_ih_location( ih + i - dest_before, location );
-    }
-
-    /* prepare space for items */
-    last_loc = ih_location( &(ih[nr+cpy_num-1-dest_before]) );
-    last_inserted_loc = ih_location( &(ih[cpy_num-1]) );
-
-    /* check free space */
-    RFALSE( free_space < j - last_inserted_loc,
-	    "vs-10150: not enough free space for items %d (needed %d)",
-            free_space, j - last_inserted_loc);
-
-    memmove (dest->b_data + last_loc,
-	     dest->b_data + last_loc + j - last_inserted_loc,
-	     last_inserted_loc - last_loc);
-
-    /* copy items */
-    memcpy (dest->b_data + last_inserted_loc, B_N_PITEM(src,(first + cpy_num - 1)),
-	    j - last_inserted_loc);
-
-    /* sizes, item number */
-    set_blkh_nr_item( blkh, nr + cpy_num );
-    set_blkh_free_space( blkh, free_space - (j - last_inserted_loc) );
-
-    do_balance_mark_leaf_dirty (dest_bi->tb, dest, 0);
-
-    if (dest_bi->bi_parent) {
-	struct disk_child *t_dc;
-	t_dc = B_N_CHILD (dest_bi->bi_parent, dest_bi->bi_position);
-	RFALSE( dc_block_number(t_dc) != dest->b_blocknr,
-	        "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu",
-                ( long unsigned ) dest->b_blocknr, 
-		( long unsigned ) dc_block_number(t_dc));
-	put_dc_size( t_dc, dc_size(t_dc) + (j - last_inserted_loc + IH_SIZE * cpy_num ) );
-    
-	do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent, 0);
-    }
-}
+	struct buffer_head *dest;
+	int nr, free_space;
+	int dest_before;
+	int last_loc, last_inserted_loc, location;
+	int i, j;
+	struct block_head *blkh;
+	struct item_head *ih;
+
+	RFALSE(last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST,
+	       "vs-10090: bad last_first parameter %d", last_first);
+	RFALSE(B_NR_ITEMS(src) - first < cpy_num,
+	       "vs-10100: too few items in source %d, required %d from %d",
+	       B_NR_ITEMS(src), cpy_num, first);
+	RFALSE(cpy_num < 0, "vs-10110: can not copy negative amount of items");
+	RFALSE(!dest_bi, "vs-10120: can not copy negative amount of items");
+
+	dest = dest_bi->bi_bh;
+
+	RFALSE(!dest, "vs-10130: can not copy negative amount of items");
+
+	if (cpy_num == 0)
+		return;
+
+	blkh = B_BLK_HEAD(dest);
+	nr = blkh_nr_item(blkh);
+	free_space = blkh_free_space(blkh);
+
+	/* we will insert items before the 0-th or nr-th item in the dest buffer, depending on the last_first parameter */
+	dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr;
+
+	/* location of head of first new item */
+	ih = B_N_PITEM_HEAD(dest, dest_before);
+
+	RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE,
+	       "vs-10140: not enough free space for headers %d (needed %d)",
+	       B_FREE_SPACE(dest), cpy_num * IH_SIZE);
+
+	/* prepare space for headers */
+	memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE);
 
+	/* copy item headers */
+	memcpy(ih, B_N_PITEM_HEAD(src, first), cpy_num * IH_SIZE);
+
+	free_space -= (IH_SIZE * cpy_num);
+	set_blkh_free_space(blkh, free_space);
+
+	/* location of unmovable item */
+	j = location = (dest_before == 0) ? dest->b_size : ih_location(ih - 1);
+	for (i = dest_before; i < nr + cpy_num; i++) {
+		location -= ih_item_len(ih + i - dest_before);
+		put_ih_location(ih + i - dest_before, location);
+	}
+
+	/* prepare space for items */
+	last_loc = ih_location(&(ih[nr + cpy_num - 1 - dest_before]));
+	last_inserted_loc = ih_location(&(ih[cpy_num - 1]));
+
+	/* check free space */
+	RFALSE(free_space < j - last_inserted_loc,
+	       "vs-10150: not enough free space for items %d (needed %d)",
+	       free_space, j - last_inserted_loc);
+
+	memmove(dest->b_data + last_loc,
+		dest->b_data + last_loc + j - last_inserted_loc,
+		last_inserted_loc - last_loc);
+
+	/* copy items */
+	memcpy(dest->b_data + last_inserted_loc,
+	       B_N_PITEM(src, (first + cpy_num - 1)), j - last_inserted_loc);
+
+	/* sizes, item number */
+	set_blkh_nr_item(blkh, nr + cpy_num);
+	set_blkh_free_space(blkh, free_space - (j - last_inserted_loc));
+
+	do_balance_mark_leaf_dirty(dest_bi->tb, dest, 0);
+
+	if (dest_bi->bi_parent) {
+		struct disk_child *t_dc;
+		t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position);
+		RFALSE(dc_block_number(t_dc) != dest->b_blocknr,
+		       "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu",
+		       (long unsigned)dest->b_blocknr,
+		       (long unsigned)dc_block_number(t_dc));
+		put_dc_size(t_dc,
+			    dc_size(t_dc) + (j - last_inserted_loc +
+					     IH_SIZE * cpy_num));
+
+		do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent,
+					       0);
+	}
+}
 
 /* This function splits the (liquid) item into two items (useful when
    shifting part of an item into another node.) */
-static void leaf_item_bottle (struct buffer_info * dest_bi, struct buffer_head * src, int last_first,
-			      int item_num, int cpy_bytes)
+static void leaf_item_bottle(struct buffer_info *dest_bi,
+			     struct buffer_head *src, int last_first,
+			     int item_num, int cpy_bytes)
 {
-    struct buffer_head * dest = dest_bi->bi_bh;
-    struct item_head * ih;
-  
-    RFALSE( cpy_bytes == -1, "vs-10170: bytes == - 1 means: do not split item");
-
-    if ( last_first == FIRST_TO_LAST ) {
-	/* if ( if item in position item_num in buffer SOURCE is directory item ) */
-	if (is_direntry_le_ih (ih = B_N_PITEM_HEAD(src,item_num)))
-	    leaf_copy_dir_entries (dest_bi, src, FIRST_TO_LAST, item_num, 0, cpy_bytes);
-	else {
-	    struct item_head n_ih;
-      
-	    /* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST 
-	       part defined by 'cpy_bytes'; create new item header; change old item_header (????);
-	       n_ih = new item_header;
-	    */
-	    memcpy (&n_ih, ih, IH_SIZE);
-	    put_ih_item_len( &n_ih, cpy_bytes );
-	    if (is_indirect_le_ih (ih)) {
-		RFALSE( cpy_bytes == ih_item_len(ih) && get_ih_free_space(ih),
-		        "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)",
-                        ( long unsigned ) get_ih_free_space (ih));
-		set_ih_free_space (&n_ih, 0);
-	    }
-
-	    RFALSE( op_is_left_mergeable (&(ih->ih_key), src->b_size),
-		    "vs-10190: bad mergeability of item %h", ih);
-	    n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
-	    leaf_insert_into_buf (dest_bi, B_NR_ITEMS(dest), &n_ih, B_N_PITEM (src, item_num), 0);
+	struct buffer_head *dest = dest_bi->bi_bh;
+	struct item_head *ih;
+
+	RFALSE(cpy_bytes == -1,
+	       "vs-10170: bytes == - 1 means: do not split item");
+
+	if (last_first == FIRST_TO_LAST) {
+		/* if the item in position item_num in buffer SOURCE is a directory item */
+		if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num)))
+			leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST,
+					      item_num, 0, cpy_bytes);
+		else {
+			struct item_head n_ih;
+
+			/* copy part of the body of item number 'item_num' of SOURCE to the end of the DEST;
+			   the part is defined by 'cpy_bytes'; create new item header; change old item_header (????);
+			   n_ih = new item_header;
+			 */
+			memcpy(&n_ih, ih, IH_SIZE);
+			put_ih_item_len(&n_ih, cpy_bytes);
+			if (is_indirect_le_ih(ih)) {
+				RFALSE(cpy_bytes == ih_item_len(ih)
+				       && get_ih_free_space(ih),
+				       "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)",
+				       (long unsigned)get_ih_free_space(ih));
+				set_ih_free_space(&n_ih, 0);
+			}
+
+			RFALSE(op_is_left_mergeable(&(ih->ih_key), src->b_size),
+			       "vs-10190: bad mergeability of item %h", ih);
+			n_ih.ih_version = ih->ih_version;	/* JDM Endian safe, both le */
+			leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih,
+					     B_N_PITEM(src, item_num), 0);
+		}
+	} else {
+		/* if the item in position item_num in buffer SOURCE is a directory item */
+		if (is_direntry_le_ih(ih = B_N_PITEM_HEAD(src, item_num)))
+			leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST,
+					      item_num,
+					      I_ENTRY_COUNT(ih) - cpy_bytes,
+					      cpy_bytes);
+		else {
+			struct item_head n_ih;
+
+			/* copy part of the body of item number 'item_num' of SOURCE to the beginning of the DEST;
+			   the part is defined by 'cpy_bytes'; create new item header;
+			   n_ih = new item_header;
+			 */
+			memcpy(&n_ih, ih, SHORT_KEY_SIZE);
+
+			n_ih.ih_version = ih->ih_version;	/* JDM Endian safe, both le */
+
+			if (is_direct_le_ih(ih)) {
+				set_le_ih_k_offset(&n_ih,
+						   le_ih_k_offset(ih) +
+						   ih_item_len(ih) - cpy_bytes);
+				set_le_ih_k_type(&n_ih, TYPE_DIRECT);
+				set_ih_free_space(&n_ih, MAX_US_INT);
+			} else {
+				/* indirect item */
+				RFALSE(!cpy_bytes && get_ih_free_space(ih),
+				       "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended");
+				set_le_ih_k_offset(&n_ih,
+						   le_ih_k_offset(ih) +
+						   (ih_item_len(ih) -
+						    cpy_bytes) / UNFM_P_SIZE *
+						   dest->b_size);
+				set_le_ih_k_type(&n_ih, TYPE_INDIRECT);
+				set_ih_free_space(&n_ih, get_ih_free_space(ih));
+			}
+
+			/* set item length */
+			put_ih_item_len(&n_ih, cpy_bytes);
+
+			n_ih.ih_version = ih->ih_version;	/* JDM Endian safe, both le */
+
+			leaf_insert_into_buf(dest_bi, 0, &n_ih,
+					     B_N_PITEM(src,
+						       item_num) +
+					     ih_item_len(ih) - cpy_bytes, 0);
+		}
 	}
-    } else {
-	/*  if ( if item in position item_num in buffer SOURCE is directory item ) */
-	if (is_direntry_le_ih(ih = B_N_PITEM_HEAD (src, item_num)))
-	    leaf_copy_dir_entries (dest_bi, src, LAST_TO_FIRST, item_num, I_ENTRY_COUNT(ih) - cpy_bytes, cpy_bytes);
-	else {
-	    struct item_head n_ih;
-      
-	    /* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST 
-	       part defined by 'cpy_bytes'; create new item header;
-	       n_ih = new item_header;
-	    */
-	    memcpy (&n_ih, ih, SHORT_KEY_SIZE);
-
-	    n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
-
-	    if (is_direct_le_ih (ih)) {
-		set_le_ih_k_offset (&n_ih, le_ih_k_offset (ih) + ih_item_len(ih) - cpy_bytes);
-		set_le_ih_k_type (&n_ih, TYPE_DIRECT);
-		set_ih_free_space (&n_ih, MAX_US_INT);
-	    } else {
-		/* indirect item */
-		RFALSE( !cpy_bytes && get_ih_free_space (ih),
-		        "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended");
-		set_le_ih_k_offset (&n_ih, le_ih_k_offset (ih) + (ih_item_len(ih) - cpy_bytes) / UNFM_P_SIZE * dest->b_size);
-		set_le_ih_k_type (&n_ih, TYPE_INDIRECT);
-		set_ih_free_space (&n_ih, get_ih_free_space (ih));
-	    }
-      
-	    /* set item length */
-	    put_ih_item_len( &n_ih, cpy_bytes );
-
-	    n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */
-
-	    leaf_insert_into_buf (dest_bi, 0, &n_ih, B_N_PITEM(src,item_num) + ih_item_len(ih) - cpy_bytes, 0);
-	}
-    }
 }
 
-
 /* If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE to DEST.
    If cpy_bytes not equal to minus one than copy cpy_num-1 whole items from SOURCE to DEST.
    From last item copy cpy_num bytes for regular item and cpy_num directory entries for
    directory item. */
-static int leaf_copy_items (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, int cpy_num,
-			    int cpy_bytes)
+static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src,
+			   int last_first, int cpy_num, int cpy_bytes)
 {
-  struct buffer_head * dest;
-  int pos, i, src_nr_item, bytes;
-
-  dest = dest_bi->bi_bh;
-  RFALSE( !dest || !src, "vs-10210: !dest || !src");
-  RFALSE( last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
-	  "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST");
-  RFALSE( B_NR_ITEMS(src) < cpy_num,
-	  "vs-10230: No enough items: %d, req. %d", B_NR_ITEMS(src), cpy_num);
-  RFALSE( cpy_num < 0,"vs-10240: cpy_num < 0 (%d)", cpy_num);
-
- if ( cpy_num == 0 )
-   return 0;
- 
- if ( last_first == FIRST_TO_LAST ) {
-   /* copy items to left */
-   pos = 0;
-   if ( cpy_num == 1 )
-     bytes = cpy_bytes;
-   else
-     bytes = -1;
-   
-   /* copy the first item or it part or nothing to the end of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) */
-   i = leaf_copy_boundary_item (dest_bi, src, FIRST_TO_LAST, bytes);
-   cpy_num -= i;
-   if ( cpy_num == 0 )
-     return i;
-   pos += i;
-   if ( cpy_bytes == -1 )
-     /* copy first cpy_num items starting from position 'pos' of SOURCE to end of DEST */
-     leaf_copy_items_entirely (dest_bi, src, FIRST_TO_LAST, pos, cpy_num);
-   else {
-     /* copy first cpy_num-1 items starting from position 'pos-1' of the SOURCE to the end of the DEST */
-     leaf_copy_items_entirely (dest_bi, src, FIRST_TO_LAST, pos, cpy_num-1);
-	     
-     /* copy part of the item which number is cpy_num+pos-1 to the end of the DEST */
-     leaf_item_bottle (dest_bi, src, FIRST_TO_LAST, cpy_num+pos-1, cpy_bytes);
-   } 
- } else {
-   /* copy items to right */
-   src_nr_item = B_NR_ITEMS (src);
-   if ( cpy_num == 1 )
-     bytes = cpy_bytes;
-   else
-     bytes = -1;
-   
-   /* copy the last item or it part or nothing to the begin of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); */
-   i = leaf_copy_boundary_item (dest_bi, src, LAST_TO_FIRST, bytes);
-   
-   cpy_num -= i;
-   if ( cpy_num == 0 )
-     return i;
-   
-   pos = src_nr_item - cpy_num - i;
-   if ( cpy_bytes == -1 ) {
-     /* starting from position 'pos' copy last cpy_num items of SOURCE to begin of DEST */
-     leaf_copy_items_entirely (dest_bi, src, LAST_TO_FIRST, pos, cpy_num);
-   } else {
-     /* copy last cpy_num-1 items starting from position 'pos+1' of the SOURCE to the begin of the DEST; */
-     leaf_copy_items_entirely (dest_bi, src, LAST_TO_FIRST, pos+1, cpy_num-1);
-
-     /* copy part of the item which number is pos to the begin of the DEST */
-     leaf_item_bottle (dest_bi, src, LAST_TO_FIRST, pos, cpy_bytes);
-   }
- }
- return i;
+	struct buffer_head *dest;
+	int pos, i, src_nr_item, bytes;
+
+	dest = dest_bi->bi_bh;
+	RFALSE(!dest || !src, "vs-10210: !dest || !src");
+	RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST,
+	       "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST");
+	RFALSE(B_NR_ITEMS(src) < cpy_num,
+	       "vs-10230: No enough items: %d, req. %d", B_NR_ITEMS(src),
+	       cpy_num);
+	RFALSE(cpy_num < 0, "vs-10240: cpy_num < 0 (%d)", cpy_num);
+
+	if (cpy_num == 0)
+		return 0;
+
+	if (last_first == FIRST_TO_LAST) {
+		/* copy items to left */
+		pos = 0;
+		if (cpy_num == 1)
+			bytes = cpy_bytes;
+		else
+			bytes = -1;
+
+		/* copy the first item or its part or nothing to the end of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) */
+		i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes);
+		cpy_num -= i;
+		if (cpy_num == 0)
+			return i;
+		pos += i;
+		if (cpy_bytes == -1)
+			/* copy first cpy_num items starting from position 'pos' of SOURCE to end of DEST */
+			leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
+						 pos, cpy_num);
+		else {
+			/* copy first cpy_num-1 items starting from position 'pos' of the SOURCE to the end of the DEST */
+			leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST,
+						 pos, cpy_num - 1);
+
+			/* copy part of the item which number is cpy_num+pos-1 to the end of the DEST */
+			leaf_item_bottle(dest_bi, src, FIRST_TO_LAST,
+					 cpy_num + pos - 1, cpy_bytes);
+		}
+	} else {
+		/* copy items to right */
+		src_nr_item = B_NR_ITEMS(src);
+		if (cpy_num == 1)
+			bytes = cpy_bytes;
+		else
+			bytes = -1;
+
+		/* copy the last item or its part or nothing to the beginning of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); */
+		i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes);
+
+		cpy_num -= i;
+		if (cpy_num == 0)
+			return i;
+
+		pos = src_nr_item - cpy_num - i;
+		if (cpy_bytes == -1) {
+			/* starting from position 'pos' copy last cpy_num items of SOURCE to begin of DEST */
+			leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
+						 pos, cpy_num);
+		} else {
+			/* copy last cpy_num-1 items starting from position 'pos+1' of the SOURCE to the begin of the DEST; */
+			leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST,
+						 pos + 1, cpy_num - 1);
+
+			/* copy part of the item which number is pos to the begin of the DEST */
+			leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos,
+					 cpy_bytes);
+		}
+	}
+	return i;
 }
 
-
 /* there are types of coping: from S[0] to L[0], from S[0] to R[0],
    from R[0] to L[0]. for each of these we have to define parent and
    positions of destination and source buffers */
-static void leaf_define_dest_src_infos (int shift_mode, struct tree_balance * tb, struct buffer_info * dest_bi,
-					struct buffer_info * src_bi, int * first_last,
-					struct buffer_head * Snew)
+static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb,
+				       struct buffer_info *dest_bi,
+				       struct buffer_info *src_bi,
+				       int *first_last,
+				       struct buffer_head *Snew)
 {
-    memset (dest_bi, 0, sizeof (struct buffer_info));
-    memset (src_bi, 0, sizeof (struct buffer_info));
-
-    /* define dest, src, dest parent, dest position */
-    switch (shift_mode) {
-    case LEAF_FROM_S_TO_L:    /* it is used in leaf_shift_left */
-	src_bi->tb = tb;
-	src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path);
-	src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
-	src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0);	/* src->b_item_order */
-	dest_bi->tb = tb;
-	dest_bi->bi_bh = tb->L[0];
-	dest_bi->bi_parent = tb->FL[0];
-	dest_bi->bi_position = get_left_neighbor_position (tb, 0);
-	*first_last = FIRST_TO_LAST;
-	break;
-
-    case LEAF_FROM_S_TO_R:  /* it is used in leaf_shift_right */
-	src_bi->tb = tb;
-	src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path);
-	src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
-	src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0);
-	dest_bi->tb = tb;
-	dest_bi->bi_bh = tb->R[0];
-	dest_bi->bi_parent = tb->FR[0];
-	dest_bi->bi_position = get_right_neighbor_position (tb, 0);
-	*first_last = LAST_TO_FIRST;
-	break;
-
-    case LEAF_FROM_R_TO_L:  /* it is used in balance_leaf_when_delete */
-	src_bi->tb = tb;
-	src_bi->bi_bh = tb->R[0];
-	src_bi->bi_parent = tb->FR[0];
-	src_bi->bi_position = get_right_neighbor_position (tb, 0);
-	dest_bi->tb = tb;
-	dest_bi->bi_bh = tb->L[0];
-	dest_bi->bi_parent = tb->FL[0];
-	dest_bi->bi_position = get_left_neighbor_position (tb, 0);
-	*first_last = FIRST_TO_LAST;
-	break;
-    
-    case LEAF_FROM_L_TO_R:  /* it is used in balance_leaf_when_delete */
-	src_bi->tb = tb;
-	src_bi->bi_bh = tb->L[0];
-	src_bi->bi_parent = tb->FL[0];
-	src_bi->bi_position = get_left_neighbor_position (tb, 0);
-	dest_bi->tb = tb;
-	dest_bi->bi_bh = tb->R[0];
-	dest_bi->bi_parent = tb->FR[0];
-	dest_bi->bi_position = get_right_neighbor_position (tb, 0);
-	*first_last = LAST_TO_FIRST;
-	break;
-
-    case LEAF_FROM_S_TO_SNEW:
-	src_bi->tb = tb;
-	src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path);
-	src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0);
-	src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0);
-	dest_bi->tb = tb;
-	dest_bi->bi_bh = Snew;
-	dest_bi->bi_parent = NULL;
-	dest_bi->bi_position = 0;
-	*first_last = LAST_TO_FIRST;
-	break;
-    
-    default:
-	reiserfs_panic (NULL, "vs-10250: leaf_define_dest_src_infos: shift type is unknown (%d)", shift_mode);
-    }
-    RFALSE( src_bi->bi_bh == 0 || dest_bi->bi_bh == 0,
-	    "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly",
-	    shift_mode, src_bi->bi_bh, dest_bi->bi_bh);
+	memset(dest_bi, 0, sizeof(struct buffer_info));
+	memset(src_bi, 0, sizeof(struct buffer_info));
+
+	/* define dest, src, dest parent, dest position */
+	switch (shift_mode) {
+	case LEAF_FROM_S_TO_L:	/* it is used in leaf_shift_left */
+		src_bi->tb = tb;
+		src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
+		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
+		src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);	/* src->b_item_order */
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->L[0];
+		dest_bi->bi_parent = tb->FL[0];
+		dest_bi->bi_position = get_left_neighbor_position(tb, 0);
+		*first_last = FIRST_TO_LAST;
+		break;
+
+	case LEAF_FROM_S_TO_R:	/* it is used in leaf_shift_right */
+		src_bi->tb = tb;
+		src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
+		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
+		src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->R[0];
+		dest_bi->bi_parent = tb->FR[0];
+		dest_bi->bi_position = get_right_neighbor_position(tb, 0);
+		*first_last = LAST_TO_FIRST;
+		break;
+
+	case LEAF_FROM_R_TO_L:	/* it is used in balance_leaf_when_delete */
+		src_bi->tb = tb;
+		src_bi->bi_bh = tb->R[0];
+		src_bi->bi_parent = tb->FR[0];
+		src_bi->bi_position = get_right_neighbor_position(tb, 0);
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->L[0];
+		dest_bi->bi_parent = tb->FL[0];
+		dest_bi->bi_position = get_left_neighbor_position(tb, 0);
+		*first_last = FIRST_TO_LAST;
+		break;
+
+	case LEAF_FROM_L_TO_R:	/* it is used in balance_leaf_when_delete */
+		src_bi->tb = tb;
+		src_bi->bi_bh = tb->L[0];
+		src_bi->bi_parent = tb->FL[0];
+		src_bi->bi_position = get_left_neighbor_position(tb, 0);
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = tb->R[0];
+		dest_bi->bi_parent = tb->FR[0];
+		dest_bi->bi_position = get_right_neighbor_position(tb, 0);
+		*first_last = LAST_TO_FIRST;
+		break;
+
+	case LEAF_FROM_S_TO_SNEW:
+		src_bi->tb = tb;
+		src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path);
+		src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0);
+		src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0);
+		dest_bi->tb = tb;
+		dest_bi->bi_bh = Snew;
+		dest_bi->bi_parent = NULL;
+		dest_bi->bi_position = 0;
+		*first_last = LAST_TO_FIRST;
+		break;
+
+	default:
+		reiserfs_panic(NULL,
+			       "vs-10250: leaf_define_dest_src_infos: shift type is unknown (%d)",
+			       shift_mode);
+	}
+	RFALSE(src_bi->bi_bh == 0 || dest_bi->bi_bh == 0,
+	       "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly",
+	       shift_mode, src_bi->bi_bh, dest_bi->bi_bh);
 }
 
-
-
-
 /* copy mov_num items and mov_bytes of the (mov_num-1)th item to
    neighbor. Delete them from source */
-int leaf_move_items (int shift_mode, struct tree_balance * tb, int mov_num, int mov_bytes, struct buffer_head * Snew)
+int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
+		    int mov_bytes, struct buffer_head *Snew)
 {
-  int ret_value;
-  struct buffer_info dest_bi, src_bi;
-  int first_last;
+	int ret_value;
+	struct buffer_info dest_bi, src_bi;
+	int first_last;
 
-  leaf_define_dest_src_infos (shift_mode, tb, &dest_bi, &src_bi, &first_last, Snew);
+	leaf_define_dest_src_infos(shift_mode, tb, &dest_bi, &src_bi,
+				   &first_last, Snew);
 
-  ret_value = leaf_copy_items (&dest_bi, src_bi.bi_bh, first_last, mov_num, mov_bytes);
+	ret_value =
+	    leaf_copy_items(&dest_bi, src_bi.bi_bh, first_last, mov_num,
+			    mov_bytes);
 
-  leaf_delete_items (&src_bi, first_last, (first_last == FIRST_TO_LAST) ? 0 : (B_NR_ITEMS(src_bi.bi_bh) - mov_num), mov_num, mov_bytes);
+	leaf_delete_items(&src_bi, first_last,
+			  (first_last ==
+			   FIRST_TO_LAST) ? 0 : (B_NR_ITEMS(src_bi.bi_bh) -
+						 mov_num), mov_num, mov_bytes);
 
-  
-  return ret_value;
+	return ret_value;
 }
 
-
 /* Shift shift_num items (and shift_bytes of last shifted item if shift_bytes != -1)
    from S[0] to L[0] and replace the delimiting key */
-int leaf_shift_left (struct tree_balance * tb, int shift_num, int shift_bytes)
+int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes)
 {
-  struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path);
-  int i;
+	struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path);
+	int i;
 
-  /* move shift_num (and shift_bytes bytes) items from S[0] to left neighbor L[0] */
-  i = leaf_move_items (LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL);
+	/* move shift_num (and shift_bytes bytes) items from S[0] to left neighbor L[0] */
+	i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL);
 
-  if ( shift_num ) {
-    if (B_NR_ITEMS (S0) == 0) { /* number of items in S[0] == 0 */
+	if (shift_num) {
+		if (B_NR_ITEMS(S0) == 0) {	/* number of items in S[0] == 0 */
 
-      RFALSE( shift_bytes != -1,
-	      "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)", 
-	      shift_bytes);
+			RFALSE(shift_bytes != -1,
+			       "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)",
+			       shift_bytes);
 #ifdef CONFIG_REISERFS_CHECK
-      if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) {
-	print_cur_tb ("vs-10275");
-	reiserfs_panic (tb->tb_sb, "vs-10275: leaf_shift_left: balance condition corrupted (%c)", tb->tb_mode);
-      }
+			if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) {
+				print_cur_tb("vs-10275");
+				reiserfs_panic(tb->tb_sb,
+					       "vs-10275: leaf_shift_left: balance condition corrupted (%c)",
+					       tb->tb_mode);
+			}
 #endif
 
-      if (PATH_H_POSITION (tb->tb_path, 1) == 0)
-	replace_key (tb, tb->CFL[0], tb->lkey[0], PATH_H_PPARENT (tb->tb_path, 0), 0);
-
-    } else {     
-      /* replace lkey in CFL[0] by 0-th key from S[0]; */
-      replace_key (tb, tb->CFL[0], tb->lkey[0], S0, 0);
-      
-      RFALSE( (shift_bytes != -1 &&
-              !(is_direntry_le_ih (B_N_PITEM_HEAD (S0, 0))
-                && !I_ENTRY_COUNT (B_N_PITEM_HEAD (S0, 0)))) &&
-	      (!op_is_left_mergeable (B_N_PKEY (S0, 0), S0->b_size)),
-	      "vs-10280: item must be mergeable");
-    }
-  }
-  
-  return i;
-}
-
-
-
+			if (PATH_H_POSITION(tb->tb_path, 1) == 0)
+				replace_key(tb, tb->CFL[0], tb->lkey[0],
+					    PATH_H_PPARENT(tb->tb_path, 0), 0);
+
+		} else {
+			/* replace lkey in CFL[0] by 0-th key from S[0]; */
+			replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0);
+
+			RFALSE((shift_bytes != -1 &&
+				!(is_direntry_le_ih(B_N_PITEM_HEAD(S0, 0))
+				  && !I_ENTRY_COUNT(B_N_PITEM_HEAD(S0, 0)))) &&
+			       (!op_is_left_mergeable
+				(B_N_PKEY(S0, 0), S0->b_size)),
+			       "vs-10280: item must be mergeable");
+		}
+	}
 
+	return i;
+}
 
 /* CLEANING STOPPED HERE */
 
-
-
-
 /* Shift shift_num (shift_bytes) items from S[0] to the right neighbor, and replace the delimiting key */
-int	leaf_shift_right(
-		struct tree_balance * tb, 
-		int shift_num,
-		int shift_bytes
-	)
+int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes)
 {
-  //  struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path);
-  int ret_value;
+	//  struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path);
+	int ret_value;
 
-  /* move shift_num (and shift_bytes) items from S[0] to right neighbor R[0] */
-  ret_value = leaf_move_items (LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL);
+	/* move shift_num (and shift_bytes) items from S[0] to right neighbor R[0] */
+	ret_value =
+	    leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL);
 
-  /* replace rkey in CFR[0] by the 0-th key from R[0] */
-  if (shift_num) {
-    replace_key (tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
+	/* replace rkey in CFR[0] by the 0-th key from R[0] */
+	if (shift_num) {
+		replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0);
 
-  }
+	}
 
-  return ret_value;
+	return ret_value;
 }
 
-
-
-static void leaf_delete_items_entirely (struct buffer_info * bi,
-					int first, int del_num);
+static void leaf_delete_items_entirely(struct buffer_info *bi,
+				       int first, int del_num);
 /*  If del_bytes == -1, starting from position 'first' delete del_num items in whole in buffer CUR.
     If not. 
     If last_first == 0. Starting from position 'first' delete del_num-1 items in whole. Delete part of body of
@@ -670,287 +731,292 @@ static void leaf_delete_items_entirely (struct buffer_info * bi,
     If last_first == 1. Starting from position 'first+1' delete del_num-1 items in whole. Delete part of body of
     the last item . Part defined by del_bytes. Don't delete last item header.
 */
-void leaf_delete_items (struct buffer_info * cur_bi, int last_first, 
-			int first, int del_num, int del_bytes)
+void leaf_delete_items(struct buffer_info *cur_bi, int last_first,
+		       int first, int del_num, int del_bytes)
 {
-    struct buffer_head * bh;
-    int item_amount = B_NR_ITEMS (bh = cur_bi->bi_bh);
-
-    RFALSE( !bh, "10155: bh is not defined");
-    RFALSE( del_num < 0, "10160: del_num can not be < 0. del_num==%d", del_num);
-    RFALSE( first < 0 || first + del_num > item_amount,
-	    "10165: invalid number of first item to be deleted (%d) or "
-	    "no so much items (%d) to delete (only %d)", 
-	    first, first + del_num, item_amount);
-
-    if ( del_num == 0 )
-	return;
-
-    if ( first == 0 && del_num == item_amount && del_bytes == -1 ) {
-	make_empty_node (cur_bi);
-	do_balance_mark_leaf_dirty (cur_bi->tb, bh, 0);
-	return;
-    }
-
-    if ( del_bytes == -1 )
-	/* delete del_num items beginning from item in position first */
-	leaf_delete_items_entirely (cur_bi, first, del_num);
-    else {
-	if ( last_first == FIRST_TO_LAST ) {
-	    /* delete del_num-1 items beginning from item in position first  */
-	    leaf_delete_items_entirely (cur_bi, first, del_num-1);
-
-	    /* delete the part of the first item of the bh
-	       do not delete item header
-	    */
-	    leaf_cut_from_buffer (cur_bi, 0, 0, del_bytes);
-	} else  {
-	    struct item_head * ih;
-	    int len;
-
-	    /* delete del_num-1 items beginning from item in position first+1  */
-	    leaf_delete_items_entirely (cur_bi, first+1, del_num-1);
-
-	    if (is_direntry_le_ih (ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh)-1))) 	/* the last item is directory  */
-	        /* len = numbers of directory entries in this item */
-		len = ih_entry_count(ih);
-	    else
-	        /* len = body len of item */
-		len = ih_item_len(ih);
-
-	    /* delete the part of the last item of the bh 
-	       do not delete item header
-	    */
-	    leaf_cut_from_buffer (cur_bi, B_NR_ITEMS(bh)-1, len - del_bytes, del_bytes);
+	struct buffer_head *bh;
+	int item_amount = B_NR_ITEMS(bh = cur_bi->bi_bh);
+
+	RFALSE(!bh, "10155: bh is not defined");
+	RFALSE(del_num < 0, "10160: del_num can not be < 0. del_num==%d",
+	       del_num);
+	RFALSE(first < 0
+	       || first + del_num > item_amount,
+	       "10165: invalid number of first item to be deleted (%d) or "
+	       "no so much items (%d) to delete (only %d)", first,
+	       first + del_num, item_amount);
+
+	if (del_num == 0)
+		return;
+
+	if (first == 0 && del_num == item_amount && del_bytes == -1) {
+		make_empty_node(cur_bi);
+		do_balance_mark_leaf_dirty(cur_bi->tb, bh, 0);
+		return;
 	}
-    }
-}
 
+	if (del_bytes == -1)
+		/* delete del_num items beginning from item in position first */
+		leaf_delete_items_entirely(cur_bi, first, del_num);
+	else {
+		if (last_first == FIRST_TO_LAST) {
+			/* delete del_num-1 items beginning from item in position first  */
+			leaf_delete_items_entirely(cur_bi, first, del_num - 1);
+
+			/* delete the part of the first item of the bh
+			   do not delete item header
+			 */
+			leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes);
+		} else {
+			struct item_head *ih;
+			int len;
+
+			/* delete del_num-1 items beginning from item in position first+1  */
+			leaf_delete_items_entirely(cur_bi, first + 1,
+						   del_num - 1);
+
+			if (is_direntry_le_ih
+			    (ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh) - 1)))
+				/* the last item is directory  */
+				/* len = numbers of directory entries in this item */
+				len = ih_entry_count(ih);
+			else
+				/* len = body len of item */
+				len = ih_item_len(ih);
+
+			/* delete the part of the last item of the bh 
+			   do not delete item header
+			 */
+			leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1,
+					     len - del_bytes, del_bytes);
+		}
+	}
+}
 
 /* insert item into the leaf node in position before */
-void leaf_insert_into_buf (struct buffer_info * bi, int before,
-			   struct item_head * inserted_item_ih,
-			   const char * inserted_item_body,
-			   int zeros_number)
+void leaf_insert_into_buf(struct buffer_info *bi, int before,
+			  struct item_head *inserted_item_ih,
+			  const char *inserted_item_body, int zeros_number)
 {
-    struct buffer_head * bh = bi->bi_bh;
-    int nr, free_space;
-    struct block_head * blkh;
-    struct item_head * ih;
-    int i;
-    int last_loc, unmoved_loc;
-    char * to;
-
-
-    blkh = B_BLK_HEAD(bh);
-    nr = blkh_nr_item(blkh);
-    free_space = blkh_free_space( blkh );
-
-    /* check free space */
-    RFALSE( free_space < ih_item_len(inserted_item_ih) + IH_SIZE,
-            "vs-10170: not enough free space in block %z, new item %h",
-            bh, inserted_item_ih);
-    RFALSE( zeros_number > ih_item_len(inserted_item_ih),
-	    "vs-10172: zero number == %d, item length == %d",
-            zeros_number, ih_item_len(inserted_item_ih));
-
-
-    /* get item new item must be inserted before */
-    ih = B_N_PITEM_HEAD (bh, before);
-
-    /* prepare space for the body of new item */
-    last_loc = nr ? ih_location( &(ih[nr - before - 1]) ) : bh->b_size;
-    unmoved_loc = before ? ih_location( ih-1 ) : bh->b_size;
-
-
-    memmove (bh->b_data + last_loc - ih_item_len(inserted_item_ih), 
-	     bh->b_data + last_loc, unmoved_loc - last_loc);
-
-    to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih);
-    memset (to, 0, zeros_number);
-    to += zeros_number;
-
-    /* copy body to prepared space */
-    if (inserted_item_body)
-	memmove (to, inserted_item_body, ih_item_len(inserted_item_ih) - zeros_number);
-    else
-	memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number);
-  
-    /* insert item header */
-    memmove (ih + 1, ih, IH_SIZE * (nr - before));
-    memmove (ih, inserted_item_ih, IH_SIZE);
-  
-    /* change locations */
-    for (i = before; i < nr + 1; i ++)
-    {
-        unmoved_loc -= ih_item_len( &(ih[i-before]));
-	put_ih_location( &(ih[i-before]), unmoved_loc );
-    }
-  
-    /* sizes, free space, item number */
-    set_blkh_nr_item( blkh, blkh_nr_item(blkh) + 1 );
-    set_blkh_free_space( blkh,
-                    free_space - (IH_SIZE + ih_item_len(inserted_item_ih ) ) );
-    do_balance_mark_leaf_dirty (bi->tb, bh, 1);
-
-    if (bi->bi_parent) { 
-	struct disk_child *t_dc;
-	t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position);
-	put_dc_size( t_dc, dc_size(t_dc) + (IH_SIZE + ih_item_len(inserted_item_ih)));
-	do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0);
-    }
-}
+	struct buffer_head *bh = bi->bi_bh;
+	int nr, free_space;
+	struct block_head *blkh;
+	struct item_head *ih;
+	int i;
+	int last_loc, unmoved_loc;
+	char *to;
+
+	blkh = B_BLK_HEAD(bh);
+	nr = blkh_nr_item(blkh);
+	free_space = blkh_free_space(blkh);
+
+	/* check free space */
+	RFALSE(free_space < ih_item_len(inserted_item_ih) + IH_SIZE,
+	       "vs-10170: not enough free space in block %z, new item %h",
+	       bh, inserted_item_ih);
+	RFALSE(zeros_number > ih_item_len(inserted_item_ih),
+	       "vs-10172: zero number == %d, item length == %d",
+	       zeros_number, ih_item_len(inserted_item_ih));
+
+	/* get item new item must be inserted before */
+	ih = B_N_PITEM_HEAD(bh, before);
+
+	/* prepare space for the body of new item */
+	last_loc = nr ? ih_location(&(ih[nr - before - 1])) : bh->b_size;
+	unmoved_loc = before ? ih_location(ih - 1) : bh->b_size;
+
+	memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih),
+		bh->b_data + last_loc, unmoved_loc - last_loc);
+
+	to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih);
+	memset(to, 0, zeros_number);
+	to += zeros_number;
+
+	/* copy body to prepared space */
+	if (inserted_item_body)
+		memmove(to, inserted_item_body,
+			ih_item_len(inserted_item_ih) - zeros_number);
+	else
+		memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number);
+
+	/* insert item header */
+	memmove(ih + 1, ih, IH_SIZE * (nr - before));
+	memmove(ih, inserted_item_ih, IH_SIZE);
+
+	/* change locations */
+	for (i = before; i < nr + 1; i++) {
+		unmoved_loc -= ih_item_len(&(ih[i - before]));
+		put_ih_location(&(ih[i - before]), unmoved_loc);
+	}
 
+	/* sizes, free space, item number */
+	set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1);
+	set_blkh_free_space(blkh,
+			    free_space - (IH_SIZE +
+					  ih_item_len(inserted_item_ih)));
+	do_balance_mark_leaf_dirty(bi->tb, bh, 1);
+
+	if (bi->bi_parent) {
+		struct disk_child *t_dc;
+		t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position);
+		put_dc_size(t_dc,
+			    dc_size(t_dc) + (IH_SIZE +
+					     ih_item_len(inserted_item_ih)));
+		do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
+	}
+}
 
 /* paste paste_size bytes to affected_item_num-th item. 
    When item is a directory, this only prepare space for new entries */
-void leaf_paste_in_buffer (struct buffer_info * bi, int affected_item_num,
-			   int pos_in_item, int paste_size,
-			   const char * body,
-			   int zeros_number)
+void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num,
+			  int pos_in_item, int paste_size,
+			  const char *body, int zeros_number)
 {
-    struct buffer_head * bh = bi->bi_bh;
-    int nr, free_space;
-    struct block_head * blkh;
-    struct item_head * ih;
-    int i;
-    int last_loc, unmoved_loc;
-
-    blkh = B_BLK_HEAD(bh);
-    nr = blkh_nr_item(blkh);
-    free_space = blkh_free_space(blkh);
-
-
-    /* check free space */
-    RFALSE( free_space < paste_size,
-            "vs-10175: not enough free space: needed %d, available %d",
-            paste_size, free_space);
+	struct buffer_head *bh = bi->bi_bh;
+	int nr, free_space;
+	struct block_head *blkh;
+	struct item_head *ih;
+	int i;
+	int last_loc, unmoved_loc;
+
+	blkh = B_BLK_HEAD(bh);
+	nr = blkh_nr_item(blkh);
+	free_space = blkh_free_space(blkh);
+
+	/* check free space */
+	RFALSE(free_space < paste_size,
+	       "vs-10175: not enough free space: needed %d, available %d",
+	       paste_size, free_space);
 
 #ifdef CONFIG_REISERFS_CHECK
-    if (zeros_number > paste_size) {
-	print_cur_tb ("10177");
-	reiserfs_panic ( NULL, "vs-10177: leaf_paste_in_buffer: ero number == %d, paste_size == %d",
-                         zeros_number, paste_size);
-    }
-#endif /* CONFIG_REISERFS_CHECK */
-
-
-    /* item to be appended */
-    ih = B_N_PITEM_HEAD(bh, affected_item_num);
-
-    last_loc = ih_location( &(ih[nr - affected_item_num - 1]) );
-    unmoved_loc = affected_item_num ? ih_location( ih-1 ) : bh->b_size;
-
-    /* prepare space */
-    memmove (bh->b_data + last_loc - paste_size, bh->b_data + last_loc,
-	     unmoved_loc - last_loc);
-
-
-    /* change locations */
-    for (i = affected_item_num; i < nr; i ++)
-	put_ih_location( &(ih[i-affected_item_num]),
-                    ih_location( &(ih[i-affected_item_num])) - paste_size );
-
-    if ( body ) {
-	if (!is_direntry_le_ih (ih)) {
-	    if (!pos_in_item) {
-		/* shift data to right */
-		memmove (bh->b_data + ih_location(ih) + paste_size, 
-			 bh->b_data + ih_location(ih), ih_item_len(ih));
-		/* paste data in the head of item */
-		memset (bh->b_data + ih_location(ih), 0, zeros_number);
-		memcpy (bh->b_data + ih_location(ih) + zeros_number, body, paste_size - zeros_number);
-	    } else {
-		memset (bh->b_data + unmoved_loc - paste_size, 0, zeros_number);
-		memcpy (bh->b_data + unmoved_loc - paste_size + zeros_number, body, paste_size - zeros_number);
-	    }
+	if (zeros_number > paste_size) {
+		print_cur_tb("10177");
+		reiserfs_panic(NULL,
+			       "vs-10177: leaf_paste_in_buffer: ero number == %d, paste_size == %d",
+			       zeros_number, paste_size);
+	}
+#endif				/* CONFIG_REISERFS_CHECK */
+
+	/* item to be appended */
+	ih = B_N_PITEM_HEAD(bh, affected_item_num);
+
+	last_loc = ih_location(&(ih[nr - affected_item_num - 1]));
+	unmoved_loc = affected_item_num ? ih_location(ih - 1) : bh->b_size;
+
+	/* prepare space */
+	memmove(bh->b_data + last_loc - paste_size, bh->b_data + last_loc,
+		unmoved_loc - last_loc);
+
+	/* change locations */
+	for (i = affected_item_num; i < nr; i++)
+		put_ih_location(&(ih[i - affected_item_num]),
+				ih_location(&(ih[i - affected_item_num])) -
+				paste_size);
+
+	if (body) {
+		if (!is_direntry_le_ih(ih)) {
+			if (!pos_in_item) {
+				/* shift data to right */
+				memmove(bh->b_data + ih_location(ih) +
+					paste_size,
+					bh->b_data + ih_location(ih),
+					ih_item_len(ih));
+				/* paste data in the head of item */
+				memset(bh->b_data + ih_location(ih), 0,
+				       zeros_number);
+				memcpy(bh->b_data + ih_location(ih) +
+				       zeros_number, body,
+				       paste_size - zeros_number);
+			} else {
+				memset(bh->b_data + unmoved_loc - paste_size, 0,
+				       zeros_number);
+				memcpy(bh->b_data + unmoved_loc - paste_size +
+				       zeros_number, body,
+				       paste_size - zeros_number);
+			}
+		}
+	} else
+		memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size);
+
+	put_ih_item_len(ih, ih_item_len(ih) + paste_size);
+
+	/* change free space */
+	set_blkh_free_space(blkh, free_space - paste_size);
+
+	do_balance_mark_leaf_dirty(bi->tb, bh, 0);
+
+	if (bi->bi_parent) {
+		struct disk_child *t_dc =
+		    B_N_CHILD(bi->bi_parent, bi->bi_position);
+		put_dc_size(t_dc, dc_size(t_dc) + paste_size);
+		do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
 	}
-    }
-    else
-	memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size);
-
-    put_ih_item_len( ih, ih_item_len(ih) + paste_size );
-
-    /* change free space */
-    set_blkh_free_space( blkh, free_space - paste_size );
-
-    do_balance_mark_leaf_dirty (bi->tb, bh, 0);
-
-    if (bi->bi_parent) { 
-	struct disk_child *t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position);
-	put_dc_size( t_dc, dc_size(t_dc) + paste_size );
-	do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0);
-    }
 }
 
-
 /* cuts DEL_COUNT entries beginning from FROM-th entry. Directory item
    does not have free space, so it moves DEHs and remaining records as
    necessary. Return value is size of removed part of directory item
    in bytes. */
-static int	leaf_cut_entries (
-				struct buffer_head * bh,
-				struct item_head * ih, 
-				int from, 
-				int del_count
-			)
+static int leaf_cut_entries(struct buffer_head *bh,
+			    struct item_head *ih, int from, int del_count)
 {
-  char * item;
-  struct reiserfs_de_head * deh;
-  int prev_record_offset;	/* offset of record, that is (from-1)th */
-  char * prev_record;		/* */
-  int cut_records_len;		/* length of all removed records */
-  int i;
-
-
-  /* make sure, that item is directory and there are enough entries to
-     remove */
-  RFALSE( !is_direntry_le_ih (ih), "10180: item is not directory item");
-  RFALSE( I_ENTRY_COUNT(ih) < from + del_count,
-	  "10185: item contains not enough entries: entry_cout = %d, from = %d, to delete = %d",
-	  I_ENTRY_COUNT(ih), from, del_count);
-
-  if (del_count == 0)
-    return 0;
-
-  /* first byte of item */
-  item = bh->b_data + ih_location(ih);
-
-  /* entry head array */
-  deh = B_I_DEH (bh, ih);
-
-  /* first byte of remaining entries, those are BEFORE cut entries
-     (prev_record) and length of all removed records (cut_records_len) */
-  prev_record_offset = (from ? deh_location( &(deh[from - 1])) : ih_item_len(ih));
-  cut_records_len = prev_record_offset/*from_record*/ -
-                                deh_location( &(deh[from + del_count - 1]));
-  prev_record = item + prev_record_offset;
-
-
-  /* adjust locations of remaining entries */
-  for (i = I_ENTRY_COUNT(ih) - 1; i > from + del_count - 1; i --)
-    put_deh_location( &(deh[i]),
-                        deh_location( &deh[i] ) - (DEH_SIZE * del_count ) );
-
-  for (i = 0; i < from; i ++)
-    put_deh_location( &(deh[i]),
-        deh_location( &deh[i] ) - (DEH_SIZE * del_count + cut_records_len) );
-
-  put_ih_entry_count( ih, ih_entry_count(ih) - del_count );
-
-  /* shift entry head array and entries those are AFTER removed entries */
-  memmove ((char *)(deh + from),
-	   deh + from + del_count, 
-	   prev_record - cut_records_len - (char *)(deh + from + del_count));
-  
-  /* shift records, those are BEFORE removed entries */
-  memmove (prev_record - cut_records_len - DEH_SIZE * del_count,
-	   prev_record, item + ih_item_len(ih) - prev_record);
-
-  return DEH_SIZE * del_count + cut_records_len;
+	char *item;
+	struct reiserfs_de_head *deh;
+	int prev_record_offset;	/* offset of record, that is (from-1)th */
+	char *prev_record;	/* */
+	int cut_records_len;	/* length of all removed records */
+	int i;
+
+	/* make sure, that item is directory and there are enough entries to
+	   remove */
+	RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item");
+	RFALSE(I_ENTRY_COUNT(ih) < from + del_count,
+	       "10185: item contains not enough entries: entry_cout = %d, from = %d, to delete = %d",
+	       I_ENTRY_COUNT(ih), from, del_count);
+
+	if (del_count == 0)
+		return 0;
+
+	/* first byte of item */
+	item = bh->b_data + ih_location(ih);
+
+	/* entry head array */
+	deh = B_I_DEH(bh, ih);
+
+	/* first byte of remaining entries, those are BEFORE cut entries
+	   (prev_record) and length of all removed records (cut_records_len) */
+	prev_record_offset =
+	    (from ? deh_location(&(deh[from - 1])) : ih_item_len(ih));
+	cut_records_len = prev_record_offset /*from_record */  -
+	    deh_location(&(deh[from + del_count - 1]));
+	prev_record = item + prev_record_offset;
+
+	/* adjust locations of remaining entries */
+	for (i = I_ENTRY_COUNT(ih) - 1; i > from + del_count - 1; i--)
+		put_deh_location(&(deh[i]),
+				 deh_location(&deh[i]) -
+				 (DEH_SIZE * del_count));
+
+	for (i = 0; i < from; i++)
+		put_deh_location(&(deh[i]),
+				 deh_location(&deh[i]) - (DEH_SIZE * del_count +
+							  cut_records_len));
+
+	put_ih_entry_count(ih, ih_entry_count(ih) - del_count);
+
+	/* shift entry head array and entries those are AFTER removed entries */
+	memmove((char *)(deh + from),
+		deh + from + del_count,
+		prev_record - cut_records_len - (char *)(deh + from +
+							 del_count));
+
+	/* shift records, those are BEFORE removed entries */
+	memmove(prev_record - cut_records_len - DEH_SIZE * del_count,
+		prev_record, item + ih_item_len(ih) - prev_record);
+
+	return DEH_SIZE * del_count + cut_records_len;
 }
 
-
 /*  when cut item is part of regular file
         pos_in_item - first byte that must be cut
         cut_size - number of bytes to be cut beginning from pos_in_item
@@ -959,264 +1025,278 @@ static int	leaf_cut_entries (
         pos_in_item - number of first deleted entry
         cut_size - count of deleted entries
     */
-void leaf_cut_from_buffer (struct buffer_info * bi, int cut_item_num,
-			   int pos_in_item, int cut_size)
+void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
+			  int pos_in_item, int cut_size)
 {
-    int nr;
-    struct buffer_head * bh = bi->bi_bh;
-    struct block_head * blkh;
-    struct item_head * ih;
-    int last_loc, unmoved_loc;
-    int i;
-
-    blkh = B_BLK_HEAD(bh);
-    nr = blkh_nr_item(blkh);
-
-    /* item head of truncated item */
-    ih = B_N_PITEM_HEAD (bh, cut_item_num);
-
-    if (is_direntry_le_ih (ih)) {
-        /* first cut entry ()*/
-        cut_size = leaf_cut_entries (bh, ih, pos_in_item, cut_size);
-        if (pos_in_item == 0) {
-	        /* change key */
-            RFALSE( cut_item_num,
-                    "when 0-th enrty of item is cut, that item must be first in the node, not %d-th", cut_item_num);
-            /* change item key by key of first entry in the item */
-	    set_le_ih_k_offset (ih, deh_offset(B_I_DEH (bh, ih)));
-            /*memcpy (&ih->ih_key.k_offset, &(B_I_DEH (bh, ih)->deh_offset), SHORT_KEY_SIZE);*/
-	    }
-    } else {
-        /* item is direct or indirect */
-        RFALSE( is_statdata_le_ih (ih), "10195: item is stat data");
-        RFALSE( pos_in_item && pos_in_item + cut_size != ih_item_len(ih),
-                "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)",
-                ( long unsigned ) pos_in_item, ( long unsigned ) cut_size, 
-		( long unsigned ) ih_item_len (ih));
-
-        /* shift item body to left if cut is from the head of item */
-        if (pos_in_item == 0) {
-            memmove( bh->b_data + ih_location(ih),
-		     bh->b_data + ih_location(ih) + cut_size,
-		     ih_item_len(ih) - cut_size);
-	    
-            /* change key of item */
-            if (is_direct_le_ih (ih))
-		set_le_ih_k_offset (ih, le_ih_k_offset (ih) + cut_size);
-            else {
-		set_le_ih_k_offset (ih, le_ih_k_offset (ih) + (cut_size / UNFM_P_SIZE) * bh->b_size);
-                RFALSE( ih_item_len(ih) == cut_size && get_ih_free_space (ih),
-                        "10205: invalid ih_free_space (%h)", ih);
-	        }
-	    }
-    }
-  
-
-    /* location of the last item */
-    last_loc = ih_location( &(ih[nr - cut_item_num - 1]) );
-
-    /* location of the item, which is remaining at the same place */
-    unmoved_loc = cut_item_num ? ih_location(ih-1) : bh->b_size;
-
-
-    /* shift */
-    memmove (bh->b_data + last_loc + cut_size, bh->b_data + last_loc,
-	       unmoved_loc - last_loc - cut_size);
-
-    /* change item length */
-    put_ih_item_len( ih, ih_item_len(ih) - cut_size );
-  
-    if (is_indirect_le_ih (ih)) {
-        if (pos_in_item)
-            set_ih_free_space (ih, 0);
-    }
-
-    /* change locations */
-    for (i = cut_item_num; i < nr; i ++)
-    put_ih_location( &(ih[i-cut_item_num]), ih_location( &ih[i-cut_item_num]) + cut_size );
-
-    /* size, free space */
-    set_blkh_free_space( blkh, blkh_free_space(blkh) + cut_size );
-
-    do_balance_mark_leaf_dirty (bi->tb, bh, 0);
-    
-    if (bi->bi_parent) { 
-      struct disk_child *t_dc;
-      t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position);
-      put_dc_size( t_dc, dc_size(t_dc) - cut_size );
-      do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0);
-    }
-}
+	int nr;
+	struct buffer_head *bh = bi->bi_bh;
+	struct block_head *blkh;
+	struct item_head *ih;
+	int last_loc, unmoved_loc;
+	int i;
+
+	blkh = B_BLK_HEAD(bh);
+	nr = blkh_nr_item(blkh);
+
+	/* item head of truncated item */
+	ih = B_N_PITEM_HEAD(bh, cut_item_num);
+
+	if (is_direntry_le_ih(ih)) {
+		/* cut the entries first; cut_size becomes the byte count removed */
+		cut_size = leaf_cut_entries(bh, ih, pos_in_item, cut_size);
+		if (pos_in_item == 0) {
+			/* change key */
+			RFALSE(cut_item_num,
+			       "when 0-th enrty of item is cut, that item must be first in the node, not %d-th",
+			       cut_item_num);
+			/* change item key by key of first entry in the item */
+			set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih)));
+			/*memcpy (&ih->ih_key.k_offset, &(B_I_DEH (bh, ih)->deh_offset), SHORT_KEY_SIZE); */
+		}
+	} else {
+		/* item is direct or indirect */
+		RFALSE(is_statdata_le_ih(ih), "10195: item is stat data");
+		RFALSE(pos_in_item && pos_in_item + cut_size != ih_item_len(ih),
+		       "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)",
+		       (long unsigned)pos_in_item, (long unsigned)cut_size,
+		       (long unsigned)ih_item_len(ih));
+
+		/* shift item body to left if cut is from the head of item */
+		if (pos_in_item == 0) {
+			memmove(bh->b_data + ih_location(ih),
+				bh->b_data + ih_location(ih) + cut_size,
+				ih_item_len(ih) - cut_size);
+
+			/* change key of item */
+			if (is_direct_le_ih(ih))
+				set_le_ih_k_offset(ih,
+						   le_ih_k_offset(ih) +
+						   cut_size);
+			else {
+				set_le_ih_k_offset(ih,
+						   le_ih_k_offset(ih) +
+						   (cut_size / UNFM_P_SIZE) *
+						   bh->b_size);
+				RFALSE(ih_item_len(ih) == cut_size
+				       && get_ih_free_space(ih),
+				       "10205: invalid ih_free_space (%h)", ih);
+			}
+		}
+	}
+
+	/* location of the last item */
+	last_loc = ih_location(&(ih[nr - cut_item_num - 1]));
+
+	/* location of the item, which is remaining at the same place */
+	unmoved_loc = cut_item_num ? ih_location(ih - 1) : bh->b_size;
+
+	/* shift */
+	memmove(bh->b_data + last_loc + cut_size, bh->b_data + last_loc,
+		unmoved_loc - last_loc - cut_size);
+
+	/* change item length */
+	put_ih_item_len(ih, ih_item_len(ih) - cut_size);
 
+	if (is_indirect_le_ih(ih)) {
+		if (pos_in_item)
+			set_ih_free_space(ih, 0);
+	}
+
+	/* change locations */
+	for (i = cut_item_num; i < nr; i++)
+		put_ih_location(&(ih[i - cut_item_num]),
+				ih_location(&ih[i - cut_item_num]) + cut_size);
+
+	/* size, free space */
+	set_blkh_free_space(blkh, blkh_free_space(blkh) + cut_size);
+
+	do_balance_mark_leaf_dirty(bi->tb, bh, 0);
+
+	if (bi->bi_parent) {
+		struct disk_child *t_dc;
+		t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position);
+		put_dc_size(t_dc, dc_size(t_dc) - cut_size);
+		do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
+	}
+}
 
 /* delete del_num items from buffer starting from the first'th item */
-static void leaf_delete_items_entirely (struct buffer_info * bi,
-					int first, int del_num)
+static void leaf_delete_items_entirely(struct buffer_info *bi,
+				       int first, int del_num)
 {
-    struct buffer_head * bh = bi->bi_bh;
-    int nr;
-    int i, j;
-    int last_loc, last_removed_loc;
-    struct block_head * blkh;
-    struct item_head * ih;
-
-  RFALSE( bh == NULL, "10210: buffer is 0");
-  RFALSE( del_num < 0, "10215: del_num less than 0 (%d)", del_num);
-
-  if (del_num == 0)
-    return;
-
-  blkh = B_BLK_HEAD(bh);
-  nr = blkh_nr_item(blkh);
-
-  RFALSE( first < 0 || first + del_num > nr,
-          "10220: first=%d, number=%d, there is %d items", first, del_num, nr);
-
-  if (first == 0 && del_num == nr) {
-    /* this does not work */
-    make_empty_node (bi);
-    
-    do_balance_mark_leaf_dirty (bi->tb, bh, 0);
-    return;
-  }
-
-  ih = B_N_PITEM_HEAD (bh, first);
-  
-  /* location of unmovable item */
-  j = (first == 0) ? bh->b_size : ih_location(ih-1);
-      
-  /* delete items */
-  last_loc = ih_location( &(ih[nr-1-first]) );
-  last_removed_loc = ih_location( &(ih[del_num-1]) );
-
-  memmove (bh->b_data + last_loc + j - last_removed_loc,
-	   bh->b_data + last_loc, last_removed_loc - last_loc);
-  
-  /* delete item headers */
-  memmove (ih, ih + del_num, (nr - first - del_num) * IH_SIZE);
-  
-  /* change item location */
-  for (i = first; i < nr - del_num; i ++)
-    put_ih_location( &(ih[i-first]), ih_location( &(ih[i-first]) ) + (j - last_removed_loc) );
-
-  /* sizes, item number */
-  set_blkh_nr_item( blkh, blkh_nr_item(blkh) - del_num );
-  set_blkh_free_space( blkh, blkh_free_space(blkh) + (j - last_removed_loc + IH_SIZE * del_num) );
-
-  do_balance_mark_leaf_dirty (bi->tb, bh, 0);
-  
-  if (bi->bi_parent) {
-    struct disk_child *t_dc = B_N_CHILD (bi->bi_parent, bi->bi_position);
-    put_dc_size( t_dc, dc_size(t_dc) -
-				(j - last_removed_loc + IH_SIZE * del_num));
-    do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0);
-  }
-}
+	struct buffer_head *bh = bi->bi_bh;
+	int nr;
+	int i, j;
+	int last_loc, last_removed_loc;
+	struct block_head *blkh;
+	struct item_head *ih;
+
+	RFALSE(bh == NULL, "10210: buffer is 0");
+	RFALSE(del_num < 0, "10215: del_num less than 0 (%d)", del_num);
+
+	if (del_num == 0)
+		return;
+
+	blkh = B_BLK_HEAD(bh);
+	nr = blkh_nr_item(blkh);
 
+	RFALSE(first < 0 || first + del_num > nr,
+	       "10220: first=%d, number=%d, there is %d items", first, del_num,
+	       nr);
+
+	if (first == 0 && del_num == nr) {
+		/* this does not work */
+		make_empty_node(bi);
+
+		do_balance_mark_leaf_dirty(bi->tb, bh, 0);
+		return;
+	}
 
+	ih = B_N_PITEM_HEAD(bh, first);
 
+	/* location of unmovable item */
+	j = (first == 0) ? bh->b_size : ih_location(ih - 1);
 
+	/* delete items */
+	last_loc = ih_location(&(ih[nr - 1 - first]));
+	last_removed_loc = ih_location(&(ih[del_num - 1]));
+
+	memmove(bh->b_data + last_loc + j - last_removed_loc,
+		bh->b_data + last_loc, last_removed_loc - last_loc);
+
+	/* delete item headers */
+	memmove(ih, ih + del_num, (nr - first - del_num) * IH_SIZE);
+
+	/* change item location */
+	for (i = first; i < nr - del_num; i++)
+		put_ih_location(&(ih[i - first]),
+				ih_location(&(ih[i - first])) + (j -
+								 last_removed_loc));
+
+	/* sizes, item number */
+	set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num);
+	set_blkh_free_space(blkh,
+			    blkh_free_space(blkh) + (j - last_removed_loc +
+						     IH_SIZE * del_num));
+
+	do_balance_mark_leaf_dirty(bi->tb, bh, 0);
+
+	if (bi->bi_parent) {
+		struct disk_child *t_dc =
+		    B_N_CHILD(bi->bi_parent, bi->bi_position);
+		put_dc_size(t_dc,
+			    dc_size(t_dc) - (j - last_removed_loc +
+					     IH_SIZE * del_num));
+		do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0);
+	}
+}
 
 /* paste new_entry_count entries (new_dehs, records) into position before to item_num-th item */
-void    leaf_paste_entries (
-			struct buffer_head * bh,
+void leaf_paste_entries(struct buffer_head *bh,
 			int item_num,
 			int before,
 			int new_entry_count,
-			struct reiserfs_de_head * new_dehs,
-			const char * records,
-			int paste_size
-		)
+			struct reiserfs_de_head *new_dehs,
+			const char *records, int paste_size)
 {
-    struct item_head * ih;
-    char * item;
-    struct reiserfs_de_head * deh;
-    char * insert_point;
-    int i, old_entry_num;
-
-    if (new_entry_count == 0)
-        return;
-
-    ih = B_N_PITEM_HEAD(bh, item_num);
-
-  /* make sure, that item is directory, and there are enough records in it */
-  RFALSE( !is_direntry_le_ih (ih), "10225: item is not directory item");
-  RFALSE( I_ENTRY_COUNT (ih) < before,
-	  "10230: there are no entry we paste entries before. entry_count = %d, before = %d",
-	  I_ENTRY_COUNT (ih), before);
-
-
-  /* first byte of dest item */
-  item = bh->b_data + ih_location(ih);
-
-  /* entry head array */
-  deh = B_I_DEH (bh, ih);
-
-  /* new records will be pasted at this point */
-  insert_point = item + (before ? deh_location( &(deh[before - 1])) : (ih_item_len(ih) - paste_size));
-
-  /* adjust locations of records that will be AFTER new records */
-  for (i = I_ENTRY_COUNT(ih) - 1; i >= before; i --)
-    put_deh_location( &(deh[i]),
-                deh_location(&(deh[i])) + (DEH_SIZE * new_entry_count )); 
-
-  /* adjust locations of records that will be BEFORE new records */
-  for (i = 0; i < before; i ++)
-    put_deh_location( &(deh[i]), deh_location(&(deh[i])) + paste_size );
-
-  old_entry_num = I_ENTRY_COUNT(ih);
-  put_ih_entry_count( ih, ih_entry_count(ih) + new_entry_count );
-
-  /* prepare space for pasted records */
-  memmove (insert_point + paste_size, insert_point, item + (ih_item_len(ih) - paste_size) - insert_point);
-
-  /* copy new records */
-  memcpy (insert_point + DEH_SIZE * new_entry_count, records,
-		   paste_size - DEH_SIZE * new_entry_count);
-  
-  /* prepare space for new entry heads */
-  deh += before;
-  memmove ((char *)(deh + new_entry_count), deh, insert_point - (char *)deh);
-
-  /* copy new entry heads */
-  deh = (struct reiserfs_de_head *)((char *)deh);
-  memcpy (deh, new_dehs, DEH_SIZE * new_entry_count);
-
-  /* set locations of new records */
-  for (i = 0; i < new_entry_count; i ++)
-  {
-    put_deh_location( &(deh[i]),
-        deh_location( &(deh[i] )) +
-        (- deh_location( &(new_dehs[new_entry_count - 1])) +
-        insert_point + DEH_SIZE * new_entry_count - item));
-  }
-
-
-  /* change item key if necessary (when we paste before 0-th entry */
-  if (!before)
-    {
-	set_le_ih_k_offset (ih, deh_offset(new_dehs));
+	struct item_head *ih;
+	char *item;
+	struct reiserfs_de_head *deh;
+	char *insert_point;
+	int i, old_entry_num;
+
+	if (new_entry_count == 0)
+		return;
+
+	ih = B_N_PITEM_HEAD(bh, item_num);
+
+	/* make sure, that item is directory, and there are enough records in it */
+	RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item");
+	RFALSE(I_ENTRY_COUNT(ih) < before,
+	       "10230: there are no entry we paste entries before. entry_count = %d, before = %d",
+	       I_ENTRY_COUNT(ih), before);
+
+	/* first byte of dest item */
+	item = bh->b_data + ih_location(ih);
+
+	/* entry head array */
+	deh = B_I_DEH(bh, ih);
+
+	/* new records will be pasted at this point */
+	insert_point =
+	    item +
+	    (before ? deh_location(&(deh[before - 1]))
+	     : (ih_item_len(ih) - paste_size));
+
+	/* adjust locations of records that will be AFTER new records */
+	for (i = I_ENTRY_COUNT(ih) - 1; i >= before; i--)
+		put_deh_location(&(deh[i]),
+				 deh_location(&(deh[i])) +
+				 (DEH_SIZE * new_entry_count));
+
+	/* adjust locations of records that will be BEFORE new records */
+	for (i = 0; i < before; i++)
+		put_deh_location(&(deh[i]),
+				 deh_location(&(deh[i])) + paste_size);
+
+	old_entry_num = I_ENTRY_COUNT(ih);
+	put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count);
+
+	/* prepare space for pasted records */
+	memmove(insert_point + paste_size, insert_point,
+		item + (ih_item_len(ih) - paste_size) - insert_point);
+
+	/* copy new records */
+	memcpy(insert_point + DEH_SIZE * new_entry_count, records,
+	       paste_size - DEH_SIZE * new_entry_count);
+
+	/* prepare space for new entry heads */
+	deh += before;
+	memmove((char *)(deh + new_entry_count), deh,
+		insert_point - (char *)deh);
+
+	/* copy new entry heads */
+	deh = (struct reiserfs_de_head *)((char *)deh);
+	memcpy(deh, new_dehs, DEH_SIZE * new_entry_count);
+
+	/* set locations of new records */
+	for (i = 0; i < new_entry_count; i++) {
+		put_deh_location(&(deh[i]),
+				 deh_location(&(deh[i])) +
+				 (-deh_location
+				  (&(new_dehs[new_entry_count - 1])) +
+				  insert_point + DEH_SIZE * new_entry_count -
+				  item));
+	}
+
+	/* change item key if necessary (when we paste before 0-th entry) */
+	if (!before) {
+		set_le_ih_k_offset(ih, deh_offset(new_dehs));
 /*      memcpy (&ih->ih_key.k_offset, 
 		       &new_dehs->deh_offset, SHORT_KEY_SIZE);*/
-    }
-
+	}
 #ifdef CONFIG_REISERFS_CHECK
-  {
-    int prev, next;
-    /* check record locations */
-    deh = B_I_DEH (bh, ih);
-    for (i = 0; i < I_ENTRY_COUNT(ih); i ++) {
-      next = (i < I_ENTRY_COUNT(ih) - 1) ? deh_location( &(deh[i + 1])) : 0;
-      prev = (i != 0) ? deh_location( &(deh[i - 1]) ) : 0;
-      
-      if (prev && prev <= deh_location( &(deh[i])))
-	reiserfs_warning (NULL, "vs-10240: leaf_paste_entries: directory item (%h) corrupted (prev %a, cur(%d) %a)",
-			  ih, deh + i - 1, i, deh + i);
-      if (next && next >= deh_location( &(deh[i])))
-	reiserfs_warning (NULL, "vs-10250: leaf_paste_entries: directory item (%h) corrupted (cur(%d) %a, next %a)",
-			  ih, i, deh + i, deh + i + 1);
-    }
-  }
+	{
+		int prev, next;
+		/* check record locations */
+		deh = B_I_DEH(bh, ih);
+		for (i = 0; i < I_ENTRY_COUNT(ih); i++) {
+			next =
+			    (i <
+			     I_ENTRY_COUNT(ih) -
+			     1) ? deh_location(&(deh[i + 1])) : 0;
+			prev = (i != 0) ? deh_location(&(deh[i - 1])) : 0;
+
+			if (prev && prev <= deh_location(&(deh[i])))
+				reiserfs_warning(NULL,
+						 "vs-10240: leaf_paste_entries: directory item (%h) corrupted (prev %a, cur(%d) %a)",
+						 ih, deh + i - 1, i, deh + i);
+			if (next && next >= deh_location(&(deh[i])))
+				reiserfs_warning(NULL,
+						 "vs-10250: leaf_paste_entries: directory item (%h) corrupted (cur(%d) %a, next %a)",
+						 ih, i, deh + i, deh + i + 1);
+		}
+	}
 #endif
 
 }
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 4a333255f27a..a20bbc1642dc 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -25,86 +25,85 @@
 
 // directory item contains array of entry headers. This performs
 // binary search through that array
-static int bin_search_in_dir_item (struct reiserfs_dir_entry * de, loff_t off)
+static int bin_search_in_dir_item(struct reiserfs_dir_entry *de, loff_t off)
 {
-    struct item_head * ih = de->de_ih;
-    struct reiserfs_de_head * deh = de->de_deh;
-    int rbound, lbound, j;
-
-    lbound = 0;
-    rbound = I_ENTRY_COUNT (ih) - 1;
-
-    for (j = (rbound + lbound) / 2; lbound <= rbound; j = (rbound + lbound) / 2) {
-	if (off < deh_offset (deh + j)) {
-	    rbound = j - 1;
-	    continue;
+	struct item_head *ih = de->de_ih;
+	struct reiserfs_de_head *deh = de->de_deh;
+	int rbound, lbound, j;
+
+	lbound = 0;
+	rbound = I_ENTRY_COUNT(ih) - 1;
+
+	for (j = (rbound + lbound) / 2; lbound <= rbound;
+	     j = (rbound + lbound) / 2) {
+		if (off < deh_offset(deh + j)) {
+			rbound = j - 1;
+			continue;
+		}
+		if (off > deh_offset(deh + j)) {
+			lbound = j + 1;
+			continue;
+		}
+		// not really "name found": only the third key component matched
+		de->de_entry_num = j;
+		return NAME_FOUND;
 	}
-	if (off > deh_offset (deh + j)) {
-	    lbound = j + 1;
-	    continue;
-	}
-	// this is not name found, but matched third key component
-	de->de_entry_num = j;
-	return NAME_FOUND;
-    }
 
-    de->de_entry_num = lbound;
-    return NAME_NOT_FOUND;
+	de->de_entry_num = lbound;
+	return NAME_NOT_FOUND;
 }
 
-
 // comment?  maybe something like set de to point to what the path points to?
-static inline void set_de_item_location (struct reiserfs_dir_entry * de, struct path * path)
+static inline void set_de_item_location(struct reiserfs_dir_entry *de,
+					struct path *path)
 {
-    de->de_bh = get_last_bh (path);
-    de->de_ih = get_ih (path);
-    de->de_deh = B_I_DEH (de->de_bh, de->de_ih);
-    de->de_item_num = PATH_LAST_POSITION (path);
-} 
-
+	de->de_bh = get_last_bh(path);
+	de->de_ih = get_ih(path);
+	de->de_deh = B_I_DEH(de->de_bh, de->de_ih);
+	de->de_item_num = PATH_LAST_POSITION(path);
+}
 
 // de_bh, de_ih, de_deh (points to first element of array), de_item_num is set
-inline void set_de_name_and_namelen (struct reiserfs_dir_entry * de)
+inline void set_de_name_and_namelen(struct reiserfs_dir_entry *de)
 {
-    struct reiserfs_de_head * deh = de->de_deh + de->de_entry_num;
+	struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
 
-    if (de->de_entry_num >= ih_entry_count (de->de_ih))
-	BUG ();
+	if (de->de_entry_num >= ih_entry_count(de->de_ih))
+		BUG();
 
-    de->de_entrylen = entry_length (de->de_bh, de->de_ih, de->de_entry_num);
-    de->de_namelen = de->de_entrylen - (de_with_sd (deh) ? SD_SIZE : 0);
-    de->de_name = B_I_PITEM (de->de_bh, de->de_ih) + deh_location(deh);
-    if (de->de_name[de->de_namelen - 1] == 0)
-	de->de_namelen = strlen (de->de_name);
+	de->de_entrylen = entry_length(de->de_bh, de->de_ih, de->de_entry_num);
+	de->de_namelen = de->de_entrylen - (de_with_sd(deh) ? SD_SIZE : 0);
+	de->de_name = B_I_PITEM(de->de_bh, de->de_ih) + deh_location(deh);
+	if (de->de_name[de->de_namelen - 1] == 0)
+		de->de_namelen = strlen(de->de_name);
 }
 
-
 // what entry points to
-static inline void set_de_object_key (struct reiserfs_dir_entry * de)
+static inline void set_de_object_key(struct reiserfs_dir_entry *de)
 {
-    if (de->de_entry_num >= ih_entry_count (de->de_ih))
-	BUG ();
-    de->de_dir_id = deh_dir_id( &(de->de_deh[de->de_entry_num]));
-    de->de_objectid = deh_objectid( &(de->de_deh[de->de_entry_num]));
+	if (de->de_entry_num >= ih_entry_count(de->de_ih))
+		BUG();
+	de->de_dir_id = deh_dir_id(&(de->de_deh[de->de_entry_num]));
+	de->de_objectid = deh_objectid(&(de->de_deh[de->de_entry_num]));
 }
 
-
-static inline void store_de_entry_key (struct reiserfs_dir_entry * de)
+static inline void store_de_entry_key(struct reiserfs_dir_entry *de)
 {
-    struct reiserfs_de_head * deh = de->de_deh + de->de_entry_num;
-
-    if (de->de_entry_num >= ih_entry_count (de->de_ih))
-	BUG ();
-
-    /* store key of the found entry */
-    de->de_entry_key.version = KEY_FORMAT_3_5;
-    de->de_entry_key.on_disk_key.k_dir_id = le32_to_cpu (de->de_ih->ih_key.k_dir_id);
-    de->de_entry_key.on_disk_key.k_objectid = le32_to_cpu (de->de_ih->ih_key.k_objectid);
-    set_cpu_key_k_offset (&(de->de_entry_key), deh_offset (deh));
-    set_cpu_key_k_type (&(de->de_entry_key), TYPE_DIRENTRY);
+	struct reiserfs_de_head *deh = de->de_deh + de->de_entry_num;
+
+	if (de->de_entry_num >= ih_entry_count(de->de_ih))
+		BUG();
+
+	/* store key of the found entry */
+	de->de_entry_key.version = KEY_FORMAT_3_5;
+	de->de_entry_key.on_disk_key.k_dir_id =
+	    le32_to_cpu(de->de_ih->ih_key.k_dir_id);
+	de->de_entry_key.on_disk_key.k_objectid =
+	    le32_to_cpu(de->de_ih->ih_key.k_objectid);
+	set_cpu_key_k_offset(&(de->de_entry_key), deh_offset(deh));
+	set_cpu_key_k_type(&(de->de_entry_key), TYPE_DIRENTRY);
 }
 
-
 /* We assign a key to each directory item, and place multiple entries
 in a single directory item.  A directory item has a key equal to the
 key of the first directory entry in it.
@@ -117,58 +116,60 @@ entry position in the item
 */
 
 /* The function is NOT SCHEDULE-SAFE! */
-int search_by_entry_key (struct super_block * sb, const struct cpu_key * key,
-			 struct path * path, struct reiserfs_dir_entry * de)
+int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
+			struct path *path, struct reiserfs_dir_entry *de)
 {
-    int retval;
-
-    retval = search_item (sb, key, path);
-    switch (retval) {
-    case ITEM_NOT_FOUND:
-	if (!PATH_LAST_POSITION (path)) {
-	    reiserfs_warning (sb, "vs-7000: search_by_entry_key: search_by_key returned item position == 0");
-	    pathrelse(path) ;
-	    return IO_ERROR ;
+	int retval;
+
+	retval = search_item(sb, key, path);
+	switch (retval) {
+	case ITEM_NOT_FOUND:
+		if (!PATH_LAST_POSITION(path)) {
+			reiserfs_warning(sb,
+					 "vs-7000: search_by_entry_key: search_by_key returned item position == 0");
+			pathrelse(path);
+			return IO_ERROR;
+		}
+		PATH_LAST_POSITION(path)--;
+
+	case ITEM_FOUND:
+		break;
+
+	case IO_ERROR:
+		return retval;
+
+	default:
+		pathrelse(path);
+		reiserfs_warning(sb,
+				 "vs-7002: search_by_entry_key: no path to here");
+		return IO_ERROR;
 	}
-	PATH_LAST_POSITION (path) --;
-
-    case ITEM_FOUND:
-	break;
-
-    case IO_ERROR:
-	return retval;
 
-    default:
-	pathrelse (path);
-	reiserfs_warning (sb, "vs-7002: search_by_entry_key: no path to here");
-	return IO_ERROR;
-    }
-
-    set_de_item_location (de, path);
+	set_de_item_location(de, path);
 
 #ifdef CONFIG_REISERFS_CHECK
-    if (!is_direntry_le_ih (de->de_ih) || 
-	COMP_SHORT_KEYS (&(de->de_ih->ih_key), key)) {
-	print_block (de->de_bh, 0, -1, -1);
-	reiserfs_panic (sb, "vs-7005: search_by_entry_key: found item %h is not directory item or "
-                        "does not belong to the same directory as key %K", de->de_ih, key);
-    }
-#endif /* CONFIG_REISERFS_CHECK */
-
-    /* binary search in directory item by third componen t of the
-       key. sets de->de_entry_num of de */
-    retval = bin_search_in_dir_item (de, cpu_key_k_offset (key));
-    path->pos_in_item = de->de_entry_num;
-    if (retval != NAME_NOT_FOUND) {
-	// ugly, but rename needs de_bh, de_deh, de_name, de_namelen, de_objectid set
-	set_de_name_and_namelen (de);
-	set_de_object_key (de);
-    }
-    return retval;
+	if (!is_direntry_le_ih(de->de_ih) ||
+	    COMP_SHORT_KEYS(&(de->de_ih->ih_key), key)) {
+		print_block(de->de_bh, 0, -1, -1);
+		reiserfs_panic(sb,
+			       "vs-7005: search_by_entry_key: found item %h is not directory item or "
+			       "does not belong to the same directory as key %K",
+			       de->de_ih, key);
+	}
+#endif				/* CONFIG_REISERFS_CHECK */
+
+	/* binary search in directory item by third component of the
+	   key. sets de->de_entry_num of de */
+	retval = bin_search_in_dir_item(de, cpu_key_k_offset(key));
+	path->pos_in_item = de->de_entry_num;
+	if (retval != NAME_NOT_FOUND) {
+		// ugly, but rename needs de_bh, de_deh, de_name, de_namelen, de_objectid set
+		set_de_name_and_namelen(de);
+		set_de_object_key(de);
+	}
+	return retval;
 }
 
-
-
 /* Keyed 32-bit hash function using TEA in a Davis-Meyer function */
 
 /* The third component is hashed, and you can choose from more than
@@ -176,197 +177,210 @@ int search_by_entry_key (struct super_block * sb, const struct cpu_key * key,
    but are thought about. This function should be moved to hashes.c
    Jedi, please do so.  -Hans */
 
-static __u32 get_third_component (struct super_block * s, 
-				  const char * name, int len)
+static __u32 get_third_component(struct super_block *s,
+				 const char *name, int len)
 {
-    __u32 res;
-
-    if (!len || (len == 1 && name[0] == '.'))
-	return DOT_OFFSET;
-    if (len == 2 && name[0] == '.' && name[1] == '.')
-	return DOT_DOT_OFFSET;
-
-    res = REISERFS_SB(s)->s_hash_function (name, len);
-
-    // take bits from 7-th to 30-th including both bounds
-    res = GET_HASH_VALUE(res);
-    if (res == 0)
-	// needed to have no names before "." and ".." those have hash
-	// value == 0 and generation conters 1 and 2 accordingly
-	res = 128;
-    return res + MAX_GENERATION_NUMBER;
+	__u32 res;
+
+	if (!len || (len == 1 && name[0] == '.'))
+		return DOT_OFFSET;
+	if (len == 2 && name[0] == '.' && name[1] == '.')
+		return DOT_DOT_OFFSET;
+
+	res = REISERFS_SB(s)->s_hash_function(name, len);
+
+	// take bits from 7-th to 30-th including both bounds
+	res = GET_HASH_VALUE(res);
+	if (res == 0)
+		// needed to have no names before "." and "..", which have hash
+		// value == 0 and generation counters 1 and 2 respectively
+		res = 128;
+	return res + MAX_GENERATION_NUMBER;
 }
 
-
-static int reiserfs_match (struct reiserfs_dir_entry * de, 
-			   const char * name, int namelen)
+static int reiserfs_match(struct reiserfs_dir_entry *de,
+			  const char *name, int namelen)
 {
-    int retval = NAME_NOT_FOUND;
+	int retval = NAME_NOT_FOUND;
 
-    if ((namelen == de->de_namelen) &&
-	!memcmp(de->de_name, name, de->de_namelen))
-	retval = (de_visible (de->de_deh + de->de_entry_num) ? NAME_FOUND : NAME_FOUND_INVISIBLE);
+	if ((namelen == de->de_namelen) &&
+	    !memcmp(de->de_name, name, de->de_namelen))
+		retval =
+		    (de_visible(de->de_deh + de->de_entry_num) ? NAME_FOUND :
+		     NAME_FOUND_INVISIBLE);
 
-    return retval;
+	return retval;
 }
 
-
 /* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */
 
 				/* used when hash collisions exist */
 
-
-static int linear_search_in_dir_item (struct cpu_key * key, struct reiserfs_dir_entry * de,
-				      const char * name, int namelen)
+static int linear_search_in_dir_item(struct cpu_key *key,
+				     struct reiserfs_dir_entry *de,
+				     const char *name, int namelen)
 {
-    struct reiserfs_de_head * deh = de->de_deh;
-    int retval;
-    int i;
+	struct reiserfs_de_head *deh = de->de_deh;
+	int retval;
+	int i;
 
-    i = de->de_entry_num;
+	i = de->de_entry_num;
 
-    if (i == I_ENTRY_COUNT (de->de_ih) ||
-	GET_HASH_VALUE (deh_offset (deh + i)) != GET_HASH_VALUE (cpu_key_k_offset (key))) {
-	i --;
-    }
+	if (i == I_ENTRY_COUNT(de->de_ih) ||
+	    GET_HASH_VALUE(deh_offset(deh + i)) !=
+	    GET_HASH_VALUE(cpu_key_k_offset(key))) {
+		i--;
+	}
 
-    RFALSE( de->de_deh != B_I_DEH (de->de_bh, de->de_ih),
-	    "vs-7010: array of entry headers not found");
+	RFALSE(de->de_deh != B_I_DEH(de->de_bh, de->de_ih),
+	       "vs-7010: array of entry headers not found");
 
-    deh += i;
+	deh += i;
 
-    for (; i >= 0; i --, deh --) {
-	if (GET_HASH_VALUE (deh_offset (deh)) !=
-	    GET_HASH_VALUE (cpu_key_k_offset (key))) {
-	    // hash value does not match, no need to check whole name
-	    return NAME_NOT_FOUND;
-	}
-   
-	/* mark, that this generation number is used */
-	if (de->de_gen_number_bit_string)
-	    set_bit (GET_GENERATION_NUMBER (deh_offset (deh)), (unsigned long *)de->de_gen_number_bit_string);
+	for (; i >= 0; i--, deh--) {
+		if (GET_HASH_VALUE(deh_offset(deh)) !=
+		    GET_HASH_VALUE(cpu_key_k_offset(key))) {
+			// hash value does not match, no need to check whole name
+			return NAME_NOT_FOUND;
+		}
+
+		/* mark, that this generation number is used */
+		if (de->de_gen_number_bit_string)
+			set_bit(GET_GENERATION_NUMBER(deh_offset(deh)),
+				(unsigned long *)de->de_gen_number_bit_string);
 
-	// calculate pointer to name and namelen
-	de->de_entry_num = i;
-	set_de_name_and_namelen (de);
+		// calculate pointer to name and namelen
+		de->de_entry_num = i;
+		set_de_name_and_namelen(de);
 
-	if ((retval = reiserfs_match (de, name, namelen)) != NAME_NOT_FOUND) {
-	    // de's de_name, de_namelen, de_recordlen are set. Fill the rest:
+		if ((retval =
+		     reiserfs_match(de, name, namelen)) != NAME_NOT_FOUND) {
+			// de's de_name, de_namelen, de_entrylen are set. Fill the rest:
 
-	    // key of pointed object
-	    set_de_object_key (de);
+			// key of pointed object
+			set_de_object_key(de);
 
-	    store_de_entry_key (de);
+			store_de_entry_key(de);
 
-	    // retval can be NAME_FOUND or NAME_FOUND_INVISIBLE
-	    return retval;
+			// retval can be NAME_FOUND or NAME_FOUND_INVISIBLE
+			return retval;
+		}
 	}
-    }
-
-    if (GET_GENERATION_NUMBER (le_ih_k_offset (de->de_ih)) == 0)
-	/* we have reached left most entry in the node. In common we
-           have to go to the left neighbor, but if generation counter
-           is 0 already, we know for sure, that there is no name with
-           the same hash value */
-	// FIXME: this work correctly only because hash value can not
-	// be 0. Btw, in case of Yura's hash it is probably possible,
-	// so, this is a bug
-	return NAME_NOT_FOUND;
 
-    RFALSE( de->de_item_num,
-	    "vs-7015: two diritems of the same directory in one node?");
+	if (GET_GENERATION_NUMBER(le_ih_k_offset(de->de_ih)) == 0)
+		/* we have reached the leftmost entry in the node. In general
+		   we have to go to the left neighbor, but if the generation
+		   counter is already 0, we know for sure that there is no
+		   name with the same hash value */
+		// FIXME: this works correctly only because the hash value can
+		// not be 0. Btw, in case of Yura's hash it is probably
+		// possible, so this is a bug
+		return NAME_NOT_FOUND;
 
-    return GOTO_PREVIOUS_ITEM;
-}
+	RFALSE(de->de_item_num,
+	       "vs-7015: two diritems of the same directory in one node?");
 
+	return GOTO_PREVIOUS_ITEM;
+}
 
 // may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND
 // FIXME: should add something like IOERROR
-static int reiserfs_find_entry (struct inode * dir, const char * name, int namelen, 
-				struct path * path_to_entry, struct reiserfs_dir_entry * de)
+static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen,
+			       struct path *path_to_entry,
+			       struct reiserfs_dir_entry *de)
 {
-    struct cpu_key key_to_search;
-    int retval;
-
-
-    if (namelen > REISERFS_MAX_NAME (dir->i_sb->s_blocksize))
-	return NAME_NOT_FOUND;
-
-    /* we will search for this key in the tree */
-    make_cpu_key (&key_to_search, dir, 
-		  get_third_component (dir->i_sb, name, namelen), TYPE_DIRENTRY, 3);
-
-    while (1) {
-	retval = search_by_entry_key (dir->i_sb, &key_to_search, path_to_entry, de);
-	if (retval == IO_ERROR) {
-	    reiserfs_warning (dir->i_sb, "zam-7001: io error in %s",
-			      __FUNCTION__);
-	    return IO_ERROR;
-	}
-
-	/* compare names for all entries having given hash value */
-	retval = linear_search_in_dir_item (&key_to_search, de, name, namelen);
-	if (retval != GOTO_PREVIOUS_ITEM) {
-	    /* there is no need to scan directory anymore. Given entry found or does not exist */
-	    path_to_entry->pos_in_item = de->de_entry_num;
-	    return retval;
-	}
-
-	/* there is left neighboring item of this directory and given entry can be there */
-	set_cpu_key_k_offset (&key_to_search, le_ih_k_offset (de->de_ih) - 1);
-	pathrelse (path_to_entry);
-
-    } /* while (1) */
+	struct cpu_key key_to_search;
+	int retval;
+
+	if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize))
+		return NAME_NOT_FOUND;
+
+	/* we will search for this key in the tree */
+	make_cpu_key(&key_to_search, dir,
+		     get_third_component(dir->i_sb, name, namelen),
+		     TYPE_DIRENTRY, 3);
+
+	while (1) {
+		retval =
+		    search_by_entry_key(dir->i_sb, &key_to_search,
+					path_to_entry, de);
+		if (retval == IO_ERROR) {
+			reiserfs_warning(dir->i_sb, "zam-7001: io error in %s",
+					 __FUNCTION__);
+			return IO_ERROR;
+		}
+
+		/* compare names for all entries having given hash value */
+		retval =
+		    linear_search_in_dir_item(&key_to_search, de, name,
+					      namelen);
+		if (retval != GOTO_PREVIOUS_ITEM) {
+			/* there is no need to scan directory anymore. Given entry found or does not exist */
+			path_to_entry->pos_in_item = de->de_entry_num;
+			return retval;
+		}
+
+		/* there is left neighboring item of this directory and given entry can be there */
+		set_cpu_key_k_offset(&key_to_search,
+				     le_ih_k_offset(de->de_ih) - 1);
+		pathrelse(path_to_entry);
+
+	}			/* while (1) */
 }
 
-
-static struct dentry * reiserfs_lookup (struct inode * dir, struct dentry * dentry, struct nameidata *nd)
+static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
+				      struct nameidata *nd)
 {
-    int retval;
-    struct inode * inode = NULL;
-    struct reiserfs_dir_entry de;
-    INITIALIZE_PATH (path_to_entry);
-
-    if (REISERFS_MAX_NAME (dir->i_sb->s_blocksize) < dentry->d_name.len)
-	return ERR_PTR(-ENAMETOOLONG);
-
-    reiserfs_write_lock(dir->i_sb);
-    de.de_gen_number_bit_string = NULL;
-    retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path_to_entry, &de);
-    pathrelse (&path_to_entry);
-    if (retval == NAME_FOUND) {
-        /* Hide the .reiserfs_priv directory */
-	if (reiserfs_xattrs (dir->i_sb) &&
-	    !old_format_only(dir->i_sb) &&
-            REISERFS_SB(dir->i_sb)->priv_root &&
-            REISERFS_SB(dir->i_sb)->priv_root->d_inode &&
-	    de.de_objectid == le32_to_cpu (INODE_PKEY(REISERFS_SB(dir->i_sb)->priv_root->d_inode)->k_objectid)) {
-	  reiserfs_write_unlock (dir->i_sb);
-	  return ERR_PTR (-EACCES);
+	int retval;
+	struct inode *inode = NULL;
+	struct reiserfs_dir_entry de;
+	INITIALIZE_PATH(path_to_entry);
+
+	if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	reiserfs_write_lock(dir->i_sb);
+	de.de_gen_number_bit_string = NULL;
+	retval =
+	    reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
+				&path_to_entry, &de);
+	pathrelse(&path_to_entry);
+	if (retval == NAME_FOUND) {
+		/* Hide the .reiserfs_priv directory */
+		if (reiserfs_xattrs(dir->i_sb) &&
+		    !old_format_only(dir->i_sb) &&
+		    REISERFS_SB(dir->i_sb)->priv_root &&
+		    REISERFS_SB(dir->i_sb)->priv_root->d_inode &&
+		    de.de_objectid ==
+		    le32_to_cpu(INODE_PKEY
+				(REISERFS_SB(dir->i_sb)->priv_root->d_inode)->
+				k_objectid)) {
+			reiserfs_write_unlock(dir->i_sb);
+			return ERR_PTR(-EACCES);
+		}
+
+		inode =
+		    reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id));
+		if (!inode || IS_ERR(inode)) {
+			reiserfs_write_unlock(dir->i_sb);
+			return ERR_PTR(-EACCES);
+		}
+
+		/* Propogate the priv_object flag so we know we're in the priv tree */
+		if (is_reiserfs_priv_object(dir))
+			reiserfs_mark_inode_private(inode);
+	}
+	reiserfs_write_unlock(dir->i_sb);
+	if (retval == IO_ERROR) {
+		return ERR_PTR(-EIO);
 	}
 
-	inode = reiserfs_iget (dir->i_sb, (struct cpu_key *)&(de.de_dir_id));
-	if (!inode || IS_ERR(inode)) {
-	    reiserfs_write_unlock(dir->i_sb);
-	    return ERR_PTR(-EACCES);
-        }
-
-	/* Propogate the priv_object flag so we know we're in the priv tree */
-	if (is_reiserfs_priv_object (dir))
-	    reiserfs_mark_inode_private (inode);
-    }
-    reiserfs_write_unlock(dir->i_sb);
-    if ( retval == IO_ERROR ) {
-	return ERR_PTR(-EIO);
-    }
-
-    if (inode)
-	    return d_splice_alias(inode, dentry);
-    
-    d_add(dentry, inode);
-    return NULL;
-}
+	if (inode)
+		return d_splice_alias(inode, dentry);
 
+	d_add(dentry, inode);
+	return NULL;
+}
 
 /* 
 ** looks up the dentry of the parent directory for child.
@@ -374,40 +388,38 @@ static struct dentry * reiserfs_lookup (struct inode * dir, struct dentry * dent
 */
 struct dentry *reiserfs_get_parent(struct dentry *child)
 {
-    int retval;
-    struct inode * inode = NULL;
-    struct reiserfs_dir_entry de;
-    INITIALIZE_PATH (path_to_entry);
-    struct dentry *parent;
-    struct inode *dir = child->d_inode ;
-
-
-    if (dir->i_nlink == 0) {
-	return ERR_PTR(-ENOENT);
-    }
-    de.de_gen_number_bit_string = NULL;
-
-    reiserfs_write_lock(dir->i_sb);
-    retval = reiserfs_find_entry (dir, "..", 2, &path_to_entry, &de);
-    pathrelse (&path_to_entry);
-    if (retval != NAME_FOUND) {
+	int retval;
+	struct inode *inode = NULL;
+	struct reiserfs_dir_entry de;
+	INITIALIZE_PATH(path_to_entry);
+	struct dentry *parent;
+	struct inode *dir = child->d_inode;
+
+	if (dir->i_nlink == 0) {
+		return ERR_PTR(-ENOENT);
+	}
+	de.de_gen_number_bit_string = NULL;
+
+	reiserfs_write_lock(dir->i_sb);
+	retval = reiserfs_find_entry(dir, "..", 2, &path_to_entry, &de);
+	pathrelse(&path_to_entry);
+	if (retval != NAME_FOUND) {
+		reiserfs_write_unlock(dir->i_sb);
+		return ERR_PTR(-ENOENT);
+	}
+	inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id));
 	reiserfs_write_unlock(dir->i_sb);
-	return ERR_PTR(-ENOENT);
-    }
-    inode = reiserfs_iget (dir->i_sb, (struct cpu_key *)&(de.de_dir_id));
-    reiserfs_write_unlock(dir->i_sb);
-
-    if (!inode || IS_ERR(inode)) {
-	return ERR_PTR(-EACCES);
-    }
-    parent = d_alloc_anon(inode);
-    if (!parent) {
-	iput(inode);
-	parent = ERR_PTR(-ENOMEM);
-    }
-    return parent;
-}
 
+	if (!inode || IS_ERR(inode)) {
+		return ERR_PTR(-EACCES);
+	}
+	parent = d_alloc_anon(inode);
+	if (!parent) {
+		iput(inode);
+		parent = ERR_PTR(-ENOMEM);
+	}
+	return parent;
+}
 
 /* add entry to the directory (entry can be hidden). 
 
@@ -415,132 +427,143 @@ insert definition of when hidden directories are used here -Hans
 
  Does not mark dir   inode dirty, do it after successesfull call to it */
 
-static int reiserfs_add_entry (struct reiserfs_transaction_handle *th, struct inode * dir,
-                               const char * name, int namelen, struct inode * inode,
-			       int visible)
+static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
+			      struct inode *dir, const char *name, int namelen,
+			      struct inode *inode, int visible)
 {
-    struct cpu_key entry_key;
-    struct reiserfs_de_head * deh;
-    INITIALIZE_PATH (path);
-    struct reiserfs_dir_entry de;
-    int bit_string [MAX_GENERATION_NUMBER / (sizeof(int) * 8) + 1];
-    int gen_number;
-    char small_buf[32+DEH_SIZE] ; /* 48 bytes now and we avoid kmalloc
-                                     if we create file with short name */
-    char * buffer;
-    int buflen, paste_size;
-    int retval;
-
-    BUG_ON (!th->t_trans_id);
-
-    /* cannot allow items to be added into a busy deleted directory */
-    if (!namelen)
-	return -EINVAL;
-
-    if (namelen > REISERFS_MAX_NAME (dir->i_sb->s_blocksize))
-	return -ENAMETOOLONG;
-
-    /* each entry has unique key. compose it */
-    make_cpu_key (&entry_key, dir, 
-		  get_third_component (dir->i_sb, name, namelen), TYPE_DIRENTRY, 3);
-
-    /* get memory for composing the entry */
-    buflen = DEH_SIZE + ROUND_UP (namelen);
-    if (buflen > sizeof (small_buf)) {
-	buffer = reiserfs_kmalloc (buflen, GFP_NOFS, dir->i_sb);
-	if (buffer == 0)
-	    return -ENOMEM;
-    } else
-	buffer = small_buf;
-
-    paste_size = (get_inode_sd_version (dir) == STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen;
-
-    /* fill buffer : directory entry head, name[, dir objectid | , stat data | ,stat data, dir objectid ] */
-    deh = (struct reiserfs_de_head *)buffer;
-    deh->deh_location = 0; /* JDM Endian safe if 0 */
-    put_deh_offset( deh, cpu_key_k_offset( &entry_key ) );
-    deh->deh_state = 0; /* JDM Endian safe if 0 */
-    /* put key (ino analog) to de */
-    deh->deh_dir_id = INODE_PKEY (inode)->k_dir_id; /* safe: k_dir_id is le */
-    deh->deh_objectid = INODE_PKEY (inode)->k_objectid; /* safe: k_objectid is le */
-
-    /* copy name */
-    memcpy ((char *)(deh + 1), name, namelen);
-    /* padd by 0s to the 4 byte boundary */
-    padd_item ((char *)(deh + 1), ROUND_UP (namelen), namelen);
-
-    /* entry is ready to be pasted into tree, set 'visibility' and 'stat data in entry' attributes */
-    mark_de_without_sd (deh);
-    visible ? mark_de_visible (deh) : mark_de_hidden (deh);
-
-    /* find the proper place for the new entry */
-    memset (bit_string, 0, sizeof (bit_string));
-    de.de_gen_number_bit_string = (char *)bit_string;
-    retval = reiserfs_find_entry (dir, name, namelen, &path, &de);
-    if( retval != NAME_NOT_FOUND ) {
-	if (buffer != small_buf)
-	    reiserfs_kfree (buffer, buflen, dir->i_sb);
-	pathrelse (&path);
+	struct cpu_key entry_key;
+	struct reiserfs_de_head *deh;
+	INITIALIZE_PATH(path);
+	struct reiserfs_dir_entry de;
+	int bit_string[MAX_GENERATION_NUMBER / (sizeof(int) * 8) + 1];
+	int gen_number;
+	char small_buf[32 + DEH_SIZE];	/* 48 bytes now and we avoid kmalloc
+					   if we create file with short name */
+	char *buffer;
+	int buflen, paste_size;
+	int retval;
+
+	BUG_ON(!th->t_trans_id);
+
+	/* cannot allow items to be added into a busy deleted directory */
+	if (!namelen)
+		return -EINVAL;
+
+	if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize))
+		return -ENAMETOOLONG;
+
+	/* each entry has unique key. compose it */
+	make_cpu_key(&entry_key, dir,
+		     get_third_component(dir->i_sb, name, namelen),
+		     TYPE_DIRENTRY, 3);
+
+	/* get memory for composing the entry */
+	buflen = DEH_SIZE + ROUND_UP(namelen);
+	if (buflen > sizeof(small_buf)) {
+		buffer = reiserfs_kmalloc(buflen, GFP_NOFS, dir->i_sb);
+		if (buffer == 0)
+			return -ENOMEM;
+	} else
+		buffer = small_buf;
+
+	paste_size =
+	    (get_inode_sd_version(dir) ==
+	     STAT_DATA_V1) ? (DEH_SIZE + namelen) : buflen;
+
+	/* fill buffer : directory entry head, name[, dir objectid | , stat data | ,stat data, dir objectid ] */
+	deh = (struct reiserfs_de_head *)buffer;
+	deh->deh_location = 0;	/* JDM Endian safe if 0 */
+	put_deh_offset(deh, cpu_key_k_offset(&entry_key));
+	deh->deh_state = 0;	/* JDM Endian safe if 0 */
+	/* put key (ino analog) to de */
+	deh->deh_dir_id = INODE_PKEY(inode)->k_dir_id;	/* safe: k_dir_id is le */
+	deh->deh_objectid = INODE_PKEY(inode)->k_objectid;	/* safe: k_objectid is le */
+
+	/* copy name */
+	memcpy((char *)(deh + 1), name, namelen);
+	/* padd by 0s to the 4 byte boundary */
+	padd_item((char *)(deh + 1), ROUND_UP(namelen), namelen);
+
+	/* entry is ready to be pasted into tree, set 'visibility' and 'stat data in entry' attributes */
+	mark_de_without_sd(deh);
+	visible ? mark_de_visible(deh) : mark_de_hidden(deh);
+
+	/* find the proper place for the new entry */
+	memset(bit_string, 0, sizeof(bit_string));
+	de.de_gen_number_bit_string = (char *)bit_string;
+	retval = reiserfs_find_entry(dir, name, namelen, &path, &de);
+	if (retval != NAME_NOT_FOUND) {
+		if (buffer != small_buf)
+			reiserfs_kfree(buffer, buflen, dir->i_sb);
+		pathrelse(&path);
+
+		if (retval == IO_ERROR) {
+			return -EIO;
+		}
+
+		if (retval != NAME_FOUND) {
+			reiserfs_warning(dir->i_sb,
+					 "zam-7002:%s: \"reiserfs_find_entry\" "
+					 "has returned unexpected value (%d)",
+					 __FUNCTION__, retval);
+		}
+
+		return -EEXIST;
+	}
 
-	if ( retval == IO_ERROR ) {
-	    return -EIO;
+	gen_number =
+	    find_first_zero_bit((unsigned long *)bit_string,
+				MAX_GENERATION_NUMBER + 1);
+	if (gen_number > MAX_GENERATION_NUMBER) {
+		/* there is no free generation number */
+		reiserfs_warning(dir->i_sb,
+				 "reiserfs_add_entry: Congratulations! we have got hash function screwed up");
+		if (buffer != small_buf)
+			reiserfs_kfree(buffer, buflen, dir->i_sb);
+		pathrelse(&path);
+		return -EBUSY;
+	}
+	/* adjust offset of directory enrty */
+	put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number));
+	set_cpu_key_k_offset(&entry_key, deh_offset(deh));
+
+	/* update max-hash-collisions counter in reiserfs_sb_info */
+	PROC_INFO_MAX(th->t_super, max_hash_collisions, gen_number);
+
+	if (gen_number != 0) {	/* we need to re-search for the insertion point */
+		if (search_by_entry_key(dir->i_sb, &entry_key, &path, &de) !=
+		    NAME_NOT_FOUND) {
+			reiserfs_warning(dir->i_sb,
+					 "vs-7032: reiserfs_add_entry: "
+					 "entry with this key (%K) already exists",
+					 &entry_key);
+
+			if (buffer != small_buf)
+				reiserfs_kfree(buffer, buflen, dir->i_sb);
+			pathrelse(&path);
+			return -EBUSY;
+		}
 	}
 
-        if (retval != NAME_FOUND) {
-	    reiserfs_warning (dir->i_sb, "zam-7002:%s: \"reiserfs_find_entry\" "
-			      "has returned unexpected value (%d)",
-			      __FUNCTION__, retval);
-       }
-
-	return -EEXIST;
-    }
-
-    gen_number = find_first_zero_bit ((unsigned long *)bit_string, MAX_GENERATION_NUMBER + 1);
-    if (gen_number > MAX_GENERATION_NUMBER) {
-      /* there is no free generation number */
-      reiserfs_warning (dir->i_sb, "reiserfs_add_entry: Congratulations! we have got hash function screwed up");
-      if (buffer != small_buf)
-          reiserfs_kfree (buffer, buflen, dir->i_sb);
-      pathrelse (&path);
-      return -EBUSY;
-    }
-    /* adjust offset of directory enrty */
-    put_deh_offset(deh, SET_GENERATION_NUMBER(deh_offset(deh), gen_number));
-    set_cpu_key_k_offset (&entry_key, deh_offset(deh));
- 
-    /* update max-hash-collisions counter in reiserfs_sb_info */
-    PROC_INFO_MAX( th -> t_super, max_hash_collisions, gen_number );
- 		  
-    if (gen_number != 0) {	/* we need to re-search for the insertion point */
-      if (search_by_entry_key (dir->i_sb, &entry_key, &path, &de) != NAME_NOT_FOUND) {
-            reiserfs_warning (dir->i_sb, "vs-7032: reiserfs_add_entry: "
-                              "entry with this key (%K) already exists",
-                              &entry_key);
-
-	    if (buffer != small_buf)
-		reiserfs_kfree (buffer, buflen, dir->i_sb);
-	    pathrelse (&path);
-	    return -EBUSY;
+	/* perform the insertion of the entry that we have prepared */
+	retval =
+	    reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer,
+				     paste_size);
+	if (buffer != small_buf)
+		reiserfs_kfree(buffer, buflen, dir->i_sb);
+	if (retval) {
+		reiserfs_check_path(&path);
+		return retval;
 	}
-    }
-  
-    /* perform the insertion of the entry that we have prepared */
-    retval = reiserfs_paste_into_item (th, &path, &entry_key, dir, buffer, paste_size);
-    if (buffer != small_buf)
-	reiserfs_kfree (buffer, buflen, dir->i_sb);
-    if (retval) {
-	reiserfs_check_path(&path) ;
-	return retval;
-    }
 
-    dir->i_size += paste_size;
-    dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
-    if (!S_ISDIR (inode->i_mode) && visible)
-	// reiserfs_mkdir or reiserfs_rename will do that by itself
-	reiserfs_update_sd (th, dir);
+	dir->i_size += paste_size;
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	if (!S_ISDIR(inode->i_mode) && visible)
+		// reiserfs_mkdir or reiserfs_rename will do that by itself
+		reiserfs_update_sd(th, dir);
 
-    reiserfs_check_path(&path) ;
-    return 0;
+	reiserfs_check_path(&path);
+	return 0;
 }
 
 /* quota utility function, call if you've had to abort after calling
@@ -548,12 +571,13 @@ static int reiserfs_add_entry (struct reiserfs_transaction_handle *th, struct in
 ** This should only be called on inodes that do not have stat data
 ** inserted into the tree yet.
 */
-static int drop_new_inode(struct inode *inode) {
-    DQUOT_DROP(inode);
-    make_bad_inode(inode) ;
-    inode->i_flags |= S_NOQUOTA;
-    iput(inode) ;
-    return 0 ;
+static int drop_new_inode(struct inode *inode)
+{
+	DQUOT_DROP(inode);
+	make_bad_inode(inode);
+	inode->i_flags |= S_NOQUOTA;
+	iput(inode);
+	return 0;
 }
 
 /* utility function that does setup for reiserfs_new_inode.  
@@ -561,905 +585,968 @@ static int drop_new_inode(struct inode *inode) {
 ** outside of a transaction, so we had to pull some bits of
 ** reiserfs_new_inode out into this func.
 */
-static int new_inode_init(struct inode *inode, struct inode *dir, int mode) {
-
-    /* the quota init calls have to know who to charge the quota to, so
-    ** we have to set uid and gid here
-    */
-    inode->i_uid = current->fsuid;
-    inode->i_mode = mode;
-
-    if (dir->i_mode & S_ISGID) {
-        inode->i_gid = dir->i_gid;
-        if (S_ISDIR(mode))
-            inode->i_mode |= S_ISGID;
-    } else {
-        inode->i_gid = current->fsgid;
-    }
-    DQUOT_INIT(inode);
-    return 0 ;
+static int new_inode_init(struct inode *inode, struct inode *dir, int mode)
+{
+
+	/* the quota init calls have to know who to charge the quota to, so
+	 ** we have to set uid and gid here
+	 */
+	inode->i_uid = current->fsuid;
+	inode->i_mode = mode;
+
+	if (dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			inode->i_mode |= S_ISGID;
+	} else {
+		inode->i_gid = current->fsgid;
+	}
+	DQUOT_INIT(inode);
+	return 0;
 }
 
-static int reiserfs_create (struct inode * dir, struct dentry *dentry, int mode,
-		struct nameidata *nd)
+static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
+			   struct nameidata *nd)
 {
-    int retval;
-    struct inode * inode;
-    /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
-    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb)+REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
-    struct reiserfs_transaction_handle th ;
-    int locked;
-
-    if (!(inode = new_inode(dir->i_sb))) {
-	return -ENOMEM ;
-    }
-    new_inode_init(inode, dir, mode);
-
-    locked = reiserfs_cache_default_acl (dir);
-
-    reiserfs_write_lock(dir->i_sb);
-
-    if (locked)
-        reiserfs_write_lock_xattrs (dir->i_sb);
-
-    retval = journal_begin(&th, dir->i_sb, jbegin_count);
-    if (retval) {
-        drop_new_inode (inode);
-        goto out_failed;
-    }
-
-    retval = reiserfs_new_inode (&th, dir, mode, NULL, 0/*i_size*/, dentry, inode);
-    if (retval)
-        goto out_failed;
-	
-    if (locked) {
-        reiserfs_write_unlock_xattrs (dir->i_sb);
-        locked = 0;
-    }
-
-    inode->i_op = &reiserfs_file_inode_operations;
-    inode->i_fop = &reiserfs_file_operations;
-    inode->i_mapping->a_ops = &reiserfs_address_space_operations ;
-
-    retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, 
-				inode, 1/*visible*/);
-    if (retval) {
-        int err;
-	inode->i_nlink--;
-	reiserfs_update_sd (&th, inode);
-	err = journal_end(&th, dir->i_sb, jbegin_count) ;
-        if (err)
-            retval = err;
-	iput (inode);
-	goto out_failed;
-    }
-    reiserfs_update_inode_transaction(inode) ;
-    reiserfs_update_inode_transaction(dir) ;
-
-    d_instantiate(dentry, inode);
-    retval = journal_end(&th, dir->i_sb, jbegin_count) ;
-
-out_failed:
-    if (locked)
-        reiserfs_write_unlock_xattrs (dir->i_sb);
-    reiserfs_write_unlock(dir->i_sb);
-    return retval;
-}
+	int retval;
+	struct inode *inode;
+	/* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
+	int jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 2 +
+	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
+		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
+	struct reiserfs_transaction_handle th;
+	int locked;
+
+	if (!(inode = new_inode(dir->i_sb))) {
+		return -ENOMEM;
+	}
+	new_inode_init(inode, dir, mode);
 
+	locked = reiserfs_cache_default_acl(dir);
 
-static int reiserfs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t rdev)
-{
-    int retval;
-    struct inode * inode;
-    struct reiserfs_transaction_handle th ;
-    /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
-    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb)+REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
-    int locked;
+	reiserfs_write_lock(dir->i_sb);
 
-    if (!new_valid_dev(rdev))
-	return -EINVAL;
+	if (locked)
+		reiserfs_write_lock_xattrs(dir->i_sb);
+
+	retval = journal_begin(&th, dir->i_sb, jbegin_count);
+	if (retval) {
+		drop_new_inode(inode);
+		goto out_failed;
+	}
+
+	retval =
+	    reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
+			       inode);
+	if (retval)
+		goto out_failed;
+
+	if (locked) {
+		reiserfs_write_unlock_xattrs(dir->i_sb);
+		locked = 0;
+	}
+
+	inode->i_op = &reiserfs_file_inode_operations;
+	inode->i_fop = &reiserfs_file_operations;
+	inode->i_mapping->a_ops = &reiserfs_address_space_operations;
+
+	retval =
+	    reiserfs_add_entry(&th, dir, dentry->d_name.name,
+			       dentry->d_name.len, inode, 1 /*visible */ );
+	if (retval) {
+		int err;
+		inode->i_nlink--;
+		reiserfs_update_sd(&th, inode);
+		err = journal_end(&th, dir->i_sb, jbegin_count);
+		if (err)
+			retval = err;
+		iput(inode);
+		goto out_failed;
+	}
+	reiserfs_update_inode_transaction(inode);
+	reiserfs_update_inode_transaction(dir);
 
-    if (!(inode = new_inode(dir->i_sb))) {
-	return -ENOMEM ;
-    }
-    new_inode_init(inode, dir, mode);
+	d_instantiate(dentry, inode);
+	retval = journal_end(&th, dir->i_sb, jbegin_count);
 
-    locked = reiserfs_cache_default_acl (dir);
+      out_failed:
+	if (locked)
+		reiserfs_write_unlock_xattrs(dir->i_sb);
+	reiserfs_write_unlock(dir->i_sb);
+	return retval;
+}
 
-    reiserfs_write_lock(dir->i_sb);
+static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
+			  dev_t rdev)
+{
+	int retval;
+	struct inode *inode;
+	struct reiserfs_transaction_handle th;
+	/* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
+	int jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 3 +
+	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
+		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
+	int locked;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	if (!(inode = new_inode(dir->i_sb))) {
+		return -ENOMEM;
+	}
+	new_inode_init(inode, dir, mode);
 
-    if (locked)
-        reiserfs_write_lock_xattrs (dir->i_sb);
+	locked = reiserfs_cache_default_acl(dir);
 
-    retval = journal_begin(&th, dir->i_sb, jbegin_count) ;
-    if (retval) {
-        drop_new_inode (inode);
-        goto out_failed;
-    }
+	reiserfs_write_lock(dir->i_sb);
 
-    retval = reiserfs_new_inode (&th, dir, mode, NULL, 0/*i_size*/, dentry, inode);
-    if (retval) {
-        goto out_failed;
-    }
+	if (locked)
+		reiserfs_write_lock_xattrs(dir->i_sb);
 
-    if (locked) {
-        reiserfs_write_unlock_xattrs (dir->i_sb);
-        locked = 0;
-    }
+	retval = journal_begin(&th, dir->i_sb, jbegin_count);
+	if (retval) {
+		drop_new_inode(inode);
+		goto out_failed;
+	}
 
+	retval =
+	    reiserfs_new_inode(&th, dir, mode, NULL, 0 /*i_size */ , dentry,
+			       inode);
+	if (retval) {
+		goto out_failed;
+	}
 
-    inode->i_op = &reiserfs_special_inode_operations;
-    init_special_inode(inode, inode->i_mode, rdev) ;
+	if (locked) {
+		reiserfs_write_unlock_xattrs(dir->i_sb);
+		locked = 0;
+	}
 
-    //FIXME: needed for block and char devices only
-    reiserfs_update_sd (&th, inode);
+	inode->i_op = &reiserfs_special_inode_operations;
+	init_special_inode(inode, inode->i_mode, rdev);
+
+	//FIXME: needed for block and char devices only
+	reiserfs_update_sd(&th, inode);
+
+	reiserfs_update_inode_transaction(inode);
+	reiserfs_update_inode_transaction(dir);
+
+	retval =
+	    reiserfs_add_entry(&th, dir, dentry->d_name.name,
+			       dentry->d_name.len, inode, 1 /*visible */ );
+	if (retval) {
+		int err;
+		inode->i_nlink--;
+		reiserfs_update_sd(&th, inode);
+		err = journal_end(&th, dir->i_sb, jbegin_count);
+		if (err)
+			retval = err;
+		iput(inode);
+		goto out_failed;
+	}
 
-    reiserfs_update_inode_transaction(inode) ;
-    reiserfs_update_inode_transaction(dir) ;
+	d_instantiate(dentry, inode);
+	retval = journal_end(&th, dir->i_sb, jbegin_count);
 
-    retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, 
-				 inode, 1/*visible*/);
-    if (retval) {
-        int err;
-	inode->i_nlink--;
-	reiserfs_update_sd (&th, inode);
-	err = journal_end(&th, dir->i_sb, jbegin_count) ;
-        if (err)
-	    retval = err;
-	iput (inode);
-	goto out_failed;
-    }
-
-    d_instantiate(dentry, inode);
-    retval = journal_end(&th, dir->i_sb, jbegin_count) ;
-
-out_failed:
-    if (locked)
-        reiserfs_write_unlock_xattrs (dir->i_sb);
-    reiserfs_write_unlock(dir->i_sb);
-    return retval;
+      out_failed:
+	if (locked)
+		reiserfs_write_unlock_xattrs(dir->i_sb);
+	reiserfs_write_unlock(dir->i_sb);
+	return retval;
 }
 
-
-static int reiserfs_mkdir (struct inode * dir, struct dentry *dentry, int mode)
+static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-    int retval;
-    struct inode * inode;
-    struct reiserfs_transaction_handle th ;
-    /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
-    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb)+REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
-    int locked;
+	int retval;
+	struct inode *inode;
+	struct reiserfs_transaction_handle th;
+	/* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
+	int jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 3 +
+	    2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
+		 REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
+	int locked;
 
 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
-    /* set flag that new packing locality created and new blocks for the content     * of that directory are not displaced yet */
-    REISERFS_I(dir)->new_packing_locality = 1;
+	/* set flag that new packing locality created and new blocks for the content     * of that directory are not displaced yet */
+	REISERFS_I(dir)->new_packing_locality = 1;
 #endif
-    mode = S_IFDIR | mode;
-    if (!(inode = new_inode(dir->i_sb))) {
-	return -ENOMEM ;
-    }
-    new_inode_init(inode, dir, mode);
-
-    locked = reiserfs_cache_default_acl (dir);
-
-    reiserfs_write_lock(dir->i_sb);
-    if (locked)
-        reiserfs_write_lock_xattrs (dir->i_sb);
-
-    retval = journal_begin(&th, dir->i_sb, jbegin_count) ;
-    if (retval) {
-        drop_new_inode (inode);
-        goto out_failed;
-    }
-
-
-    /* inc the link count now, so another writer doesn't overflow it while
-    ** we sleep later on.
-    */
-    INC_DIR_INODE_NLINK(dir)
-
-    retval = reiserfs_new_inode (&th, dir, mode, NULL/*symlink*/,
-				old_format_only (dir->i_sb) ? 
-				EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
-				dentry, inode);
-    if (retval) {
-	dir->i_nlink-- ;
-	goto out_failed;
-    }
-
-    if (locked) {
-	reiserfs_write_unlock_xattrs (dir->i_sb);
-	locked = 0;
-    }
-
-    reiserfs_update_inode_transaction(inode) ;
-    reiserfs_update_inode_transaction(dir) ;
-
-    inode->i_op = &reiserfs_dir_inode_operations;
-    inode->i_fop = &reiserfs_dir_operations;
-
-    // note, _this_ add_entry will not update dir's stat data
-    retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, 
-				inode, 1/*visible*/);
-    if (retval) {
-	int err;
-	inode->i_nlink = 0;
-	DEC_DIR_INODE_NLINK(dir);
-	reiserfs_update_sd (&th, inode);
-	err = journal_end(&th, dir->i_sb, jbegin_count) ;
-	if (err)
-	    retval = err;
-	iput (inode);
-	goto out_failed;
-    }
-
-    // the above add_entry did not update dir's stat data
-    reiserfs_update_sd (&th, dir);
-
-    d_instantiate(dentry, inode);
-    retval = journal_end(&th, dir->i_sb, jbegin_count) ;
-out_failed:
-    if (locked)
-        reiserfs_write_unlock_xattrs (dir->i_sb);
-    reiserfs_write_unlock(dir->i_sb);
-    return retval;
-}
+	mode = S_IFDIR | mode;
+	if (!(inode = new_inode(dir->i_sb))) {
+		return -ENOMEM;
+	}
+	new_inode_init(inode, dir, mode);
+
+	locked = reiserfs_cache_default_acl(dir);
+
+	reiserfs_write_lock(dir->i_sb);
+	if (locked)
+		reiserfs_write_lock_xattrs(dir->i_sb);
+
+	retval = journal_begin(&th, dir->i_sb, jbegin_count);
+	if (retval) {
+		drop_new_inode(inode);
+		goto out_failed;
+	}
 
-static inline int reiserfs_empty_dir(struct inode *inode) {
-    /* we can cheat because an old format dir cannot have
-    ** EMPTY_DIR_SIZE, and a new format dir cannot have
-    ** EMPTY_DIR_SIZE_V1.  So, if the inode is either size, 
-    ** regardless of disk format version, the directory is empty.
-    */
-    if (inode->i_size != EMPTY_DIR_SIZE &&
-        inode->i_size != EMPTY_DIR_SIZE_V1) {
-        return 0 ;
-    }
-    return 1 ;
+	/* inc the link count now, so another writer doesn't overflow it while
+	 ** we sleep later on.
+	 */
+	INC_DIR_INODE_NLINK(dir)
+
+	    retval = reiserfs_new_inode(&th, dir, mode, NULL /*symlink */ ,
+					old_format_only(dir->i_sb) ?
+					EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
+					dentry, inode);
+	if (retval) {
+		dir->i_nlink--;
+		goto out_failed;
+	}
+
+	if (locked) {
+		reiserfs_write_unlock_xattrs(dir->i_sb);
+		locked = 0;
+	}
+
+	reiserfs_update_inode_transaction(inode);
+	reiserfs_update_inode_transaction(dir);
+
+	inode->i_op = &reiserfs_dir_inode_operations;
+	inode->i_fop = &reiserfs_dir_operations;
+
+	// note, _this_ add_entry will not update dir's stat data
+	retval =
+	    reiserfs_add_entry(&th, dir, dentry->d_name.name,
+			       dentry->d_name.len, inode, 1 /*visible */ );
+	if (retval) {
+		int err;
+		inode->i_nlink = 0;
+		DEC_DIR_INODE_NLINK(dir);
+		reiserfs_update_sd(&th, inode);
+		err = journal_end(&th, dir->i_sb, jbegin_count);
+		if (err)
+			retval = err;
+		iput(inode);
+		goto out_failed;
+	}
+	// the above add_entry did not update dir's stat data
+	reiserfs_update_sd(&th, dir);
+
+	d_instantiate(dentry, inode);
+	retval = journal_end(&th, dir->i_sb, jbegin_count);
+      out_failed:
+	if (locked)
+		reiserfs_write_unlock_xattrs(dir->i_sb);
+	reiserfs_write_unlock(dir->i_sb);
+	return retval;
 }
 
-static int reiserfs_rmdir (struct inode * dir, struct dentry *dentry)
+static inline int reiserfs_empty_dir(struct inode *inode)
 {
-    int retval, err;
-    struct inode * inode;
-    struct reiserfs_transaction_handle th ;
-    int jbegin_count; 
-    INITIALIZE_PATH (path);
-    struct reiserfs_dir_entry de;
-
-
-    /* we will be doing 2 balancings and update 2 stat data, we change quotas
-     * of the owner of the directory and of the owner of the parent directory.
-     * The quota structure is possibly deleted only on last iput => outside
-     * of this transaction */
-    jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
-
-    reiserfs_write_lock(dir->i_sb);
-    retval = journal_begin(&th, dir->i_sb, jbegin_count) ;
-    if (retval)
-        goto out_rmdir;
-
-    de.de_gen_number_bit_string = NULL;
-    if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) {
-	retval = -ENOENT;
-	goto end_rmdir;
-    } else if ( retval == IO_ERROR) {
-	retval = -EIO;
-	goto end_rmdir;
-    }
-
-    inode = dentry->d_inode;
-
-    reiserfs_update_inode_transaction(inode) ;
-    reiserfs_update_inode_transaction(dir) ;
-
-    if (de.de_objectid != inode->i_ino) {
-	// FIXME: compare key of an object and a key found in the
-	// entry
-	retval = -EIO;
-	goto end_rmdir;
-    }
-    if (!reiserfs_empty_dir(inode)) {
-	retval = -ENOTEMPTY;
-	goto end_rmdir;
-    }
-
-    /* cut entry from dir directory */
-    retval = reiserfs_cut_from_item (&th, &path, &(de.de_entry_key), dir, 
-                                     NULL, /* page */ 
-				     0/*new file size - not used here*/);
-    if (retval < 0)
-	goto end_rmdir;
-
-    if ( inode->i_nlink != 2 && inode->i_nlink != 1 )
-	reiserfs_warning (inode->i_sb, "%s: empty directory has nlink "
-			  "!= 2 (%d)", __FUNCTION__, inode->i_nlink);
-
-    inode->i_nlink = 0;
-    inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
-    reiserfs_update_sd (&th, inode);
-
-    DEC_DIR_INODE_NLINK(dir)
-    dir->i_size -= (DEH_SIZE + de.de_entrylen);
-    reiserfs_update_sd (&th, dir);
-
-    /* prevent empty directory from getting lost */
-    add_save_link (&th, inode, 0/* not truncate */);
-
-    retval = journal_end(&th, dir->i_sb, jbegin_count) ;
-    reiserfs_check_path(&path) ;
-out_rmdir:
-    reiserfs_write_unlock(dir->i_sb);
-    return retval;
-	
- end_rmdir:
-    /* we must release path, because we did not call
-       reiserfs_cut_from_item, or reiserfs_cut_from_item does not
-       release path if operation was not complete */
-    pathrelse (&path);
-    err = journal_end(&th, dir->i_sb, jbegin_count) ;
-    reiserfs_write_unlock(dir->i_sb);
-    return err ? err : retval;
+	/* we can cheat because an old format dir cannot have
+	 ** EMPTY_DIR_SIZE, and a new format dir cannot have
+	 ** EMPTY_DIR_SIZE_V1.  So, if the inode is either size, 
+	 ** regardless of disk format version, the directory is empty.
+	 */
+	if (inode->i_size != EMPTY_DIR_SIZE &&
+	    inode->i_size != EMPTY_DIR_SIZE_V1) {
+		return 0;
+	}
+	return 1;
 }
 
-static int reiserfs_unlink (struct inode * dir, struct dentry *dentry)
+static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
-    int retval, err;
-    struct inode * inode;
-    struct reiserfs_dir_entry de;
-    INITIALIZE_PATH (path);
-    struct reiserfs_transaction_handle th ;
-    int jbegin_count;
-    unsigned long savelink;
-
-    inode = dentry->d_inode;
-
-    /* in this transaction we can be doing at max two balancings and update
-     * two stat datas, we change quotas of the owner of the directory and of
-     * the owner of the parent directory. The quota structure is possibly
-     * deleted only on iput => outside of this transaction */
-    jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
-
-    reiserfs_write_lock(dir->i_sb);
-    retval = journal_begin(&th, dir->i_sb, jbegin_count) ;
-    if (retval)
-        goto out_unlink;
-	
-    de.de_gen_number_bit_string = NULL;
-    if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) {
-	retval = -ENOENT;
-	goto end_unlink;
-    } else if (retval == IO_ERROR) {
-	retval = -EIO;
-	goto end_unlink;
-    }
-
-    reiserfs_update_inode_transaction(inode) ;
-    reiserfs_update_inode_transaction(dir) ;
-
-    if (de.de_objectid != inode->i_ino) {
-	// FIXME: compare key of an object and a key found in the
-	// entry
-	retval = -EIO;
-	goto end_unlink;
-    }
-  
-    if (!inode->i_nlink) {
-	reiserfs_warning (inode->i_sb, "%s: deleting nonexistent file "
-			  "(%s:%lu), %d", __FUNCTION__,
-			  reiserfs_bdevname (inode->i_sb), inode->i_ino,
-			  inode->i_nlink);
-	inode->i_nlink = 1;
-    }
-
-    inode->i_nlink--;
-
-    /*
-     * we schedule before doing the add_save_link call, save the link
-     * count so we don't race
-     */
-    savelink = inode->i_nlink;
-
-
-    retval = reiserfs_cut_from_item (&th, &path, &(de.de_entry_key), dir, NULL, 0);
-    if (retval < 0) {
-	inode->i_nlink++;
-	goto end_unlink;
-    }
-    inode->i_ctime = CURRENT_TIME_SEC;
-    reiserfs_update_sd (&th, inode);
-
-    dir->i_size -= (de.de_entrylen + DEH_SIZE);
-    dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
-    reiserfs_update_sd (&th, dir);
-
-    if (!savelink)
-       /* prevent file from getting lost */
-       add_save_link (&th, inode, 0/* not truncate */);
-
-    retval = journal_end(&th, dir->i_sb, jbegin_count) ;
-    reiserfs_check_path(&path) ;
-    reiserfs_write_unlock(dir->i_sb);
-    return retval;
-
- end_unlink:
-    pathrelse (&path);
-    err = journal_end(&th, dir->i_sb, jbegin_count) ;
-    reiserfs_check_path(&path) ;
-    if (err)
-        retval = err;
-out_unlink:
-    reiserfs_write_unlock(dir->i_sb);
-    return retval;
+	int retval, err;
+	struct inode *inode;
+	struct reiserfs_transaction_handle th;
+	int jbegin_count;
+	INITIALIZE_PATH(path);
+	struct reiserfs_dir_entry de;
+
+	/* we will be doing 2 balancings and update 2 stat data, we change quotas
+	 * of the owner of the directory and of the owner of the parent directory.
+	 * The quota structure is possibly deleted only on last iput => outside
+	 * of this transaction */
+	jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 2 + 2 +
+	    4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
+
+	reiserfs_write_lock(dir->i_sb);
+	retval = journal_begin(&th, dir->i_sb, jbegin_count);
+	if (retval)
+		goto out_rmdir;
+
+	de.de_gen_number_bit_string = NULL;
+	if ((retval =
+	     reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
+				 &path, &de)) == NAME_NOT_FOUND) {
+		retval = -ENOENT;
+		goto end_rmdir;
+	} else if (retval == IO_ERROR) {
+		retval = -EIO;
+		goto end_rmdir;
+	}
+
+	inode = dentry->d_inode;
+
+	reiserfs_update_inode_transaction(inode);
+	reiserfs_update_inode_transaction(dir);
+
+	if (de.de_objectid != inode->i_ino) {
+		// FIXME: compare key of an object and a key found in the
+		// entry
+		retval = -EIO;
+		goto end_rmdir;
+	}
+	if (!reiserfs_empty_dir(inode)) {
+		retval = -ENOTEMPTY;
+		goto end_rmdir;
+	}
+
+	/* cut entry from dir directory */
+	retval = reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL,	/* page */
+					0 /*new file size - not used here */ );
+	if (retval < 0)
+		goto end_rmdir;
+
+	if (inode->i_nlink != 2 && inode->i_nlink != 1)
+		reiserfs_warning(inode->i_sb, "%s: empty directory has nlink "
+				 "!= 2 (%d)", __FUNCTION__, inode->i_nlink);
+
+	inode->i_nlink = 0;
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+	reiserfs_update_sd(&th, inode);
+
+	DEC_DIR_INODE_NLINK(dir)
+	    dir->i_size -= (DEH_SIZE + de.de_entrylen);
+	reiserfs_update_sd(&th, dir);
+
+	/* prevent empty directory from getting lost */
+	add_save_link(&th, inode, 0 /* not truncate */ );
+
+	retval = journal_end(&th, dir->i_sb, jbegin_count);
+	reiserfs_check_path(&path);
+      out_rmdir:
+	reiserfs_write_unlock(dir->i_sb);
+	return retval;
+
+      end_rmdir:
+	/* we must release path, because we did not call
+	   reiserfs_cut_from_item, or reiserfs_cut_from_item does not
+	   release path if operation was not complete */
+	pathrelse(&path);
+	err = journal_end(&th, dir->i_sb, jbegin_count);
+	reiserfs_write_unlock(dir->i_sb);
+	return err ? err : retval;
 }
 
-static int reiserfs_symlink (struct inode * parent_dir, 
-                            struct dentry * dentry, const char * symname)
+static int reiserfs_unlink(struct inode *dir, struct dentry *dentry)
 {
-    int retval;
-    struct inode * inode;
-    char * name;
-    int item_len;
-    struct reiserfs_transaction_handle th ;
-    int mode = S_IFLNK | S_IRWXUGO;
-    /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
-    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb)+REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
-
-    if (!(inode = new_inode(parent_dir->i_sb))) {
-	return -ENOMEM ;
-    }
-    new_inode_init(inode, parent_dir, mode);
-
-    reiserfs_write_lock(parent_dir->i_sb);
-    item_len = ROUND_UP (strlen (symname));
-    if (item_len > MAX_DIRECT_ITEM_LEN (parent_dir->i_sb->s_blocksize)) {
-	retval =  -ENAMETOOLONG;
-	drop_new_inode(inode);
-	goto out_failed;
-    }
-  
-    name = reiserfs_kmalloc (item_len, GFP_NOFS, parent_dir->i_sb);
-    if (!name) {
-	drop_new_inode(inode);
-	retval =  -ENOMEM;
-	goto out_failed;
-    }
-    memcpy (name, symname, strlen (symname));
-    padd_item (name, item_len, strlen (symname));
-
-    /* We would inherit the default ACL here, but symlinks don't get ACLs */
-
-    retval = journal_begin(&th, parent_dir->i_sb, jbegin_count) ;
-    if (retval) {
-        drop_new_inode (inode);
-        reiserfs_kfree (name, item_len, parent_dir->i_sb);
-        goto out_failed;
-    }
-
-    retval = reiserfs_new_inode (&th, parent_dir, mode, name, strlen (symname), 
-                                 dentry, inode);
-    reiserfs_kfree (name, item_len, parent_dir->i_sb);
-    if (retval) { /* reiserfs_new_inode iputs for us */
-	goto out_failed;
-    }
-
-    reiserfs_update_inode_transaction(inode) ;
-    reiserfs_update_inode_transaction(parent_dir) ;
-
-    inode->i_op = &reiserfs_symlink_inode_operations;
-    inode->i_mapping->a_ops = &reiserfs_address_space_operations;
-
-    // must be sure this inode is written with this transaction
-    //
-    //reiserfs_update_sd (&th, inode, READ_BLOCKS);
-
-    retval = reiserfs_add_entry (&th, parent_dir, dentry->d_name.name, 
-                                 dentry->d_name.len, inode, 1/*visible*/);
-    if (retval) {
-	int err;
+	int retval, err;
+	struct inode *inode;
+	struct reiserfs_dir_entry de;
+	INITIALIZE_PATH(path);
+	struct reiserfs_transaction_handle th;
+	int jbegin_count;
+	unsigned long savelink;
+
+	inode = dentry->d_inode;
+
+	/* in this transaction we can be doing at max two balancings and update
+	 * two stat datas, we change quotas of the owner of the directory and of
+	 * the owner of the parent directory. The quota structure is possibly
+	 * deleted only on iput => outside of this transaction */
+	jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 2 + 2 +
+	    4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
+
+	reiserfs_write_lock(dir->i_sb);
+	retval = journal_begin(&th, dir->i_sb, jbegin_count);
+	if (retval)
+		goto out_unlink;
+
+	de.de_gen_number_bit_string = NULL;
+	if ((retval =
+	     reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
+				 &path, &de)) == NAME_NOT_FOUND) {
+		retval = -ENOENT;
+		goto end_unlink;
+	} else if (retval == IO_ERROR) {
+		retval = -EIO;
+		goto end_unlink;
+	}
+
+	reiserfs_update_inode_transaction(inode);
+	reiserfs_update_inode_transaction(dir);
+
+	if (de.de_objectid != inode->i_ino) {
+		// FIXME: compare key of an object and a key found in the
+		// entry
+		retval = -EIO;
+		goto end_unlink;
+	}
+
+	if (!inode->i_nlink) {
+		reiserfs_warning(inode->i_sb, "%s: deleting nonexistent file "
+				 "(%s:%lu), %d", __FUNCTION__,
+				 reiserfs_bdevname(inode->i_sb), inode->i_ino,
+				 inode->i_nlink);
+		inode->i_nlink = 1;
+	}
+
 	inode->i_nlink--;
-	reiserfs_update_sd (&th, inode);
-	err = journal_end(&th, parent_dir->i_sb, jbegin_count) ;
+
+	/*
+	 * we schedule before doing the add_save_link call, save the link
+	 * count so we don't race
+	 */
+	savelink = inode->i_nlink;
+
+	retval =
+	    reiserfs_cut_from_item(&th, &path, &(de.de_entry_key), dir, NULL,
+				   0);
+	if (retval < 0) {
+		inode->i_nlink++;
+		goto end_unlink;
+	}
+	inode->i_ctime = CURRENT_TIME_SEC;
+	reiserfs_update_sd(&th, inode);
+
+	dir->i_size -= (de.de_entrylen + DEH_SIZE);
+	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+	reiserfs_update_sd(&th, dir);
+
+	if (!savelink)
+		/* prevent file from getting lost */
+		add_save_link(&th, inode, 0 /* not truncate */ );
+
+	retval = journal_end(&th, dir->i_sb, jbegin_count);
+	reiserfs_check_path(&path);
+	reiserfs_write_unlock(dir->i_sb);
+	return retval;
+
+      end_unlink:
+	pathrelse(&path);
+	err = journal_end(&th, dir->i_sb, jbegin_count);
+	reiserfs_check_path(&path);
 	if (err)
-	    retval = err;
-	iput (inode);
-	goto out_failed;
-    }
-
-    d_instantiate(dentry, inode);
-    retval = journal_end(&th, parent_dir->i_sb, jbegin_count) ;
-out_failed:
-    reiserfs_write_unlock(parent_dir->i_sb);
-    return retval;
+		retval = err;
+      out_unlink:
+	reiserfs_write_unlock(dir->i_sb);
+	return retval;
 }
 
-static int reiserfs_link (struct dentry * old_dentry, struct inode * dir, struct dentry * dentry)
+static int reiserfs_symlink(struct inode *parent_dir,
+			    struct dentry *dentry, const char *symname)
 {
-    int retval;
-    struct inode *inode = old_dentry->d_inode;
-    struct reiserfs_transaction_handle th ;
-    /* We need blocks for transaction + update of quotas for the owners of the directory */
-    int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
-
-    reiserfs_write_lock(dir->i_sb);
-    if (inode->i_nlink >= REISERFS_LINK_MAX) {
-	//FIXME: sd_nlink is 32 bit for new files
-	reiserfs_write_unlock(dir->i_sb);
-	return -EMLINK;
-    }
-    if (inode->i_nlink == 0) {
-        reiserfs_write_unlock(dir->i_sb);
-        return -ENOENT;
-    }
-
-    /* inc before scheduling so reiserfs_unlink knows we are here */
-    inode->i_nlink++;
-
-    retval = journal_begin(&th, dir->i_sb, jbegin_count) ;
-    if (retval) {
-        inode->i_nlink--;
-        reiserfs_write_unlock (dir->i_sb);
-        return retval;
-    }
-
-    /* create new entry */
-    retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len,
-				 inode, 1/*visible*/);
-
-    reiserfs_update_inode_transaction(inode) ;
-    reiserfs_update_inode_transaction(dir) ;
-
-    if (retval) {
-	int err;
-	inode->i_nlink--;
-	err = journal_end(&th, dir->i_sb, jbegin_count) ;
-	reiserfs_write_unlock(dir->i_sb);
-	return err ? err : retval;
-    }
+	int retval;
+	struct inode *inode;
+	char *name;
+	int item_len;
+	struct reiserfs_transaction_handle th;
+	int mode = S_IFLNK | S_IRWXUGO;
+	/* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
+	int jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 3 +
+	    2 * (REISERFS_QUOTA_INIT_BLOCKS(parent_dir->i_sb) +
+		 REISERFS_QUOTA_TRANS_BLOCKS(parent_dir->i_sb));
+
+	if (!(inode = new_inode(parent_dir->i_sb))) {
+		return -ENOMEM;
+	}
+	new_inode_init(inode, parent_dir, mode);
+
+	reiserfs_write_lock(parent_dir->i_sb);
+	item_len = ROUND_UP(strlen(symname));
+	if (item_len > MAX_DIRECT_ITEM_LEN(parent_dir->i_sb->s_blocksize)) {
+		retval = -ENAMETOOLONG;
+		drop_new_inode(inode);
+		goto out_failed;
+	}
+
+	name = reiserfs_kmalloc(item_len, GFP_NOFS, parent_dir->i_sb);
+	if (!name) {
+		drop_new_inode(inode);
+		retval = -ENOMEM;
+		goto out_failed;
+	}
+	memcpy(name, symname, strlen(symname));
+	padd_item(name, item_len, strlen(symname));
+
+	/* We would inherit the default ACL here, but symlinks don't get ACLs */
+
+	retval = journal_begin(&th, parent_dir->i_sb, jbegin_count);
+	if (retval) {
+		drop_new_inode(inode);
+		reiserfs_kfree(name, item_len, parent_dir->i_sb);
+		goto out_failed;
+	}
+
+	retval =
+	    reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname),
+			       dentry, inode);
+	reiserfs_kfree(name, item_len, parent_dir->i_sb);
+	if (retval) {		/* reiserfs_new_inode iputs for us */
+		goto out_failed;
+	}
 
-    inode->i_ctime = CURRENT_TIME_SEC;
-    reiserfs_update_sd (&th, inode);
+	reiserfs_update_inode_transaction(inode);
+	reiserfs_update_inode_transaction(parent_dir);
+
+	inode->i_op = &reiserfs_symlink_inode_operations;
+	inode->i_mapping->a_ops = &reiserfs_address_space_operations;
+
+	// must be sure this inode is written with this transaction
+	//
+	//reiserfs_update_sd (&th, inode, READ_BLOCKS);
+
+	retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name,
+				    dentry->d_name.len, inode, 1 /*visible */ );
+	if (retval) {
+		int err;
+		inode->i_nlink--;
+		reiserfs_update_sd(&th, inode);
+		err = journal_end(&th, parent_dir->i_sb, jbegin_count);
+		if (err)
+			retval = err;
+		iput(inode);
+		goto out_failed;
+	}
 
-    atomic_inc(&inode->i_count) ;
-    d_instantiate(dentry, inode);
-    retval = journal_end(&th, dir->i_sb, jbegin_count) ;
-    reiserfs_write_unlock(dir->i_sb);
-    return retval;
+	d_instantiate(dentry, inode);
+	retval = journal_end(&th, parent_dir->i_sb, jbegin_count);
+      out_failed:
+	reiserfs_write_unlock(parent_dir->i_sb);
+	return retval;
 }
 
+static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
+			 struct dentry *dentry)
+{
+	int retval;
+	struct inode *inode = old_dentry->d_inode;
+	struct reiserfs_transaction_handle th;
+	/* We need blocks for transaction + update of quotas for the owners of the directory */
+	int jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 3 +
+	    2 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb);
+
+	reiserfs_write_lock(dir->i_sb);
+	if (inode->i_nlink >= REISERFS_LINK_MAX) {
+		//FIXME: sd_nlink is 32 bit for new files
+		reiserfs_write_unlock(dir->i_sb);
+		return -EMLINK;
+	}
+	if (inode->i_nlink == 0) {
+		reiserfs_write_unlock(dir->i_sb);
+		return -ENOENT;
+	}
+
+	/* inc before scheduling so reiserfs_unlink knows we are here */
+	inode->i_nlink++;
+
+	retval = journal_begin(&th, dir->i_sb, jbegin_count);
+	if (retval) {
+		inode->i_nlink--;
+		reiserfs_write_unlock(dir->i_sb);
+		return retval;
+	}
+
+	/* create new entry */
+	retval =
+	    reiserfs_add_entry(&th, dir, dentry->d_name.name,
+			       dentry->d_name.len, inode, 1 /*visible */ );
+
+	reiserfs_update_inode_transaction(inode);
+	reiserfs_update_inode_transaction(dir);
+
+	if (retval) {
+		int err;
+		inode->i_nlink--;
+		err = journal_end(&th, dir->i_sb, jbegin_count);
+		reiserfs_write_unlock(dir->i_sb);
+		return err ? err : retval;
+	}
+
+	inode->i_ctime = CURRENT_TIME_SEC;
+	reiserfs_update_sd(&th, inode);
+
+	atomic_inc(&inode->i_count);
+	d_instantiate(dentry, inode);
+	retval = journal_end(&th, dir->i_sb, jbegin_count);
+	reiserfs_write_unlock(dir->i_sb);
+	return retval;
+}
 
 // de contains information pointing to an entry which 
-static int de_still_valid (const char * name, int len, struct reiserfs_dir_entry * de)
+static int de_still_valid(const char *name, int len,
+			  struct reiserfs_dir_entry *de)
 {
-    struct reiserfs_dir_entry tmp = *de;
-    
-    // recalculate pointer to name and name length
-    set_de_name_and_namelen (&tmp);
-    // FIXME: could check more
-    if (tmp.de_namelen != len || memcmp (name, de->de_name, len))
-	return 0;
-    return 1;
+	struct reiserfs_dir_entry tmp = *de;
+
+	// recalculate pointer to name and name length
+	set_de_name_and_namelen(&tmp);
+	// FIXME: could check more
+	if (tmp.de_namelen != len || memcmp(name, de->de_name, len))
+		return 0;
+	return 1;
 }
 
-
-static int entry_points_to_object (const char * name, int len, struct reiserfs_dir_entry * de, struct inode * inode)
+static int entry_points_to_object(const char *name, int len,
+				  struct reiserfs_dir_entry *de,
+				  struct inode *inode)
 {
-    if (!de_still_valid (name, len, de))
-	return 0;
-
-    if (inode) {
-	if (!de_visible (de->de_deh + de->de_entry_num))
-	    reiserfs_panic (NULL, "vs-7042: entry_points_to_object: entry must be visible");
-	return (de->de_objectid == inode->i_ino) ? 1 : 0;
-    }
+	if (!de_still_valid(name, len, de))
+		return 0;
+
+	if (inode) {
+		if (!de_visible(de->de_deh + de->de_entry_num))
+			reiserfs_panic(NULL,
+				       "vs-7042: entry_points_to_object: entry must be visible");
+		return (de->de_objectid == inode->i_ino) ? 1 : 0;
+	}
 
-    /* this must be added hidden entry */
-    if (de_visible (de->de_deh + de->de_entry_num))
-	reiserfs_panic (NULL, "vs-7043: entry_points_to_object: entry must be visible");
+	/* this must be the added hidden entry */
+	if (de_visible(de->de_deh + de->de_entry_num))
+		reiserfs_panic(NULL,
+			       "vs-7043: entry_points_to_object: entry must be visible");
 
-    return 1;
+	return 1;
 }
 
-
 /* sets key of objectid the entry has to point to */
-static void set_ino_in_dir_entry (struct reiserfs_dir_entry * de, struct reiserfs_key * key)
+static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de,
+				 struct reiserfs_key *key)
 {
-    /* JDM These operations are endian safe - both are le */
-    de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id;
-    de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid;
+	/* JDM These operations are endian safe - both are le */
+	de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id;
+	de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid;
 }
 
-
 /* 
  * process, that is going to call fix_nodes/do_balance must hold only
  * one path. If it holds 2 or more, it can get into endless waiting in
  * get_empty_nodes or its clones 
  */
-static int reiserfs_rename (struct inode * old_dir, struct dentry *old_dentry,
-			    struct inode * new_dir, struct dentry *new_dentry)
+static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+			   struct inode *new_dir, struct dentry *new_dentry)
 {
-    int retval;
-    INITIALIZE_PATH (old_entry_path);
-    INITIALIZE_PATH (new_entry_path);
-    INITIALIZE_PATH (dot_dot_entry_path);
-    struct item_head new_entry_ih, old_entry_ih, dot_dot_ih ;
-    struct reiserfs_dir_entry old_de, new_de, dot_dot_de;
-    struct inode * old_inode, * new_dentry_inode;
-    struct reiserfs_transaction_handle th ;
-    int jbegin_count ; 
-    umode_t old_inode_mode;
-    unsigned long savelink = 1;
-    struct timespec ctime;
-
-    /* three balancings: (1) old name removal, (2) new name insertion
-       and (3) maybe "save" link insertion
-       stat data updates: (1) old directory,
-       (2) new directory and (3) maybe old object stat data (when it is
-       directory) and (4) maybe stat data of object to which new entry
-       pointed initially and (5) maybe block containing ".." of
-       renamed directory
-       quota updates: two parent directories */
-    jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 5 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
-
-    old_inode = old_dentry->d_inode;
-    new_dentry_inode = new_dentry->d_inode;
-
-    // make sure, that oldname still exists and points to an object we
-    // are going to rename
-    old_de.de_gen_number_bit_string = NULL;
-    reiserfs_write_lock(old_dir->i_sb);
-    retval = reiserfs_find_entry (old_dir, old_dentry->d_name.name, old_dentry->d_name.len,
-				  &old_entry_path, &old_de);
-    pathrelse (&old_entry_path);
-    if (retval == IO_ERROR) {
-	reiserfs_write_unlock(old_dir->i_sb);
-	return -EIO;
-    }
-
-    if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) {
-	reiserfs_write_unlock(old_dir->i_sb);
-	return -ENOENT;
-    }
-
-    old_inode_mode = old_inode->i_mode;
-    if (S_ISDIR(old_inode_mode)) {
-	// make sure, that directory being renamed has correct ".." 
-	// and that its new parent directory has not too many links
-	// already
-
-	if (new_dentry_inode) {
-	    if (!reiserfs_empty_dir(new_dentry_inode)) {
+	int retval;
+	INITIALIZE_PATH(old_entry_path);
+	INITIALIZE_PATH(new_entry_path);
+	INITIALIZE_PATH(dot_dot_entry_path);
+	struct item_head new_entry_ih, old_entry_ih, dot_dot_ih;
+	struct reiserfs_dir_entry old_de, new_de, dot_dot_de;
+	struct inode *old_inode, *new_dentry_inode;
+	struct reiserfs_transaction_handle th;
+	int jbegin_count;
+	umode_t old_inode_mode;
+	unsigned long savelink = 1;
+	struct timespec ctime;
+
+	/* three balancings: (1) old name removal, (2) new name insertion
+	   and (3) maybe "save" link insertion
+	   stat data updates: (1) old directory,
+	   (2) new directory and (3) maybe old object stat data (when it is
+	   directory) and (4) maybe stat data of object to which new entry
+	   pointed initially and (5) maybe block containing ".." of
+	   renamed directory
+	   quota updates: two parent directories */
+	jbegin_count =
+	    JOURNAL_PER_BALANCE_CNT * 3 + 5 +
+	    4 * REISERFS_QUOTA_TRANS_BLOCKS(old_dir->i_sb);
+
+	old_inode = old_dentry->d_inode;
+	new_dentry_inode = new_dentry->d_inode;
+
+	// make sure that oldname still exists and points to the object we
+	// are going to rename
+	old_de.de_gen_number_bit_string = NULL;
+	reiserfs_write_lock(old_dir->i_sb);
+	retval =
+	    reiserfs_find_entry(old_dir, old_dentry->d_name.name,
+				old_dentry->d_name.len, &old_entry_path,
+				&old_de);
+	pathrelse(&old_entry_path);
+	if (retval == IO_ERROR) {
 		reiserfs_write_unlock(old_dir->i_sb);
-		return -ENOTEMPTY;
-	    }
+		return -EIO;
 	}
-	
-	/* directory is renamed, its parent directory will be changed, 
-	** so find ".." entry 
-	*/
-	dot_dot_de.de_gen_number_bit_string = NULL;
-	retval = reiserfs_find_entry (old_inode, "..", 2, &dot_dot_entry_path, &dot_dot_de);
-	pathrelse (&dot_dot_entry_path);
-	if (retval != NAME_FOUND) {
-	    reiserfs_write_unlock(old_dir->i_sb);
-	    return -EIO;
+
+	if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) {
+		reiserfs_write_unlock(old_dir->i_sb);
+		return -ENOENT;
 	}
 
-	/* inode number of .. must equal old_dir->i_ino */
-	if (dot_dot_de.de_objectid != old_dir->i_ino) {
-	    reiserfs_write_unlock(old_dir->i_sb);
-	    return -EIO;
+	old_inode_mode = old_inode->i_mode;
+	if (S_ISDIR(old_inode_mode)) {
+		// make sure that the directory being renamed has a correct ".."
+		// and that its new parent directory does not have too many links
+		// already
+
+		if (new_dentry_inode) {
+			if (!reiserfs_empty_dir(new_dentry_inode)) {
+				reiserfs_write_unlock(old_dir->i_sb);
+				return -ENOTEMPTY;
+			}
+		}
+
+		/* the directory is being renamed and its parent directory will
+		 ** be changed, so find the ".." entry
+		 */
+		dot_dot_de.de_gen_number_bit_string = NULL;
+		retval =
+		    reiserfs_find_entry(old_inode, "..", 2, &dot_dot_entry_path,
+					&dot_dot_de);
+		pathrelse(&dot_dot_entry_path);
+		if (retval != NAME_FOUND) {
+			reiserfs_write_unlock(old_dir->i_sb);
+			return -EIO;
+		}
+
+		/* inode number of .. must equal old_dir->i_ino */
+		if (dot_dot_de.de_objectid != old_dir->i_ino) {
+			reiserfs_write_unlock(old_dir->i_sb);
+			return -EIO;
+		}
 	}
-    }
-
-    retval = journal_begin(&th, old_dir->i_sb, jbegin_count) ;
-    if (retval) {
-        reiserfs_write_unlock (old_dir->i_sb);
-        return retval;
-    }
-
-    /* add new entry (or find the existing one) */
-    retval = reiserfs_add_entry (&th, new_dir, new_dentry->d_name.name, new_dentry->d_name.len, 
-				 old_inode, 0);
-    if (retval == -EEXIST) {
-	if (!new_dentry_inode) {
-	    reiserfs_panic (old_dir->i_sb,
-			    "vs-7050: new entry is found, new inode == 0\n");
+
+	retval = journal_begin(&th, old_dir->i_sb, jbegin_count);
+	if (retval) {
+		reiserfs_write_unlock(old_dir->i_sb);
+		return retval;
 	}
-    } else if (retval) {
-	int err = journal_end(&th, old_dir->i_sb, jbegin_count) ;
-	reiserfs_write_unlock(old_dir->i_sb);
-	return err ? err : retval;
-    }
-
-    reiserfs_update_inode_transaction(old_dir) ;
-    reiserfs_update_inode_transaction(new_dir) ;
-
-    /* this makes it so an fsync on an open fd for the old name will
-    ** commit the rename operation
-    */
-    reiserfs_update_inode_transaction(old_inode) ;
-
-    if (new_dentry_inode) 
-	reiserfs_update_inode_transaction(new_dentry_inode) ;
-
-    while (1) {
-	// look for old name using corresponding entry key (found by reiserfs_find_entry)
-	if ((retval = search_by_entry_key (new_dir->i_sb, &old_de.de_entry_key,
-					   &old_entry_path, &old_de)) != NAME_FOUND) {
-	    pathrelse(&old_entry_path);
-	    journal_end(&th, old_dir->i_sb, jbegin_count);
-	    reiserfs_write_unlock(old_dir->i_sb);
-	    return -EIO;
+
+	/* add new entry (or find the existing one) */
+	retval =
+	    reiserfs_add_entry(&th, new_dir, new_dentry->d_name.name,
+			       new_dentry->d_name.len, old_inode, 0);
+	if (retval == -EEXIST) {
+		if (!new_dentry_inode) {
+			reiserfs_panic(old_dir->i_sb,
+				       "vs-7050: new entry is found, new inode == 0\n");
+		}
+	} else if (retval) {
+		int err = journal_end(&th, old_dir->i_sb, jbegin_count);
+		reiserfs_write_unlock(old_dir->i_sb);
+		return err ? err : retval;
 	}
 
-	copy_item_head(&old_entry_ih, get_ih(&old_entry_path)) ;
-
-	reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1) ;
-
-	// look for new name by reiserfs_find_entry
-	new_de.de_gen_number_bit_string = NULL;
-	retval = reiserfs_find_entry (new_dir, new_dentry->d_name.name, new_dentry->d_name.len, 
-				      &new_entry_path, &new_de);
-	// reiserfs_add_entry should not return IO_ERROR, because it is called with essentially same parameters from
-        // reiserfs_add_entry above, and we'll catch any i/o errors before we get here.
-	if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) {
-	    pathrelse(&new_entry_path);
-	    pathrelse(&old_entry_path);
-	    journal_end(&th, old_dir->i_sb, jbegin_count);
-	    reiserfs_write_unlock(old_dir->i_sb);
-	    return -EIO;
+	reiserfs_update_inode_transaction(old_dir);
+	reiserfs_update_inode_transaction(new_dir);
+
+	/* this makes it so an fsync on an open fd for the old name will
+	 ** commit the rename operation
+	 */
+	reiserfs_update_inode_transaction(old_inode);
+
+	if (new_dentry_inode)
+		reiserfs_update_inode_transaction(new_dentry_inode);
+
+	while (1) {
+		// look for old name using corresponding entry key (found by reiserfs_find_entry)
+		if ((retval =
+		     search_by_entry_key(new_dir->i_sb, &old_de.de_entry_key,
+					 &old_entry_path,
+					 &old_de)) != NAME_FOUND) {
+			pathrelse(&old_entry_path);
+			journal_end(&th, old_dir->i_sb, jbegin_count);
+			reiserfs_write_unlock(old_dir->i_sb);
+			return -EIO;
+		}
+
+		copy_item_head(&old_entry_ih, get_ih(&old_entry_path));
+
+		reiserfs_prepare_for_journal(old_inode->i_sb, old_de.de_bh, 1);
+
+		// look for new name by reiserfs_find_entry
+		new_de.de_gen_number_bit_string = NULL;
+		retval =
+		    reiserfs_find_entry(new_dir, new_dentry->d_name.name,
+					new_dentry->d_name.len, &new_entry_path,
+					&new_de);
+		// reiserfs_find_entry should not return IO_ERROR, because it is called with essentially the same parameters as
+		// reiserfs_add_entry above, and we'll catch any i/o errors before we get here.
+		if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) {
+			pathrelse(&new_entry_path);
+			pathrelse(&old_entry_path);
+			journal_end(&th, old_dir->i_sb, jbegin_count);
+			reiserfs_write_unlock(old_dir->i_sb);
+			return -EIO;
+		}
+
+		copy_item_head(&new_entry_ih, get_ih(&new_entry_path));
+
+		reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1);
+
+		if (S_ISDIR(old_inode->i_mode)) {
+			if ((retval =
+			     search_by_entry_key(new_dir->i_sb,
+						 &dot_dot_de.de_entry_key,
+						 &dot_dot_entry_path,
+						 &dot_dot_de)) != NAME_FOUND) {
+				pathrelse(&dot_dot_entry_path);
+				pathrelse(&new_entry_path);
+				pathrelse(&old_entry_path);
+				journal_end(&th, old_dir->i_sb, jbegin_count);
+				reiserfs_write_unlock(old_dir->i_sb);
+				return -EIO;
+			}
+			copy_item_head(&dot_dot_ih,
+				       get_ih(&dot_dot_entry_path));
+			// node containing ".." gets into transaction
+			reiserfs_prepare_for_journal(old_inode->i_sb,
+						     dot_dot_de.de_bh, 1);
+		}
+		/* we should check seals here, not do
+		   this stuff, yes? Then, having
+		   gathered everything into RAM we
+		   should lock the buffers, yes?  -Hans */
+		/* probably.  our rename needs to hold more 
+		 ** than one path at once.  The seals would 
+		 ** have to be written to deal with multi-path 
+		 ** issues -chris
+		 */
+		/* sanity checking before doing the rename - avoid races many
+		 ** of the above checks could have scheduled.  We have to be
+		 ** sure our items haven't been shifted by another process.
+		 */
+		if (item_moved(&new_entry_ih, &new_entry_path) ||
+		    !entry_points_to_object(new_dentry->d_name.name,
+					    new_dentry->d_name.len,
+					    &new_de, new_dentry_inode) ||
+		    item_moved(&old_entry_ih, &old_entry_path) ||
+		    !entry_points_to_object(old_dentry->d_name.name,
+					    old_dentry->d_name.len,
+					    &old_de, old_inode)) {
+			reiserfs_restore_prepared_buffer(old_inode->i_sb,
+							 new_de.de_bh);
+			reiserfs_restore_prepared_buffer(old_inode->i_sb,
+							 old_de.de_bh);
+			if (S_ISDIR(old_inode_mode))
+				reiserfs_restore_prepared_buffer(old_inode->
+								 i_sb,
+								 dot_dot_de.
+								 de_bh);
+			continue;
+		}
+		if (S_ISDIR(old_inode_mode)) {
+			if (item_moved(&dot_dot_ih, &dot_dot_entry_path) ||
+			    !entry_points_to_object("..", 2, &dot_dot_de,
+						    old_dir)) {
+				reiserfs_restore_prepared_buffer(old_inode->
+								 i_sb,
+								 old_de.de_bh);
+				reiserfs_restore_prepared_buffer(old_inode->
+								 i_sb,
+								 new_de.de_bh);
+				reiserfs_restore_prepared_buffer(old_inode->
+								 i_sb,
+								 dot_dot_de.
+								 de_bh);
+				continue;
+			}
+		}
+
+		RFALSE(S_ISDIR(old_inode_mode) &&
+		       !buffer_journal_prepared(dot_dot_de.de_bh), "");
+
+		break;
 	}
 
-	copy_item_head(&new_entry_ih, get_ih(&new_entry_path)) ;
+	/* ok, all the changes can be done in one fell swoop when we
+	   have claimed all the buffers needed. */
 
-	reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1) ;
+	mark_de_visible(new_de.de_deh + new_de.de_entry_num);
+	set_ino_in_dir_entry(&new_de, INODE_PKEY(old_inode));
+	journal_mark_dirty(&th, old_dir->i_sb, new_de.de_bh);
 
-	if (S_ISDIR(old_inode->i_mode)) {
-	    if ((retval = search_by_entry_key (new_dir->i_sb, &dot_dot_de.de_entry_key,
-					       &dot_dot_entry_path, &dot_dot_de)) != NAME_FOUND) {
-		pathrelse(&dot_dot_entry_path);
-		pathrelse(&new_entry_path);
-		pathrelse(&old_entry_path);
-		journal_end(&th, old_dir->i_sb, jbegin_count);
-		reiserfs_write_unlock(old_dir->i_sb);
-		return -EIO;
-	    }
-	    copy_item_head(&dot_dot_ih, get_ih(&dot_dot_entry_path)) ;
-	    // node containing ".." gets into transaction
-	    reiserfs_prepare_for_journal(old_inode->i_sb, dot_dot_de.de_bh, 1) ;
-	}
-				/* we should check seals here, not do
-                                   this stuff, yes? Then, having
-                                   gathered everything into RAM we
-                                   should lock the buffers, yes?  -Hans */
-				/* probably.  our rename needs to hold more 
-				** than one path at once.  The seals would 
-				** have to be written to deal with multi-path 
-				** issues -chris
-				*/
-	/* sanity checking before doing the rename - avoid races many
-	** of the above checks could have scheduled.  We have to be
-	** sure our items haven't been shifted by another process.
-	*/
-	if (item_moved(&new_entry_ih, &new_entry_path) ||
-	    !entry_points_to_object(new_dentry->d_name.name, 
-	                            new_dentry->d_name.len,
-				    &new_de, new_dentry_inode) ||
-	    item_moved(&old_entry_ih, &old_entry_path) || 
-	    !entry_points_to_object (old_dentry->d_name.name, 
-	                             old_dentry->d_name.len,
-				     &old_de, old_inode)) {
-	    reiserfs_restore_prepared_buffer (old_inode->i_sb, new_de.de_bh);
-	    reiserfs_restore_prepared_buffer (old_inode->i_sb, old_de.de_bh);
-	    if (S_ISDIR(old_inode_mode))
-		reiserfs_restore_prepared_buffer (old_inode->i_sb, dot_dot_de.de_bh);
-	    continue;
+	mark_de_hidden(old_de.de_deh + old_de.de_entry_num);
+	journal_mark_dirty(&th, old_dir->i_sb, old_de.de_bh);
+	ctime = CURRENT_TIME_SEC;
+	old_dir->i_ctime = old_dir->i_mtime = ctime;
+	new_dir->i_ctime = new_dir->i_mtime = ctime;
+	/* thanks to Alex Adriaanse <alex_a@caltech.edu> for patch which adds ctime update of
+	   renamed object */
+	old_inode->i_ctime = ctime;
+
+	if (new_dentry_inode) {
+		// adjust link number of the victim
+		if (S_ISDIR(new_dentry_inode->i_mode)) {
+			new_dentry_inode->i_nlink = 0;
+		} else {
+			new_dentry_inode->i_nlink--;
+		}
+		new_dentry_inode->i_ctime = ctime;
+		savelink = new_dentry_inode->i_nlink;
 	}
+
 	if (S_ISDIR(old_inode_mode)) {
-	    if ( item_moved(&dot_dot_ih, &dot_dot_entry_path) ||
-		!entry_points_to_object ( "..", 2, &dot_dot_de, old_dir) ) {
-		reiserfs_restore_prepared_buffer (old_inode->i_sb, old_de.de_bh);
-		reiserfs_restore_prepared_buffer (old_inode->i_sb, new_de.de_bh);
-		reiserfs_restore_prepared_buffer (old_inode->i_sb, dot_dot_de.de_bh);
-		continue;
-	    }
+		// adjust ".." of renamed directory 
+		set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir));
+		journal_mark_dirty(&th, new_dir->i_sb, dot_dot_de.de_bh);
+
+		if (!new_dentry_inode)
+			/* there (in new_dir) was no directory, so it got new link
+			   (".."  of renamed directory) */
+			INC_DIR_INODE_NLINK(new_dir);
+
+		/* old directory lost one link - ".. " of renamed directory */
+		DEC_DIR_INODE_NLINK(old_dir);
 	}
+	// looks like in 2.3.99pre3 brelse is atomic. so we can use pathrelse
+	pathrelse(&new_entry_path);
+	pathrelse(&dot_dot_entry_path);
 
-	RFALSE( S_ISDIR(old_inode_mode) && 
-		 !buffer_journal_prepared(dot_dot_de.de_bh), "" );
-
-	break;
-    }
-
-    /* ok, all the changes can be done in one fell swoop when we
-       have claimed all the buffers needed.*/
-    
-    mark_de_visible (new_de.de_deh + new_de.de_entry_num);
-    set_ino_in_dir_entry (&new_de, INODE_PKEY (old_inode));
-    journal_mark_dirty (&th, old_dir->i_sb, new_de.de_bh);
-
-    mark_de_hidden (old_de.de_deh + old_de.de_entry_num);
-    journal_mark_dirty (&th, old_dir->i_sb, old_de.de_bh);
-    ctime = CURRENT_TIME_SEC;
-    old_dir->i_ctime = old_dir->i_mtime = ctime;
-    new_dir->i_ctime = new_dir->i_mtime = ctime;
-    /* thanks to Alex Adriaanse <alex_a@caltech.edu> for patch which adds ctime update of
-       renamed object */
-    old_inode->i_ctime = ctime;
-
-    if (new_dentry_inode) {
-	// adjust link number of the victim
-	if (S_ISDIR(new_dentry_inode->i_mode)) {
-	    new_dentry_inode->i_nlink  = 0;
-	} else {
-	    new_dentry_inode->i_nlink--;
+	// FIXME: this reiserfs_cut_from_item's return value may screw up
+	// anybody, but it will panic if will not be able to find the
+	// entry. This needs one more clean up
+	if (reiserfs_cut_from_item
+	    (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL,
+	     0) < 0)
+		reiserfs_warning(old_dir->i_sb,
+				 "vs-7060: reiserfs_rename: couldn't not cut old name. Fsck later?");
+
+	old_dir->i_size -= DEH_SIZE + old_de.de_entrylen;
+
+	reiserfs_update_sd(&th, old_dir);
+	reiserfs_update_sd(&th, new_dir);
+	reiserfs_update_sd(&th, old_inode);
+
+	if (new_dentry_inode) {
+		if (savelink == 0)
+			add_save_link(&th, new_dentry_inode,
+				      0 /* not truncate */ );
+		reiserfs_update_sd(&th, new_dentry_inode);
 	}
-	new_dentry_inode->i_ctime = ctime;
-	savelink = new_dentry_inode->i_nlink;
-    }
-
-    if (S_ISDIR(old_inode_mode)) {
-	// adjust ".." of renamed directory 
-	set_ino_in_dir_entry (&dot_dot_de, INODE_PKEY (new_dir));
-	journal_mark_dirty (&th, new_dir->i_sb, dot_dot_de.de_bh);
-	
-        if (!new_dentry_inode)
-	    /* there (in new_dir) was no directory, so it got new link
-	       (".."  of renamed directory) */
-	    INC_DIR_INODE_NLINK(new_dir);
-		
-	/* old directory lost one link - ".. " of renamed directory */
-	DEC_DIR_INODE_NLINK(old_dir);
-    }
-
-    // looks like in 2.3.99pre3 brelse is atomic. so we can use pathrelse
-    pathrelse (&new_entry_path);
-    pathrelse (&dot_dot_entry_path);
-
-    // FIXME: this reiserfs_cut_from_item's return value may screw up
-    // anybody, but it will panic if will not be able to find the
-    // entry. This needs one more clean up
-    if (reiserfs_cut_from_item (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL, 0) < 0)
-	reiserfs_warning (old_dir->i_sb, "vs-7060: reiserfs_rename: couldn't not cut old name. Fsck later?");
-
-    old_dir->i_size -= DEH_SIZE + old_de.de_entrylen;
-
-    reiserfs_update_sd (&th, old_dir);
-    reiserfs_update_sd (&th, new_dir);
-    reiserfs_update_sd (&th, old_inode);
-
-    if (new_dentry_inode) {
-	if (savelink == 0)
-	    add_save_link (&th, new_dentry_inode, 0/* not truncate */);
-	reiserfs_update_sd (&th, new_dentry_inode);
-    }
-
-    retval = journal_end(&th, old_dir->i_sb, jbegin_count) ;
-    reiserfs_write_unlock(old_dir->i_sb);
-    return retval;
+
+	retval = journal_end(&th, old_dir->i_sb, jbegin_count);
+	reiserfs_write_unlock(old_dir->i_sb);
+	return retval;
 }
 
 /*
  * directories can handle most operations...
  */
 struct inode_operations reiserfs_dir_inode_operations = {
-  //&reiserfs_dir_operations,	/* default_file_ops */
-    .create	= reiserfs_create,
-    .lookup	= reiserfs_lookup,
-    .link	= reiserfs_link,
-    .unlink	= reiserfs_unlink,
-    .symlink	= reiserfs_symlink,
-    .mkdir	= reiserfs_mkdir,
-    .rmdir	= reiserfs_rmdir,
-    .mknod	= reiserfs_mknod,
-    .rename	= reiserfs_rename,
-    .setattr    = reiserfs_setattr,
-    .setxattr   = reiserfs_setxattr,
-    .getxattr   = reiserfs_getxattr,
-    .listxattr  = reiserfs_listxattr,
-    .removexattr = reiserfs_removexattr,
-    .permission     = reiserfs_permission,
+	//&reiserfs_dir_operations,   /* default_file_ops */
+	.create = reiserfs_create,
+	.lookup = reiserfs_lookup,
+	.link = reiserfs_link,
+	.unlink = reiserfs_unlink,
+	.symlink = reiserfs_symlink,
+	.mkdir = reiserfs_mkdir,
+	.rmdir = reiserfs_rmdir,
+	.mknod = reiserfs_mknod,
+	.rename = reiserfs_rename,
+	.setattr = reiserfs_setattr,
+	.setxattr = reiserfs_setxattr,
+	.getxattr = reiserfs_getxattr,
+	.listxattr = reiserfs_listxattr,
+	.removexattr = reiserfs_removexattr,
+	.permission = reiserfs_permission,
 };
 
 /*
@@ -1467,28 +1554,27 @@ struct inode_operations reiserfs_dir_inode_operations = {
  * stuff added
  */
 struct inode_operations reiserfs_symlink_inode_operations = {
-    .readlink       = generic_readlink,
-    .follow_link    = page_follow_link_light,
-    .put_link       = page_put_link,
-    .setattr        = reiserfs_setattr,
-    .setxattr       = reiserfs_setxattr,
-    .getxattr       = reiserfs_getxattr,
-    .listxattr      = reiserfs_listxattr,
-    .removexattr    = reiserfs_removexattr,
-    .permission     = reiserfs_permission,
+	.readlink = generic_readlink,
+	.follow_link = page_follow_link_light,
+	.put_link = page_put_link,
+	.setattr = reiserfs_setattr,
+	.setxattr = reiserfs_setxattr,
+	.getxattr = reiserfs_getxattr,
+	.listxattr = reiserfs_listxattr,
+	.removexattr = reiserfs_removexattr,
+	.permission = reiserfs_permission,
 
 };
 
-
 /*
  * special file operations.. just xattr/acl stuff
  */
 struct inode_operations reiserfs_special_inode_operations = {
-    .setattr        = reiserfs_setattr,
-    .setxattr       = reiserfs_setxattr,
-    .getxattr       = reiserfs_getxattr,
-    .listxattr      = reiserfs_listxattr,
-    .removexattr    = reiserfs_removexattr,
-    .permission     = reiserfs_permission,
+	.setattr = reiserfs_setattr,
+	.setxattr = reiserfs_setxattr,
+	.getxattr = reiserfs_getxattr,
+	.listxattr = reiserfs_listxattr,
+	.removexattr = reiserfs_removexattr,
+	.permission = reiserfs_permission,
 
 };
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index bfe8e25ef293..f62590aa9c95 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -14,24 +14,24 @@
                          (__le32 *)((struct reiserfs_super_block_v1 *)(rs) + 1) :\
 			 (__le32 *)((rs) + 1))
 
-
 #ifdef CONFIG_REISERFS_CHECK
 
-static void check_objectid_map (struct super_block * s, __le32 * map)
+static void check_objectid_map(struct super_block *s, __le32 * map)
 {
-    if (le32_to_cpu (map[0]) != 1)
-	reiserfs_panic (s, "vs-15010: check_objectid_map: map corrupted: %lx",
-			( long unsigned int ) le32_to_cpu (map[0]));
+	if (le32_to_cpu(map[0]) != 1)
+		reiserfs_panic(s,
+			       "vs-15010: check_objectid_map: map corrupted: %lx",
+			       (long unsigned int)le32_to_cpu(map[0]));
 
-    // FIXME: add something else here
+	// FIXME: add something else here
 }
 
 #else
-static void check_objectid_map (struct super_block * s, __le32 * map)
-{;}
+static void check_objectid_map(struct super_block *s, __le32 * map)
+{;
+}
 #endif
 
-
 /* When we allocate objectids we allocate the first unused objectid.
    Each sequence of objectids in use (the odd sequences) is followed
    by a sequence of objectids not in use (the even sequences).  We
@@ -46,161 +46,162 @@ static void check_objectid_map (struct super_block * s, __le32 * map)
    interesting optimizations of layout could result from complicating
    objectid assignment, but we have deferred making them for now. */
 
-
 /* get unique object identifier */
-__u32 reiserfs_get_unused_objectid (struct reiserfs_transaction_handle *th)
+__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th)
 {
-    struct super_block * s = th->t_super;
-    struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s);
-    __le32 * map = objectid_map (s, rs);
-    __u32 unused_objectid;
-
-    BUG_ON (!th->t_trans_id);
+	struct super_block *s = th->t_super;
+	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
+	__le32 *map = objectid_map(s, rs);
+	__u32 unused_objectid;
+
+	BUG_ON(!th->t_trans_id);
+
+	check_objectid_map(s, map);
+
+	reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
+	/* comment needed -Hans */
+	unused_objectid = le32_to_cpu(map[1]);
+	if (unused_objectid == U32_MAX) {
+		reiserfs_warning(s, "%s: no more object ids", __FUNCTION__);
+		reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s));
+		return 0;
+	}
 
-    check_objectid_map (s, map);
+	/* This incrementation allocates the first unused objectid. That
+	   is to say, the first entry on the objectid map is the first
+	   unused objectid, and by incrementing it we use it.  See below
+	   where we check to see if we eliminated a sequence of unused
+	   objectids.... */
+	map[1] = cpu_to_le32(unused_objectid + 1);
+
+	/* Now we check to see if we eliminated the last remaining member of
+	   the first even sequence (and can eliminate the sequence by
+	   eliminating its last objectid from oids), and can collapse the
+	   first two odd sequences into one sequence.  If so, then the net
+	   result is to eliminate a pair of objectids from oids.  We do this
+	   by shifting the entire map to the left. */
+	if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) {
+		memmove(map + 1, map + 3,
+			(sb_oid_cursize(rs) - 3) * sizeof(__u32));
+		set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
+	}
 
-    reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
-                                /* comment needed -Hans */
-    unused_objectid = le32_to_cpu (map[1]);
-    if (unused_objectid == U32_MAX) {
-	reiserfs_warning (s, "%s: no more object ids", __FUNCTION__);
-	reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s)) ;
-	return 0;
-    }
-
-    /* This incrementation allocates the first unused objectid. That
-       is to say, the first entry on the objectid map is the first
-       unused objectid, and by incrementing it we use it.  See below
-       where we check to see if we eliminated a sequence of unused
-       objectids.... */
-    map[1] = cpu_to_le32 (unused_objectid + 1);
-
-    /* Now we check to see if we eliminated the last remaining member of
-       the first even sequence (and can eliminate the sequence by
-       eliminating its last objectid from oids), and can collapse the
-       first two odd sequences into one sequence.  If so, then the net
-       result is to eliminate a pair of objectids from oids.  We do this
-       by shifting the entire map to the left. */
-    if (sb_oid_cursize(rs) > 2 && map[1] == map[2]) {
-	memmove (map + 1, map + 3, (sb_oid_cursize(rs) - 3) * sizeof(__u32));
-        set_sb_oid_cursize( rs, sb_oid_cursize(rs) - 2 );
-    }
-
-    journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s));
-    return unused_objectid;
+	journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s));
+	return unused_objectid;
 }
 
-
 /* makes object identifier unused */
-void reiserfs_release_objectid (struct reiserfs_transaction_handle *th, 
-				__u32 objectid_to_release)
+void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
+			       __u32 objectid_to_release)
 {
-    struct super_block * s = th->t_super;
-    struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s);
-    __le32 * map = objectid_map (s, rs);
-    int i = 0;
-
-    BUG_ON (!th->t_trans_id);
-    //return;
-    check_objectid_map (s, map);
-
-    reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
-    journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); 
-
-    /* start at the beginning of the objectid map (i = 0) and go to
-       the end of it (i = disk_sb->s_oid_cursize).  Linear search is
-       what we use, though it is possible that binary search would be
-       more efficient after performing lots of deletions (which is
-       when oids is large.)  We only check even i's. */
-    while (i < sb_oid_cursize(rs)) {
-	if (objectid_to_release == le32_to_cpu (map[i])) {
-	    /* This incrementation unallocates the objectid. */
-	    //map[i]++;
-	    map[i] = cpu_to_le32 (le32_to_cpu (map[i]) + 1);
-
-	    /* Did we unallocate the last member of an odd sequence, and can shrink oids? */
-	    if (map[i] == map[i+1]) {
-		/* shrink objectid map */
-		memmove (map + i, map + i + 2, 
-			 (sb_oid_cursize(rs) - i - 2) * sizeof (__u32));
-		//disk_sb->s_oid_cursize -= 2;
-                set_sb_oid_cursize( rs, sb_oid_cursize(rs) - 2 );
-
-		RFALSE( sb_oid_cursize(rs) < 2 || 
-		        sb_oid_cursize(rs) > sb_oid_maxsize(rs),
-		        "vs-15005: objectid map corrupted cur_size == %d (max == %d)",
-                        sb_oid_cursize(rs), sb_oid_maxsize(rs));
-	    }
-	    return;
+	struct super_block *s = th->t_super;
+	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
+	__le32 *map = objectid_map(s, rs);
+	int i = 0;
+
+	BUG_ON(!th->t_trans_id);
+	//return;
+	check_objectid_map(s, map);
+
+	reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
+	journal_mark_dirty(th, s, SB_BUFFER_WITH_SB(s));
+
+	/* start at the beginning of the objectid map (i = 0) and go to
+	   the end of it (i = disk_sb->s_oid_cursize).  Linear search is
+	   what we use, though it is possible that binary search would be
+	   more efficient after performing lots of deletions (which is
+	   when oids is large.)  We only check even i's. */
+	while (i < sb_oid_cursize(rs)) {
+		if (objectid_to_release == le32_to_cpu(map[i])) {
+			/* This incrementation unallocates the objectid. */
+			//map[i]++;
+			map[i] = cpu_to_le32(le32_to_cpu(map[i]) + 1);
+
+			/* Did we unallocate the last member of an odd sequence, and can shrink oids? */
+			if (map[i] == map[i + 1]) {
+				/* shrink objectid map */
+				memmove(map + i, map + i + 2,
+					(sb_oid_cursize(rs) - i -
+					 2) * sizeof(__u32));
+				//disk_sb->s_oid_cursize -= 2;
+				set_sb_oid_cursize(rs, sb_oid_cursize(rs) - 2);
+
+				RFALSE(sb_oid_cursize(rs) < 2 ||
+				       sb_oid_cursize(rs) > sb_oid_maxsize(rs),
+				       "vs-15005: objectid map corrupted cur_size == %d (max == %d)",
+				       sb_oid_cursize(rs), sb_oid_maxsize(rs));
+			}
+			return;
+		}
+
+		if (objectid_to_release > le32_to_cpu(map[i]) &&
+		    objectid_to_release < le32_to_cpu(map[i + 1])) {
+			/* size of objectid map is not changed */
+			if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) {
+				//objectid_map[i+1]--;
+				map[i + 1] =
+				    cpu_to_le32(le32_to_cpu(map[i + 1]) - 1);
+				return;
+			}
+
+			/* JDM comparing two little-endian values for equality -- safe */
+			if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) {
+				/* objectid map must be expanded, but there is no space */
+				PROC_INFO_INC(s, leaked_oid);
+				return;
+			}
+
+			/* expand the objectid map */
+			memmove(map + i + 3, map + i + 1,
+				(sb_oid_cursize(rs) - i - 1) * sizeof(__u32));
+			map[i + 1] = cpu_to_le32(objectid_to_release);
+			map[i + 2] = cpu_to_le32(objectid_to_release + 1);
+			set_sb_oid_cursize(rs, sb_oid_cursize(rs) + 2);
+			return;
+		}
+		i += 2;
 	}
 
-	if (objectid_to_release > le32_to_cpu (map[i]) && 
-	    objectid_to_release < le32_to_cpu (map[i + 1])) {
-	    /* size of objectid map is not changed */
-	    if (objectid_to_release + 1 == le32_to_cpu (map[i + 1])) {
-		//objectid_map[i+1]--;
-		map[i + 1] = cpu_to_le32 (le32_to_cpu (map[i + 1]) - 1);
-		return;
-	    }
-
-            /* JDM comparing two little-endian values for equality -- safe */
-	if (sb_oid_cursize(rs) == sb_oid_maxsize(rs)) {
-		/* objectid map must be expanded, but there is no space */
-		PROC_INFO_INC( s, leaked_oid );
-		return;
-	}
+	reiserfs_warning(s,
+			 "vs-15011: reiserfs_release_objectid: tried to free free object id (%lu)",
+			 (long unsigned)objectid_to_release);
+}
 
-	    /* expand the objectid map*/
-	    memmove (map + i + 3, map + i + 1, 
-		     (sb_oid_cursize(rs) - i - 1) * sizeof(__u32));
-	    map[i + 1] = cpu_to_le32 (objectid_to_release);
-	    map[i + 2] = cpu_to_le32 (objectid_to_release + 1);
-            set_sb_oid_cursize( rs, sb_oid_cursize(rs) + 2 );
-	    return;
+int reiserfs_convert_objectid_map_v1(struct super_block *s)
+{
+	struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK(s);
+	int cur_size = sb_oid_cursize(disk_sb);
+	int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2;
+	int old_max = sb_oid_maxsize(disk_sb);
+	struct reiserfs_super_block_v1 *disk_sb_v1;
+	__le32 *objectid_map, *new_objectid_map;
+	int i;
+
+	disk_sb_v1 =
+	    (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data);
+	objectid_map = (__le32 *) (disk_sb_v1 + 1);
+	new_objectid_map = (__le32 *) (disk_sb + 1);
+
+	if (cur_size > new_size) {
+		/* mark everyone used that was listed as free at the end of the objectid
+		 ** map 
+		 */
+		objectid_map[new_size - 1] = objectid_map[cur_size - 1];
+		set_sb_oid_cursize(disk_sb, new_size);
+	}
+	/* move the smaller objectid map past the end of the new super */
+	for (i = new_size - 1; i >= 0; i--) {
+		objectid_map[i + (old_max - new_size)] = objectid_map[i];
 	}
-	i += 2;
-    }
 
-    reiserfs_warning (s, "vs-15011: reiserfs_release_objectid: tried to free free object id (%lu)",
-		      ( long unsigned ) objectid_to_release);
-}
+	/* set the max size so we don't overflow later */
+	set_sb_oid_maxsize(disk_sb, new_size);
 
+	/* Zero out label and generate random UUID */
+	memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label));
+	generate_random_uuid(disk_sb->s_uuid);
 
-int reiserfs_convert_objectid_map_v1(struct super_block *s) {
-    struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK (s);
-    int cur_size = sb_oid_cursize(disk_sb);
-    int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2 ;
-    int old_max = sb_oid_maxsize(disk_sb);
-    struct reiserfs_super_block_v1 *disk_sb_v1 ;
-    __le32 *objectid_map, *new_objectid_map ;
-    int i ;
-
-    disk_sb_v1=(struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data);
-    objectid_map = (__le32 *)(disk_sb_v1 + 1) ;
-    new_objectid_map = (__le32 *)(disk_sb + 1) ;
-
-    if (cur_size > new_size) {
-	/* mark everyone used that was listed as free at the end of the objectid
-	** map 
-	*/
-	objectid_map[new_size - 1] = objectid_map[cur_size - 1] ;
-	set_sb_oid_cursize(disk_sb,new_size) ;
-    }
-    /* move the smaller objectid map past the end of the new super */
-    for (i = new_size - 1 ; i >= 0 ; i--) {
-        objectid_map[i + (old_max - new_size)] = objectid_map[i] ; 
-    }
-
-
-    /* set the max size so we don't overflow later */
-    set_sb_oid_maxsize(disk_sb,new_size) ;
-
-    /* Zero out label and generate random UUID */
-    memset(disk_sb->s_label, 0, sizeof(disk_sb->s_label)) ;
-    generate_random_uuid(disk_sb->s_uuid);
-
-    /* finally, zero out the unused chunk of the new super */
-    memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused)) ;
-    return 0 ;
+	/* finally, zero out the unused chunk of the new super */
+	memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused));
+	return 0;
 }
-
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 16fdca1d4bd7..d55e164bd5c2 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -15,168 +15,166 @@ static char error_buf[1024];
 static char fmt_buf[1024];
 static char off_buf[80];
 
-
-static char * reiserfs_cpu_offset (struct cpu_key * key)
+static char *reiserfs_cpu_offset(struct cpu_key *key)
 {
-  if (cpu_key_k_type(key) == TYPE_DIRENTRY)
-    sprintf (off_buf, "%Lu(%Lu)", 
-	     (unsigned long long)GET_HASH_VALUE (cpu_key_k_offset (key)),
-	     (unsigned long long)GET_GENERATION_NUMBER (cpu_key_k_offset (key)));
-  else
-    sprintf (off_buf, "0x%Lx", (unsigned long long)cpu_key_k_offset (key));
-  return off_buf;
+	if (cpu_key_k_type(key) == TYPE_DIRENTRY)
+		sprintf(off_buf, "%Lu(%Lu)",
+			(unsigned long long)
+			GET_HASH_VALUE(cpu_key_k_offset(key)),
+			(unsigned long long)
+			GET_GENERATION_NUMBER(cpu_key_k_offset(key)));
+	else
+		sprintf(off_buf, "0x%Lx",
+			(unsigned long long)cpu_key_k_offset(key));
+	return off_buf;
 }
 
-
-static char * le_offset (struct reiserfs_key * key)
+static char *le_offset(struct reiserfs_key *key)
 {
-  int version;
+	int version;
 
-  version = le_key_version (key);
-  if (le_key_k_type (version, key) == TYPE_DIRENTRY)
-    sprintf (off_buf, "%Lu(%Lu)", 
-	     (unsigned long long)GET_HASH_VALUE (le_key_k_offset (version, key)),
-	     (unsigned long long)GET_GENERATION_NUMBER (le_key_k_offset (version, key)));
-  else
-    sprintf (off_buf, "0x%Lx", (unsigned long long)le_key_k_offset (version, key));
-  return off_buf;
+	version = le_key_version(key);
+	if (le_key_k_type(version, key) == TYPE_DIRENTRY)
+		sprintf(off_buf, "%Lu(%Lu)",
+			(unsigned long long)
+			GET_HASH_VALUE(le_key_k_offset(version, key)),
+			(unsigned long long)
+			GET_GENERATION_NUMBER(le_key_k_offset(version, key)));
+	else
+		sprintf(off_buf, "0x%Lx",
+			(unsigned long long)le_key_k_offset(version, key));
+	return off_buf;
 }
 
-
-static char * cpu_type (struct cpu_key * key)
+static char *cpu_type(struct cpu_key *key)
 {
-    if (cpu_key_k_type (key) == TYPE_STAT_DATA)
-	return "SD";
-    if (cpu_key_k_type (key) == TYPE_DIRENTRY)
-	return "DIR";
-    if (cpu_key_k_type (key) == TYPE_DIRECT)
-	return "DIRECT";
-    if (cpu_key_k_type (key) == TYPE_INDIRECT)
-	return "IND";
-    return "UNKNOWN";
+	if (cpu_key_k_type(key) == TYPE_STAT_DATA)
+		return "SD";
+	if (cpu_key_k_type(key) == TYPE_DIRENTRY)
+		return "DIR";
+	if (cpu_key_k_type(key) == TYPE_DIRECT)
+		return "DIRECT";
+	if (cpu_key_k_type(key) == TYPE_INDIRECT)
+		return "IND";
+	return "UNKNOWN";
 }
 
-
-static char * le_type (struct reiserfs_key * key)
+static char *le_type(struct reiserfs_key *key)
 {
-    int version;
-    
-    version = le_key_version (key);
+	int version;
 
-    if (le_key_k_type (version, key) == TYPE_STAT_DATA)
-	return "SD";
-    if (le_key_k_type (version, key) == TYPE_DIRENTRY)
-	return "DIR";
-    if (le_key_k_type (version, key) == TYPE_DIRECT)
-	return "DIRECT";
-    if (le_key_k_type (version, key) == TYPE_INDIRECT)
-	return "IND";
-    return "UNKNOWN";
-}
+	version = le_key_version(key);
 
+	if (le_key_k_type(version, key) == TYPE_STAT_DATA)
+		return "SD";
+	if (le_key_k_type(version, key) == TYPE_DIRENTRY)
+		return "DIR";
+	if (le_key_k_type(version, key) == TYPE_DIRECT)
+		return "DIRECT";
+	if (le_key_k_type(version, key) == TYPE_INDIRECT)
+		return "IND";
+	return "UNKNOWN";
+}
 
 /* %k */
-static void sprintf_le_key (char * buf, struct reiserfs_key * key)
+static void sprintf_le_key(char *buf, struct reiserfs_key *key)
 {
-  if (key)
-    sprintf (buf, "[%d %d %s %s]", le32_to_cpu (key->k_dir_id),
-	     le32_to_cpu (key->k_objectid), le_offset (key), le_type (key));
-  else
-    sprintf (buf, "[NULL]");
+	if (key)
+		sprintf(buf, "[%d %d %s %s]", le32_to_cpu(key->k_dir_id),
+			le32_to_cpu(key->k_objectid), le_offset(key),
+			le_type(key));
+	else
+		sprintf(buf, "[NULL]");
 }
 
-
 /* %K */
-static void sprintf_cpu_key (char * buf, struct cpu_key * key)
+static void sprintf_cpu_key(char *buf, struct cpu_key *key)
 {
-  if (key)
-    sprintf (buf, "[%d %d %s %s]", key->on_disk_key.k_dir_id,
-	     key->on_disk_key.k_objectid, reiserfs_cpu_offset (key),
-             cpu_type (key));
-  else
-    sprintf (buf, "[NULL]");
+	if (key)
+		sprintf(buf, "[%d %d %s %s]", key->on_disk_key.k_dir_id,
+			key->on_disk_key.k_objectid, reiserfs_cpu_offset(key),
+			cpu_type(key));
+	else
+		sprintf(buf, "[NULL]");
 }
 
-static void sprintf_de_head( char *buf, struct reiserfs_de_head *deh )
+static void sprintf_de_head(char *buf, struct reiserfs_de_head *deh)
 {
-    if( deh )
-        sprintf( buf, "[offset=%d dir_id=%d objectid=%d location=%d state=%04x]", deh_offset(deh), deh_dir_id(deh),
-                 deh_objectid(deh), deh_location(deh), deh_state(deh) );
-    else
-        sprintf( buf, "[NULL]" );
+	if (deh)
+		sprintf(buf,
+			"[offset=%d dir_id=%d objectid=%d location=%d state=%04x]",
+			deh_offset(deh), deh_dir_id(deh), deh_objectid(deh),
+			deh_location(deh), deh_state(deh));
+	else
+		sprintf(buf, "[NULL]");
 
 }
 
-static void sprintf_item_head (char * buf, struct item_head * ih)
+static void sprintf_item_head(char *buf, struct item_head *ih)
 {
-    if (ih) {
-	strcpy (buf, (ih_version (ih) == KEY_FORMAT_3_6) ? "*3.6* " : "*3.5*");
-	sprintf_le_key (buf + strlen (buf), &(ih->ih_key));
-	sprintf (buf + strlen (buf), ", item_len %d, item_location %d, "
-		 "free_space(entry_count) %d",
-		 ih_item_len(ih), ih_location(ih), ih_free_space (ih));
-    } else
-	sprintf (buf, "[NULL]");
+	if (ih) {
+		strcpy(buf,
+		       (ih_version(ih) == KEY_FORMAT_3_6) ? "*3.6* " : "*3.5*");
+		sprintf_le_key(buf + strlen(buf), &(ih->ih_key));
+		sprintf(buf + strlen(buf), ", item_len %d, item_location %d, "
+			"free_space(entry_count) %d",
+			ih_item_len(ih), ih_location(ih), ih_free_space(ih));
+	} else
+		sprintf(buf, "[NULL]");
 }
 
-
-static void sprintf_direntry (char * buf, struct reiserfs_dir_entry * de)
+static void sprintf_direntry(char *buf, struct reiserfs_dir_entry *de)
 {
-  char name[20];
+	char name[20];
 
-  memcpy (name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen);
-  name [de->de_namelen > 19 ? 19 : de->de_namelen] = 0;
-  sprintf (buf, "\"%s\"==>[%d %d]", name, de->de_dir_id, de->de_objectid);
+	memcpy(name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen);
+	name[de->de_namelen > 19 ? 19 : de->de_namelen] = 0;
+	sprintf(buf, "\"%s\"==>[%d %d]", name, de->de_dir_id, de->de_objectid);
 }
 
-
-static void sprintf_block_head (char * buf, struct buffer_head * bh)
+static void sprintf_block_head(char *buf, struct buffer_head *bh)
 {
-  sprintf (buf, "level=%d, nr_items=%d, free_space=%d rdkey ",
-	   B_LEVEL (bh), B_NR_ITEMS (bh), B_FREE_SPACE (bh));
+	sprintf(buf, "level=%d, nr_items=%d, free_space=%d rdkey ",
+		B_LEVEL(bh), B_NR_ITEMS(bh), B_FREE_SPACE(bh));
 }
 
-
-static void sprintf_buffer_head (char * buf, struct buffer_head * bh) 
+static void sprintf_buffer_head(char *buf, struct buffer_head *bh)
 {
-  char b[BDEVNAME_SIZE];
+	char b[BDEVNAME_SIZE];
 
-  sprintf (buf, "dev %s, size %d, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
-	   bdevname (bh->b_bdev, b), bh->b_size,
-	   (unsigned long long)bh->b_blocknr,
-	   atomic_read (&(bh->b_count)),
-	   bh->b_state, bh->b_page,
-	   buffer_uptodate (bh) ? "UPTODATE" : "!UPTODATE",
-	   buffer_dirty (bh) ? "DIRTY" : "CLEAN",
-	   buffer_locked (bh) ? "LOCKED" : "UNLOCKED");
+	sprintf(buf,
+		"dev %s, size %d, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
+		bdevname(bh->b_bdev, b), bh->b_size,
+		(unsigned long long)bh->b_blocknr, atomic_read(&(bh->b_count)),
+		bh->b_state, bh->b_page,
+		buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE",
+		buffer_dirty(bh) ? "DIRTY" : "CLEAN",
+		buffer_locked(bh) ? "LOCKED" : "UNLOCKED");
 }
 
-
-static void sprintf_disk_child (char * buf, struct disk_child * dc)
+static void sprintf_disk_child(char *buf, struct disk_child *dc)
 {
-  sprintf (buf, "[dc_number=%d, dc_size=%u]", dc_block_number(dc), dc_size(dc));
+	sprintf(buf, "[dc_number=%d, dc_size=%u]", dc_block_number(dc),
+		dc_size(dc));
 }
 
-
-static char * is_there_reiserfs_struct (char * fmt, int * what, int * skip)
+static char *is_there_reiserfs_struct(char *fmt, int *what, int *skip)
 {
-  char * k = fmt;
+	char *k = fmt;
 
-  *skip = 0;
-  
-  while ((k = strchr (k, '%')) != NULL)
-  {
-    if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' ||
-	      k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a' ) {
-      *what = k[1];
-      break;
-    }
-    (*skip) ++;
-    k ++;
-  }
-  return k;
-}
+	*skip = 0;
 
+	while ((k = strchr(k, '%')) != NULL) {
+		if (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' ||
+		    k[1] == 'z' || k[1] == 'b' || k[1] == 'y' || k[1] == 'a') {
+			*what = k[1];
+			break;
+		}
+		(*skip)++;
+		k++;
+	}
+	return k;
+}
 
 /* debugging reiserfs we used to print out a lot of different
    variables, like keys, item headers, buffer heads etc. Values of
@@ -191,61 +189,64 @@ static char * is_there_reiserfs_struct (char * fmt, int * what, int * skip)
            key->k_offset, key->k_uniqueness); 
 */
 
-
-static void
-prepare_error_buf( const char *fmt, va_list args )
-{
-    char * fmt1 = fmt_buf;
-    char * k;
-    char * p = error_buf;
-    int i, j, what, skip;
-
-    strcpy (fmt1, fmt);
-
-    while( (k = is_there_reiserfs_struct( fmt1, &what, &skip )) != NULL )
-    {
-        *k = 0;
-
-        p += vsprintf (p, fmt1, args);
-
-        for (i = 0; i < skip; i ++)
-            j = va_arg (args, int);
-
-        switch (what) {
-        case 'k':
-            sprintf_le_key (p, va_arg(args, struct reiserfs_key *));
-            break;
-        case 'K':
-            sprintf_cpu_key (p, va_arg(args, struct cpu_key *));
-            break;
-        case 'h':
-            sprintf_item_head (p, va_arg(args, struct item_head *));
-            break;
-        case 't':
-            sprintf_direntry (p, va_arg(args, struct reiserfs_dir_entry *));
-            break;
-        case 'y':
-            sprintf_disk_child (p, va_arg(args, struct disk_child *));
-            break;
-        case 'z':
-            sprintf_block_head (p, va_arg(args, struct buffer_head *));
-            break;
-        case 'b':
-            sprintf_buffer_head (p, va_arg(args, struct buffer_head *));
-            break;
-        case 'a':
-            sprintf_de_head (p, va_arg(args, struct reiserfs_de_head *));
-            break;
-        }
-
-        p += strlen (p);
-        fmt1 = k + 2;
-    }
-    vsprintf (p, fmt1, args);
+static void prepare_error_buf(const char *fmt, va_list args)
+{
+	char *fmt1 = fmt_buf;
+	char *k;
+	char *p = error_buf;
+	int i, j, what, skip;
+
+	strcpy(fmt1, fmt);
+
+	while ((k = is_there_reiserfs_struct(fmt1, &what, &skip)) != NULL) {
+		*k = 0;
+
+		p += vsprintf(p, fmt1, args);
+
+		for (i = 0; i < skip; i++)
+			j = va_arg(args, int);
+
+		switch (what) {
+		case 'k':
+			sprintf_le_key(p, va_arg(args, struct reiserfs_key *));
+			break;
+		case 'K':
+			sprintf_cpu_key(p, va_arg(args, struct cpu_key *));
+			break;
+		case 'h':
+			sprintf_item_head(p, va_arg(args, struct item_head *));
+			break;
+		case 't':
+			sprintf_direntry(p,
+					 va_arg(args,
+						struct reiserfs_dir_entry *));
+			break;
+		case 'y':
+			sprintf_disk_child(p,
+					   va_arg(args, struct disk_child *));
+			break;
+		case 'z':
+			sprintf_block_head(p,
+					   va_arg(args, struct buffer_head *));
+			break;
+		case 'b':
+			sprintf_buffer_head(p,
+					    va_arg(args, struct buffer_head *));
+			break;
+		case 'a':
+			sprintf_de_head(p,
+					va_arg(args,
+					       struct reiserfs_de_head *));
+			break;
+		}
+
+		p += strlen(p);
+		fmt1 = k + 2;
+	}
+	vsprintf(p, fmt1, args);
 
 }
 
-
 /* in addition to usual conversion specifiers this accepts reiserfs
    specific conversion specifiers: 
    %k to print little endian key, 
@@ -264,43 +265,43 @@ prepare_error_buf( const char *fmt, va_list args )
     va_end( args );\
 }
 
-void reiserfs_warning (struct super_block *sb, const char * fmt, ...)
+void reiserfs_warning(struct super_block *sb, const char *fmt, ...)
 {
-  do_reiserfs_warning(fmt);
-  if (sb)
-      printk (KERN_WARNING "ReiserFS: %s: warning: %s\n",
-             reiserfs_bdevname (sb), error_buf);
-  else
-      printk (KERN_WARNING "ReiserFS: warning: %s\n", error_buf);
+	do_reiserfs_warning(fmt);
+	if (sb)
+		printk(KERN_WARNING "ReiserFS: %s: warning: %s\n",
+		       reiserfs_bdevname(sb), error_buf);
+	else
+		printk(KERN_WARNING "ReiserFS: warning: %s\n", error_buf);
 }
 
 /* No newline.. reiserfs_info calls can be followed by printk's */
-void reiserfs_info (struct super_block *sb, const char * fmt, ...)
+void reiserfs_info(struct super_block *sb, const char *fmt, ...)
 {
-  do_reiserfs_warning(fmt);
-  if (sb)
-      printk (KERN_NOTICE "ReiserFS: %s: %s",
-             reiserfs_bdevname (sb), error_buf);
-  else
-      printk (KERN_NOTICE "ReiserFS: %s", error_buf);
+	do_reiserfs_warning(fmt);
+	if (sb)
+		printk(KERN_NOTICE "ReiserFS: %s: %s",
+		       reiserfs_bdevname(sb), error_buf);
+	else
+		printk(KERN_NOTICE "ReiserFS: %s", error_buf);
 }
 
 /* No newline.. reiserfs_printk calls can be followed by printk's */
-static void reiserfs_printk (const char * fmt, ...)
+static void reiserfs_printk(const char *fmt, ...)
 {
-  do_reiserfs_warning(fmt);
-  printk (error_buf);
+	do_reiserfs_warning(fmt);
+	printk("%s", error_buf);	/* error_buf is expanded text, never a format string */
 }
 
-void reiserfs_debug (struct super_block *s, int level, const char * fmt, ...)
+void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
 {
 #ifdef CONFIG_REISERFS_CHECK
-  do_reiserfs_warning(fmt);
-  if (s)
-      printk (KERN_DEBUG "ReiserFS: %s: %s\n",
-             reiserfs_bdevname (s), error_buf);
-  else
-      printk (KERN_DEBUG "ReiserFS: %s\n", error_buf);
+	do_reiserfs_warning(fmt);
+	if (s)
+		printk(KERN_DEBUG "ReiserFS: %s: %s\n",
+		       reiserfs_bdevname(s), error_buf);
+	else
+		printk(KERN_DEBUG "ReiserFS: %s\n", error_buf);
 #endif
 }
 
@@ -349,379 +350,403 @@ void reiserfs_debug (struct super_block *s, int level, const char * fmt, ...)
 
    .  */
 
-
 #ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance * cur_tb;
+extern struct tree_balance *cur_tb;
 #endif
 
-void reiserfs_panic (struct super_block * sb, const char * fmt, ...)
+void reiserfs_panic(struct super_block *sb, const char *fmt, ...)
 {
-  do_reiserfs_warning(fmt);
-  printk (KERN_EMERG "REISERFS: panic (device %s): %s\n",
-          reiserfs_bdevname (sb), error_buf);
-  BUG ();
+	do_reiserfs_warning(fmt);
+	printk(KERN_EMERG "REISERFS: panic (device %s): %s\n",
+	       reiserfs_bdevname(sb), error_buf);
+	BUG();
 
-  /* this is not actually called, but makes reiserfs_panic() "noreturn" */
-  panic ("REISERFS: panic (device %s): %s\n",
-	 reiserfs_bdevname (sb), error_buf);
+	/* this is not actually called, but makes reiserfs_panic() "noreturn" */
+	panic("REISERFS: panic (device %s): %s\n",
+	      reiserfs_bdevname(sb), error_buf);
 }
 
-void
-reiserfs_abort (struct super_block *sb, int errno, const char *fmt, ...)
+void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...)
 {
-    do_reiserfs_warning (fmt);
+	do_reiserfs_warning(fmt);
 
-    if (reiserfs_error_panic (sb)) {
-        panic (KERN_CRIT "REISERFS: panic (device %s): %s\n",
-               reiserfs_bdevname (sb), error_buf);
-    }
+	if (reiserfs_error_panic(sb)) {
+		panic(KERN_CRIT "REISERFS: panic (device %s): %s\n",
+		      reiserfs_bdevname(sb), error_buf);
+	}
 
-    if (sb->s_flags & MS_RDONLY)
-        return;
+	if (sb->s_flags & MS_RDONLY)
+		return;
 
-    printk (KERN_CRIT "REISERFS: abort (device %s): %s\n",
-            reiserfs_bdevname (sb), error_buf);
+	printk(KERN_CRIT "REISERFS: abort (device %s): %s\n",
+	       reiserfs_bdevname(sb), error_buf);
 
-    sb->s_flags |= MS_RDONLY;
-    reiserfs_journal_abort (sb, errno);
+	sb->s_flags |= MS_RDONLY;
+	reiserfs_journal_abort(sb, errno);
 }
 
 /* this prints internal nodes (4 keys/items in line) (dc_number,
    dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number,
    dc_size)...*/
-static int print_internal (struct buffer_head * bh, int first, int last)
+static int print_internal(struct buffer_head *bh, int first, int last)
 {
-    struct reiserfs_key * key;
-    struct disk_child * dc;
-    int i;
-    int from, to;
-    
-    if (!B_IS_KEYS_LEVEL (bh))
-	return 1;
-
-    check_internal (bh);
-    
-    if (first == -1) {
-	from = 0;
-	to = B_NR_ITEMS (bh);
-    } else {
-	from = first;
-	to = last < B_NR_ITEMS (bh) ? last : B_NR_ITEMS (bh);
-    }
-
-    reiserfs_printk ("INTERNAL NODE (%ld) contains %z\n",  bh->b_blocknr, bh);
-    
-    dc = B_N_CHILD (bh, from);
-    reiserfs_printk ("PTR %d: %y ", from, dc);
-    
-    for (i = from, key = B_N_PDELIM_KEY (bh, from), dc ++; i < to; i ++, key ++, dc ++) {
-	reiserfs_printk ("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc);
-	if (i && i % 4 == 0)
-	    printk ("\n");
-    }
-    printk ("\n");
-    return 0;
-}
+	struct reiserfs_key *key;
+	struct disk_child *dc;
+	int i;
+	int from, to;
 
+	if (!B_IS_KEYS_LEVEL(bh))
+		return 1;
 
+	check_internal(bh);
 
+	if (first == -1) {
+		from = 0;
+		to = B_NR_ITEMS(bh);
+	} else {
+		from = first;
+		to = last < B_NR_ITEMS(bh) ? last : B_NR_ITEMS(bh);
+	}
 
+	reiserfs_printk("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh);
 
-static int print_leaf (struct buffer_head * bh, int print_mode, int first, int last)
-{
-    struct block_head * blkh;
-    struct item_head * ih;
-    int i, nr;
-    int from, to;
+	dc = B_N_CHILD(bh, from);
+	reiserfs_printk("PTR %d: %y ", from, dc);
 
-    if (!B_IS_ITEMS_LEVEL (bh))
-	return 1;
+	for (i = from, key = B_N_PDELIM_KEY(bh, from), dc++; i < to;
+	     i++, key++, dc++) {
+		reiserfs_printk("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc);
+		if (i && i % 4 == 0)
+			printk("\n");
+	}
+	printk("\n");
+	return 0;
+}
 
-    check_leaf (bh);
+static int print_leaf(struct buffer_head *bh, int print_mode, int first,
+		      int last)
+{
+	struct block_head *blkh;
+	struct item_head *ih;
+	int i, nr;
+	int from, to;
 
-    blkh = B_BLK_HEAD (bh);
-    ih = B_N_PITEM_HEAD (bh,0);
-    nr = blkh_nr_item(blkh);
+	if (!B_IS_ITEMS_LEVEL(bh))
+		return 1;
 
-    printk ("\n===================================================================\n");
-    reiserfs_printk ("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh);
+	check_leaf(bh);
 
-    if (!(print_mode & PRINT_LEAF_ITEMS)) {
-	reiserfs_printk ("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n",
-			  &(ih->ih_key), &((ih + nr - 1)->ih_key));
-	return 0;
-    }
+	blkh = B_BLK_HEAD(bh);
+	ih = B_N_PITEM_HEAD(bh, 0);
+	nr = blkh_nr_item(blkh);
 
-    if (first < 0 || first > nr - 1) 
-	from = 0;
-    else 
-	from = first;
+	printk
+	    ("\n===================================================================\n");
+	reiserfs_printk("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh);
 
-    if (last < 0 || last > nr )
-	to = nr;
-    else
-	to = last;
+	if (!(print_mode & PRINT_LEAF_ITEMS)) {
+		reiserfs_printk("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n",
+				&(ih->ih_key), &((ih + nr - 1)->ih_key));
+		return 0;
+	}
 
-    ih += from;
-    printk ("-------------------------------------------------------------------------------\n");
-    printk ("|##|   type    |           key           | ilen | free_space | version | loc  |\n");
-    for (i = from; i < to; i++, ih ++) {
-	printk ("-------------------------------------------------------------------------------\n");
-	reiserfs_printk ("|%2d| %h |\n", i, ih);
-	if (print_mode & PRINT_LEAF_ITEMS)
-	    op_print_item (ih, B_I_PITEM (bh, ih));
-    }
+	if (first < 0 || first > nr - 1)
+		from = 0;
+	else
+		from = first;
+
+	if (last < 0 || last > nr)
+		to = nr;
+	else
+		to = last;
+
+	ih += from;
+	printk
+	    ("-------------------------------------------------------------------------------\n");
+	printk
+	    ("|##|   type    |           key           | ilen | free_space | version | loc  |\n");
+	for (i = from; i < to; i++, ih++) {
+		printk
+		    ("-------------------------------------------------------------------------------\n");
+		reiserfs_printk("|%2d| %h |\n", i, ih);
+		if (print_mode & PRINT_LEAF_ITEMS)
+			op_print_item(ih, B_I_PITEM(bh, ih));
+	}
 
-    printk ("===================================================================\n");
+	printk
+	    ("===================================================================\n");
 
-    return 0;
+	return 0;
 }
 
-char * reiserfs_hashname(int code)
+char *reiserfs_hashname(int code)
 {
-    if ( code == YURA_HASH)
-	return "rupasov";
-    if ( code == TEA_HASH)
-	return "tea";
-    if ( code == R5_HASH)
-	return "r5";
+	if (code == YURA_HASH)
+		return "rupasov";
+	if (code == TEA_HASH)
+		return "tea";
+	if (code == R5_HASH)
+		return "r5";
 
-    return "unknown";
+	return "unknown";
 }
 
 /* return 1 if this is not super block */
-static int print_super_block (struct buffer_head * bh)
-{
-    struct reiserfs_super_block * rs = (struct reiserfs_super_block *)(bh->b_data);
-    int skipped, data_blocks;
-    char *version;
-    char b[BDEVNAME_SIZE];
-
-    if (is_reiserfs_3_5(rs)) {
-        version = "3.5";
-    } else if (is_reiserfs_3_6(rs)) {
-        version = "3.6";
-    } else if (is_reiserfs_jr(rs)) {
-      version = ((sb_version(rs) == REISERFS_VERSION_2) ?
- 		 "3.6" : "3.5");  
-    } else {
-	return 1;
-    }
-
-    printk ("%s\'s super block is in block %llu\n", bdevname (bh->b_bdev, b),
-            (unsigned long long)bh->b_blocknr);
-    printk ("Reiserfs version %s\n", version );
-    printk ("Block count %u\n", sb_block_count(rs));
-    printk ("Blocksize %d\n", sb_blocksize(rs));
-    printk ("Free blocks %u\n", sb_free_blocks(rs));
-    // FIXME: this would be confusing if
-    // someone stores reiserfs super block in some data block ;)
+static int print_super_block(struct buffer_head *bh)
+{
+	struct reiserfs_super_block *rs =
+	    (struct reiserfs_super_block *)(bh->b_data);
+	int skipped, data_blocks;
+	char *version;
+	char b[BDEVNAME_SIZE];
+
+	if (is_reiserfs_3_5(rs)) {
+		version = "3.5";
+	} else if (is_reiserfs_3_6(rs)) {
+		version = "3.6";
+	} else if (is_reiserfs_jr(rs)) {
+		version = ((sb_version(rs) == REISERFS_VERSION_2) ?
+			   "3.6" : "3.5");
+	} else {
+		return 1;
+	}
+
+	printk("%s\'s super block is in block %llu\n", bdevname(bh->b_bdev, b),
+	       (unsigned long long)bh->b_blocknr);
+	printk("Reiserfs version %s\n", version);
+	printk("Block count %u\n", sb_block_count(rs));
+	printk("Blocksize %d\n", sb_blocksize(rs));
+	printk("Free blocks %u\n", sb_free_blocks(rs));
+	// FIXME: this would be confusing if
+	// someone stores reiserfs super block in some data block ;)
 //    skipped = (bh->b_blocknr * bh->b_size) / sb_blocksize(rs);
-    skipped = bh->b_blocknr;
-    data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) -
-	    (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) + 1 : sb_reserved_for_journal(rs)) -	    
-	    sb_free_blocks(rs);
-    printk ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n"
-	    "1 super block, %d data blocks\n", 
-	    skipped, sb_bmap_nr(rs), (!is_reiserfs_jr(rs) ? (sb_jp_journal_size(rs) + 1) :
-				      sb_reserved_for_journal(rs)) , data_blocks);
-    printk ("Root block %u\n", sb_root_block(rs));
-    printk ("Journal block (first) %d\n", sb_jp_journal_1st_block(rs));
-    printk ("Journal dev %d\n", sb_jp_journal_dev(rs));
-    printk ("Journal orig size %d\n", sb_jp_journal_size(rs));
-    printk ("FS state %d\n", sb_fs_state(rs));
-    printk ("Hash function \"%s\"\n",
-	    reiserfs_hashname(sb_hash_function_code(rs)));
-    
-    printk ("Tree height %d\n", sb_tree_height(rs));
-    return 0;
+	skipped = bh->b_blocknr;
+	data_blocks = sb_block_count(rs) - skipped - 1 - sb_bmap_nr(rs) -
+	    (!is_reiserfs_jr(rs) ? sb_jp_journal_size(rs) +
+	     1 : sb_reserved_for_journal(rs)) - sb_free_blocks(rs);
+	printk
+	    ("Busy blocks (skipped %d, bitmaps - %d, journal (or reserved) blocks - %d\n"
+	     "1 super block, %d data blocks\n", skipped, sb_bmap_nr(rs),
+	     (!is_reiserfs_jr(rs) ? (sb_jp_journal_size(rs) + 1) :
+	      sb_reserved_for_journal(rs)), data_blocks);
+	printk("Root block %u\n", sb_root_block(rs));
+	printk("Journal block (first) %d\n", sb_jp_journal_1st_block(rs));
+	printk("Journal dev %d\n", sb_jp_journal_dev(rs));
+	printk("Journal orig size %d\n", sb_jp_journal_size(rs));
+	printk("FS state %d\n", sb_fs_state(rs));
+	printk("Hash function \"%s\"\n",
+	       reiserfs_hashname(sb_hash_function_code(rs)));
+
+	printk("Tree height %d\n", sb_tree_height(rs));
+	return 0;
 }
 
-static int print_desc_block (struct buffer_head * bh)
+static int print_desc_block(struct buffer_head *bh)
 {
-    struct reiserfs_journal_desc * desc;
+	struct reiserfs_journal_desc *desc;
 
-    if (memcmp(get_journal_desc_magic (bh), JOURNAL_DESC_MAGIC, 8))
-	return 1;
+	if (memcmp(get_journal_desc_magic(bh), JOURNAL_DESC_MAGIC, 8))
+		return 1;
 
-    desc = (struct reiserfs_journal_desc *)(bh->b_data);
-    printk ("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)",
-	    (unsigned long long)bh->b_blocknr, get_desc_trans_id (desc), get_desc_mount_id (desc),
-	    get_desc_trans_len (desc));
+	desc = (struct reiserfs_journal_desc *)(bh->b_data);
+	printk("Desc block %llu (j_trans_id %d, j_mount_id %d, j_len %d)",
+	       (unsigned long long)bh->b_blocknr, get_desc_trans_id(desc),
+	       get_desc_mount_id(desc), get_desc_trans_len(desc));
 
-    return 0;
+	return 0;
 }
 
-
-void print_block (struct buffer_head * bh, ...)//int print_mode, int first, int last)
+void print_block(struct buffer_head *bh, ...)	//int print_mode, int first, int last)
 {
-    va_list args;
-    int mode, first, last;
+	va_list args;
+	int mode, first, last;
 
-    va_start (args, bh);
+	if (!bh) {
+		printk("print_block: buffer is NULL\n");
+		return;
+	}
 
-    if ( ! bh ) {
-	printk("print_block: buffer is NULL\n");
-	return;
-    }
+	va_start(args, bh);	/* started after the NULL check so the early return needs no va_end */
 
-    mode = va_arg (args, int);
-    first = va_arg (args, int);
-    last = va_arg (args, int);
-    if (print_leaf (bh, mode, first, last))
-	if (print_internal (bh, first, last))
-	    if (print_super_block (bh))
-		if (print_desc_block (bh))
-		    printk ("Block %llu contains unformatted data\n", (unsigned long long)bh->b_blocknr);
+	mode = va_arg(args, int);
+	first = va_arg(args, int);
+	last = va_arg(args, int);
+	if (print_leaf(bh, mode, first, last))
+		if (print_internal(bh, first, last))
+			if (print_super_block(bh))
+				if (print_desc_block(bh))
+					printk("Block %llu contains unformatted data\n",
+					       (unsigned long long)bh->b_blocknr);
+	va_end(args);	/* every va_start must be paired with va_end before returning */
 }
 
-
-
 static char print_tb_buf[2048];
 
 /* this stores initial state of tree balance in the print_tb_buf */
-void store_print_tb (struct tree_balance * tb)
-{
-    int h = 0;
-    int i;
-    struct buffer_head * tbSh, * tbFh;
-
-    if (!tb)
-	return;
-
-    sprintf (print_tb_buf, "\n"
-	     "BALANCING %d\n"
-	     "MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n" 
-	     "=====================================================================\n"
-	     "* h *    S    *    L    *    R    *   F   *   FL  *   FR  *  CFL  *  CFR  *\n",
-	     REISERFS_SB(tb->tb_sb)->s_do_balance,
-	     tb->tb_mode, PATH_LAST_POSITION (tb->tb_path), tb->tb_path->pos_in_item);
-  
-    for (h = 0; h < sizeof(tb->insert_size) / sizeof (tb->insert_size[0]); h ++) {
-	if (PATH_H_PATH_OFFSET (tb->tb_path, h) <= tb->tb_path->path_length && 
-	    PATH_H_PATH_OFFSET (tb->tb_path, h) > ILLEGAL_PATH_ELEMENT_OFFSET) {
-	    tbSh = PATH_H_PBUFFER (tb->tb_path, h);
-	    tbFh = PATH_H_PPARENT (tb->tb_path, h);
-	} else {
-	    tbSh = NULL;
-	    tbFh = NULL;
+void store_print_tb(struct tree_balance *tb)
+{
+	int h = 0;
+	int i;
+	struct buffer_head *tbSh, *tbFh;
+
+	if (!tb)
+		return;
+
+	sprintf(print_tb_buf, "\n"
+		"BALANCING %d\n"
+		"MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n"
+		"=====================================================================\n"
+		"* h *    S    *    L    *    R    *   F   *   FL  *   FR  *  CFL  *  CFR  *\n",
+		REISERFS_SB(tb->tb_sb)->s_do_balance,
+		tb->tb_mode, PATH_LAST_POSITION(tb->tb_path),
+		tb->tb_path->pos_in_item);
+
+	for (h = 0; h < sizeof(tb->insert_size) / sizeof(tb->insert_size[0]);
+	     h++) {
+		if (PATH_H_PATH_OFFSET(tb->tb_path, h) <=
+		    tb->tb_path->path_length
+		    && PATH_H_PATH_OFFSET(tb->tb_path,
+					  h) > ILLEGAL_PATH_ELEMENT_OFFSET) {
+			tbSh = PATH_H_PBUFFER(tb->tb_path, h);
+			tbFh = PATH_H_PPARENT(tb->tb_path, h);
+		} else {
+			tbSh = NULL;
+			tbFh = NULL;
+		}
+		sprintf(print_tb_buf + strlen(print_tb_buf),
+			"* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n",
+			h,
+			(tbSh) ? (long long)(tbSh->b_blocknr) : (-1LL),
+			(tbSh) ? atomic_read(&(tbSh->b_count)) : -1,
+			(tb->L[h]) ? (long long)(tb->L[h]->b_blocknr) : (-1LL),
+			(tb->L[h]) ? atomic_read(&(tb->L[h]->b_count)) : -1,
+			(tb->R[h]) ? (long long)(tb->R[h]->b_blocknr) : (-1LL),
+			(tb->R[h]) ? atomic_read(&(tb->R[h]->b_count)) : -1,
+			(tbFh) ? (long long)(tbFh->b_blocknr) : (-1LL),
+			(tb->FL[h]) ? (long long)(tb->FL[h]->
+						  b_blocknr) : (-1LL),
+			(tb->FR[h]) ? (long long)(tb->FR[h]->
+						  b_blocknr) : (-1LL),
+			(tb->CFL[h]) ? (long long)(tb->CFL[h]->
+						   b_blocknr) : (-1LL),
+			(tb->CFR[h]) ? (long long)(tb->CFR[h]->
+						   b_blocknr) : (-1LL));
 	}
-	sprintf (print_tb_buf + strlen (print_tb_buf),
-		 "* %d * %3lld(%2d) * %3lld(%2d) * %3lld(%2d) * %5lld * %5lld * %5lld * %5lld * %5lld *\n",
-		 h, 
-		 (tbSh) ? (long long)(tbSh->b_blocknr):(-1LL),
-		 (tbSh) ? atomic_read (&(tbSh->b_count)) : -1,
-		 (tb->L[h]) ? (long long)(tb->L[h]->b_blocknr):(-1LL),
-		 (tb->L[h]) ? atomic_read (&(tb->L[h]->b_count)) : -1,
-		 (tb->R[h]) ? (long long)(tb->R[h]->b_blocknr):(-1LL),
-		 (tb->R[h]) ? atomic_read (&(tb->R[h]->b_count)) : -1,
-		 (tbFh) ? (long long)(tbFh->b_blocknr):(-1LL),
-		 (tb->FL[h]) ? (long long)(tb->FL[h]->b_blocknr):(-1LL),
-		 (tb->FR[h]) ? (long long)(tb->FR[h]->b_blocknr):(-1LL),
-		 (tb->CFL[h]) ? (long long)(tb->CFL[h]->b_blocknr):(-1LL),
-		 (tb->CFR[h]) ? (long long)(tb->CFR[h]->b_blocknr):(-1LL));
-    }
-
-    sprintf (print_tb_buf + strlen (print_tb_buf), 
-	     "=====================================================================\n"
-	     "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n"
-	     "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n",
-	     tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0],tb->rbytes, tb->blknum[0], 
-	     tb->s0num, tb->s1num,tb->s1bytes,  tb->s2num, tb->s2bytes, tb->cur_blknum, tb->lkey[0], tb->rkey[0]);
-
-    /* this prints balance parameters for non-leaf levels */
-    h = 0;
-    do {
-	h++;
-	sprintf (print_tb_buf + strlen (print_tb_buf),
-		 "* %d * %4d * %2d *    * %2d *    * %2d *\n",
-		h, tb->insert_size[h], tb->lnum[h], tb->rnum[h], tb->blknum[h]);
-    } while (tb->insert_size[h]);
-
-    sprintf (print_tb_buf + strlen (print_tb_buf), 
-	     "=====================================================================\n"
-	     "FEB list: ");
-
-    /* print FEB list (list of buffers in form (bh (b_blocknr, b_count), that will be used for new nodes) */
-    h = 0;
-    for (i = 0; i < sizeof (tb->FEB) / sizeof (tb->FEB[0]); i ++)
-	sprintf (print_tb_buf + strlen (print_tb_buf),
-		 "%p (%llu %d)%s", tb->FEB[i], tb->FEB[i] ? (unsigned long long)tb->FEB[i]->b_blocknr : 0ULL,
-		 tb->FEB[i] ? atomic_read (&(tb->FEB[i]->b_count)) : 0, 
-		 (i == sizeof (tb->FEB) / sizeof (tb->FEB[0]) - 1) ? "\n" : ", ");
-
-    sprintf (print_tb_buf + strlen (print_tb_buf), 
-	     "======================== the end ====================================\n");
-}
-
-void print_cur_tb (char * mes)
-{
-    printk ("%s\n%s", mes, print_tb_buf);
-}
-
-static void check_leaf_block_head (struct buffer_head * bh)
-{
-  struct block_head * blkh;
-  int nr;
-
-  blkh = B_BLK_HEAD (bh);
-  nr = blkh_nr_item(blkh);
-  if ( nr > (bh->b_size - BLKH_SIZE) / IH_SIZE)
-    reiserfs_panic (NULL, "vs-6010: check_leaf_block_head: invalid item number %z", bh);
-  if ( blkh_free_space(blkh) > 
-      bh->b_size - BLKH_SIZE - IH_SIZE * nr )
-    reiserfs_panic (NULL, "vs-6020: check_leaf_block_head: invalid free space %z", bh);
-    
-}
 
-static void check_internal_block_head (struct buffer_head * bh)
-{
-    struct block_head * blkh;
-    
-    blkh = B_BLK_HEAD (bh);
-    if (!(B_LEVEL (bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL (bh) <= MAX_HEIGHT))
-	reiserfs_panic (NULL, "vs-6025: check_internal_block_head: invalid level %z", bh);
+	sprintf(print_tb_buf + strlen(print_tb_buf),
+		"=====================================================================\n"
+		"* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n"
+		"* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n",
+		tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0],
+		tb->rbytes, tb->blknum[0], tb->s0num, tb->s1num, tb->s1bytes,
+		tb->s2num, tb->s2bytes, tb->cur_blknum, tb->lkey[0],
+		tb->rkey[0]);
+
+	/* this prints balance parameters for non-leaf levels */
+	h = 0;
+	do {
+		h++;
+		sprintf(print_tb_buf + strlen(print_tb_buf),
+			"* %d * %4d * %2d *    * %2d *    * %2d *\n",
+			h, tb->insert_size[h], tb->lnum[h], tb->rnum[h],
+			tb->blknum[h]);
+	} while (tb->insert_size[h]);
 
-    if (B_NR_ITEMS (bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE)
-	reiserfs_panic (NULL, "vs-6030: check_internal_block_head: invalid item number %z", bh);
+	sprintf(print_tb_buf + strlen(print_tb_buf),
+		"=====================================================================\n"
+		"FEB list: ");
 
-    if (B_FREE_SPACE (bh) != 
-	bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS (bh) - DC_SIZE * (B_NR_ITEMS (bh) + 1))
-	reiserfs_panic (NULL, "vs-6040: check_internal_block_head: invalid free space %z", bh);
+	/* print the FEB list: each buffer that will be used for new nodes, shown as "pointer (b_blocknr b_count)" */
+	h = 0;
+	for (i = 0; i < sizeof(tb->FEB) / sizeof(tb->FEB[0]); i++)
+		sprintf(print_tb_buf + strlen(print_tb_buf),
+			"%p (%llu %d)%s", tb->FEB[i],
+			tb->FEB[i] ? (unsigned long long)tb->FEB[i]->
+			b_blocknr : 0ULL,
+			tb->FEB[i] ? atomic_read(&(tb->FEB[i]->b_count)) : 0,
+			(i ==
+			 sizeof(tb->FEB) / sizeof(tb->FEB[0]) -
+			 1) ? "\n" : ", ");
 
+	sprintf(print_tb_buf + strlen(print_tb_buf),
+		"======================== the end ====================================\n");
 }
 
+void print_cur_tb(char *mes)
+{
+	printk("%s\n%s", mes, print_tb_buf);
+}
 
-void check_leaf (struct buffer_head * bh)
+static void check_leaf_block_head(struct buffer_head *bh)
 {
-    int i;
-    struct item_head * ih;
+	struct block_head *blkh;
+	int nr;
+
+	blkh = B_BLK_HEAD(bh);
+	nr = blkh_nr_item(blkh);
+	if (nr > (bh->b_size - BLKH_SIZE) / IH_SIZE)
+		reiserfs_panic(NULL,
+			       "vs-6010: check_leaf_block_head: invalid item number %z",
+			       bh);
+	if (blkh_free_space(blkh) > bh->b_size - BLKH_SIZE - IH_SIZE * nr)
+		reiserfs_panic(NULL,
+			       "vs-6020: check_leaf_block_head: invalid free space %z",
+			       bh);
 
-    if (!bh)
-	return;
-    check_leaf_block_head (bh);
-    for (i = 0, ih = B_N_PITEM_HEAD (bh, 0); i < B_NR_ITEMS (bh); i ++, ih ++)
-	op_check_item (ih, B_I_PITEM (bh, ih));
 }
 
+static void check_internal_block_head(struct buffer_head *bh)
+{
+	struct block_head *blkh;
+
+	blkh = B_BLK_HEAD(bh);
+	if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT))
+		reiserfs_panic(NULL,
+			       "vs-6025: check_internal_block_head: invalid level %z",
+			       bh);
+
+	if (B_NR_ITEMS(bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE)
+		reiserfs_panic(NULL,
+			       "vs-6030: check_internal_block_head: invalid item number %z",
+			       bh);
+
+	if (B_FREE_SPACE(bh) !=
+	    bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS(bh) -
+	    DC_SIZE * (B_NR_ITEMS(bh) + 1))
+		reiserfs_panic(NULL,
+			       "vs-6040: check_internal_block_head: invalid free space %z",
+			       bh);
+
+}
 
-void check_internal (struct buffer_head * bh)
+void check_leaf(struct buffer_head *bh)
 {
-  if (!bh)
-    return;
-  check_internal_block_head (bh);
+	int i;
+	struct item_head *ih;
+
+	if (!bh)
+		return;
+	check_leaf_block_head(bh);
+	for (i = 0, ih = B_N_PITEM_HEAD(bh, 0); i < B_NR_ITEMS(bh); i++, ih++)
+		op_check_item(ih, B_I_PITEM(bh, ih));
 }
 
+void check_internal(struct buffer_head *bh)
+{
+	if (!bh)
+		return;
+	check_internal_block_head(bh);
+}
 
-void print_statistics (struct super_block * s)
+void print_statistics(struct super_block *s)
 {
 
-  /*
-  printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \
-bmap with search %d, without %d, dir2ind %d, ind2dir %d\n",
-	  REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes,
-	  REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search,
-	  REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct);
-  */
+	/*
+	   printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, \
+	   bmap with search %d, without %d, dir2ind %d, ind2dir %d\n",
+	   REISERFS_SB(s)->s_do_balance, REISERFS_SB(s)->s_fix_nodes,
+	   REISERFS_SB(s)->s_bmaps, REISERFS_SB(s)->s_bmaps_without_search,
+	   REISERFS_SB(s)->s_direct2indirect, REISERFS_SB(s)->s_indirect2direct);
+	 */
 
 }
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index e242ebc7f6f6..fc2f43c75df4 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -33,28 +33,27 @@
 static int show_version(struct seq_file *m, struct super_block *sb)
 {
 	char *format;
-    
-	if ( REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6) ) {
+
+	if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) {
 		format = "3.6";
-	} else if ( REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5) ) {
+	} else if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5)) {
 		format = "3.5";
 	} else {
 		format = "unknown";
 	}
 
-	seq_printf(m, "%s format\twith checks %s\n",
-			format,
+	seq_printf(m, "%s format\twith checks %s\n", format,
 #if defined( CONFIG_REISERFS_CHECK )
-			"on"
+		   "on"
 #else
-			"off"
+		   "off"
 #endif
-		);
+	    );
 	return 0;
 }
 
-int reiserfs_global_version_in_proc( char *buffer, char **start, off_t offset,
-				     int count, int *eof, void *data )
+int reiserfs_global_version_in_proc(char *buffer, char **start, off_t offset,
+				    int count, int *eof, void *data)
 {
 	*start = buffer;
 	*eof = 1;
@@ -79,87 +78,68 @@ int reiserfs_global_version_in_proc( char *buffer, char **start, off_t offset,
 
 #define DJF( x ) le32_to_cpu( rs -> x )
 #define DJV( x ) le32_to_cpu( s_v1 -> x )
-#define DJP( x ) le32_to_cpu( jp -> x ) 
+#define DJP( x ) le32_to_cpu( jp -> x )
 #define JF( x ) ( r -> s_journal -> x )
 
 static int show_super(struct seq_file *m, struct super_block *sb)
 {
 	struct reiserfs_sb_info *r = REISERFS_SB(sb);
-    
-	seq_printf(m,	"state: \t%s\n"
-			"mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n"
-			"gen. counter: \t%i\n"
-			"s_kmallocs: \t%i\n"
-			"s_disk_reads: \t%i\n"
-			"s_disk_writes: \t%i\n"
-			"s_fix_nodes: \t%i\n"
-			"s_do_balance: \t%i\n"
-			"s_unneeded_left_neighbor: \t%i\n"
-			"s_good_search_by_key_reada: \t%i\n"
-			"s_bmaps: \t%i\n"
-			"s_bmaps_without_search: \t%i\n"
-			"s_direct2indirect: \t%i\n"
-			"s_indirect2direct: \t%i\n"
-			"\n"
-			"max_hash_collisions: \t%i\n"
-
-			"breads: \t%lu\n"
-			"bread_misses: \t%lu\n"
-
-			"search_by_key: \t%lu\n"
-			"search_by_key_fs_changed: \t%lu\n"
-			"search_by_key_restarted: \t%lu\n"
-			
-			"insert_item_restarted: \t%lu\n"
-			"paste_into_item_restarted: \t%lu\n"
-			"cut_from_item_restarted: \t%lu\n"
-			"delete_solid_item_restarted: \t%lu\n"
-			"delete_item_restarted: \t%lu\n"
-
-			"leaked_oid: \t%lu\n"
-			"leaves_removable: \t%lu\n",
-
-			SF( s_mount_state ) == REISERFS_VALID_FS ?
-			"REISERFS_VALID_FS" : "REISERFS_ERROR_FS",
-			reiserfs_r5_hash( sb ) ? "FORCE_R5 " : "",
-			reiserfs_rupasov_hash( sb ) ? "FORCE_RUPASOV " : "",
-			reiserfs_tea_hash( sb ) ? "FORCE_TEA " : "",
-			reiserfs_hash_detect( sb ) ? "DETECT_HASH " : "",
-			reiserfs_no_border( sb ) ? "NO_BORDER " : "BORDER ",
-			reiserfs_no_unhashed_relocation( sb ) ? "NO_UNHASHED_RELOCATION " : "",
-			reiserfs_hashed_relocation( sb ) ? "UNHASHED_RELOCATION " : "",
-			reiserfs_test4( sb ) ? "TEST4 " : "",
-			have_large_tails( sb ) ? "TAILS " : have_small_tails(sb)?"SMALL_TAILS ":"NO_TAILS ",
-			replay_only( sb ) ? "REPLAY_ONLY " : "",
-			convert_reiserfs( sb ) ? "CONV " : "",
-
-			atomic_read( &r -> s_generation_counter ),
-			SF( s_kmallocs ),
-			SF( s_disk_reads ),
-			SF( s_disk_writes ),
-			SF( s_fix_nodes ),
-			SF( s_do_balance ),
-			SF( s_unneeded_left_neighbor ),
-			SF( s_good_search_by_key_reada ),
-			SF( s_bmaps ),
-			SF( s_bmaps_without_search ),
-			SF( s_direct2indirect ),
-			SF( s_indirect2direct ),
-			SFP( max_hash_collisions ),
-			SFP( breads ),
-			SFP( bread_miss ),
-			SFP( search_by_key ),
-			SFP( search_by_key_fs_changed ),
-			SFP( search_by_key_restarted ),
-
-			SFP( insert_item_restarted ),
-			SFP( paste_into_item_restarted ),
-			SFP( cut_from_item_restarted ),
-			SFP( delete_solid_item_restarted ),
-			SFP( delete_item_restarted ),
-
-			SFP( leaked_oid ),
-			SFP( leaves_removable ) );
+
+	seq_printf(m, "state: \t%s\n"
+		   "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n"
+		   "gen. counter: \t%i\n"
+		   "s_kmallocs: \t%i\n"
+		   "s_disk_reads: \t%i\n"
+		   "s_disk_writes: \t%i\n"
+		   "s_fix_nodes: \t%i\n"
+		   "s_do_balance: \t%i\n"
+		   "s_unneeded_left_neighbor: \t%i\n"
+		   "s_good_search_by_key_reada: \t%i\n"
+		   "s_bmaps: \t%i\n"
+		   "s_bmaps_without_search: \t%i\n"
+		   "s_direct2indirect: \t%i\n"
+		   "s_indirect2direct: \t%i\n"
+		   "\n"
+		   "max_hash_collisions: \t%i\n"
+		   "breads: \t%lu\n"
+		   "bread_misses: \t%lu\n"
+		   "search_by_key: \t%lu\n"
+		   "search_by_key_fs_changed: \t%lu\n"
+		   "search_by_key_restarted: \t%lu\n"
+		   "insert_item_restarted: \t%lu\n"
+		   "paste_into_item_restarted: \t%lu\n"
+		   "cut_from_item_restarted: \t%lu\n"
+		   "delete_solid_item_restarted: \t%lu\n"
+		   "delete_item_restarted: \t%lu\n"
+		   "leaked_oid: \t%lu\n"
+		   "leaves_removable: \t%lu\n",
+		   SF(s_mount_state) == REISERFS_VALID_FS ?
+		   "REISERFS_VALID_FS" : "REISERFS_ERROR_FS",
+		   reiserfs_r5_hash(sb) ? "FORCE_R5 " : "",
+		   reiserfs_rupasov_hash(sb) ? "FORCE_RUPASOV " : "",
+		   reiserfs_tea_hash(sb) ? "FORCE_TEA " : "",
+		   reiserfs_hash_detect(sb) ? "DETECT_HASH " : "",
+		   reiserfs_no_border(sb) ? "NO_BORDER " : "BORDER ",
+		   reiserfs_no_unhashed_relocation(sb) ?
+		   "NO_UNHASHED_RELOCATION " : "",
+		   reiserfs_hashed_relocation(sb) ? "UNHASHED_RELOCATION " : "",
+		   reiserfs_test4(sb) ? "TEST4 " : "",
+		   have_large_tails(sb) ? "TAILS " : have_small_tails(sb) ?
+		   "SMALL_TAILS " : "NO_TAILS ",
+		   replay_only(sb) ? "REPLAY_ONLY " : "",
+		   convert_reiserfs(sb) ? "CONV " : "",
+		   atomic_read(&r->s_generation_counter), SF(s_kmallocs),
+		   SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes),
+		   SF(s_do_balance), SF(s_unneeded_left_neighbor),
+		   SF(s_good_search_by_key_reada), SF(s_bmaps),
+		   SF(s_bmaps_without_search), SF(s_direct2indirect),
+		   SF(s_indirect2direct), SFP(max_hash_collisions), SFP(breads),
+		   SFP(bread_miss), SFP(search_by_key),
+		   SFP(search_by_key_fs_changed), SFP(search_by_key_restarted),
+		   SFP(insert_item_restarted), SFP(paste_into_item_restarted),
+		   SFP(cut_from_item_restarted),
+		   SFP(delete_solid_item_restarted), SFP(delete_item_restarted),
+		   SFP(leaked_oid), SFP(leaves_removable));
 
 	return 0;
 }
@@ -169,61 +149,55 @@ static int show_per_level(struct seq_file *m, struct super_block *sb)
 	struct reiserfs_sb_info *r = REISERFS_SB(sb);
 	int level;
 
-	seq_printf(m,	"level\t"
-			"     balances"
-			" [sbk:  reads"
-			"   fs_changed"
-			"   restarted]"
-			"   free space"
-			"        items"
-			"   can_remove"
-			"         lnum"
-			"         rnum"
-			"       lbytes"
-			"       rbytes"
-			"     get_neig"
-			" get_neig_res"
-			"  need_l_neig"
-			"  need_r_neig"
-			"\n"
-			
-		);
-
-	for( level = 0 ; level < MAX_HEIGHT ; ++ level ) {
-		seq_printf(m,	"%i\t"
-				" %12lu"
-				" %12lu"
-				" %12lu"
-				" %12lu"
-				" %12lu"
-				" %12lu"
-				" %12lu"
-				" %12li"
-				" %12li"
-				" %12li"
-				" %12li"
-				" %12lu"
-				" %12lu"
-				" %12lu"
-				" %12lu"
-				"\n",
-				level, 
-				SFPL( balance_at ),
-				SFPL( sbk_read_at ),
-				SFPL( sbk_fs_changed ),
-				SFPL( sbk_restarted ),
-				SFPL( free_at ),
-				SFPL( items_at ),
-				SFPL( can_node_be_removed ),
-				SFPL( lnum ),
-				SFPL( rnum ),
-				SFPL( lbytes ),
-				SFPL( rbytes ),
-				SFPL( get_neighbors ),
-				SFPL( get_neighbors_restart ),
-				SFPL( need_l_neighbor ),
-				SFPL( need_r_neighbor )
-			);
+	seq_printf(m, "level\t"
+		   "     balances"
+		   " [sbk:  reads"
+		   "   fs_changed"
+		   "   restarted]"
+		   "   free space"
+		   "        items"
+		   "   can_remove"
+		   "         lnum"
+		   "         rnum"
+		   "       lbytes"
+		   "       rbytes"
+		   "     get_neig"
+		   " get_neig_res" "  need_l_neig" "  need_r_neig" "\n");
+
+	for (level = 0; level < MAX_HEIGHT; ++level) {
+		seq_printf(m, "%i\t"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12li"
+			   " %12li"
+			   " %12li"
+			   " %12li"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   " %12lu"
+			   "\n",
+			   level,
+			   SFPL(balance_at),
+			   SFPL(sbk_read_at),
+			   SFPL(sbk_fs_changed),
+			   SFPL(sbk_restarted),
+			   SFPL(free_at),
+			   SFPL(items_at),
+			   SFPL(can_node_be_removed),
+			   SFPL(lnum),
+			   SFPL(rnum),
+			   SFPL(lbytes),
+			   SFPL(rbytes),
+			   SFPL(get_neighbors),
+			   SFPL(get_neighbors_restart),
+			   SFPL(need_l_neighbor), SFPL(need_r_neighbor)
+		    );
 	}
 	return 0;
 }
@@ -232,31 +206,30 @@ static int show_bitmap(struct seq_file *m, struct super_block *sb)
 {
 	struct reiserfs_sb_info *r = REISERFS_SB(sb);
 
-	seq_printf(m,	"free_block: %lu\n"
-			"  scan_bitmap:"
-			"          wait"
-			"          bmap"
-			"         retry"
-			"        stolen"
-			"  journal_hint"
-			"journal_nohint"
-			"\n"
-			" %14lu"
-			" %14lu"
-			" %14lu"
-			" %14lu"
-			" %14lu"
-			" %14lu"
-			" %14lu"
-			"\n",
-			SFP( free_block ),
-			SFPF( call ), 
-			SFPF( wait ), 
-			SFPF( bmap ),
-			SFPF( retry ),
-			SFPF( stolen ),
-			SFPF( in_journal_hint ),
-			SFPF( in_journal_nohint ) );
+	seq_printf(m, "free_block: %lu\n"
+		   "  scan_bitmap:"
+		   "          wait"
+		   "          bmap"
+		   "         retry"
+		   "        stolen"
+		   "  journal_hint"
+		   "journal_nohint"
+		   "\n"
+		   " %14lu"
+		   " %14lu"
+		   " %14lu"
+		   " %14lu"
+		   " %14lu"
+		   " %14lu"
+		   " %14lu"
+		   "\n",
+		   SFP(free_block),
+		   SFPF(call),
+		   SFPF(wait),
+		   SFPF(bmap),
+		   SFPF(retry),
+		   SFPF(stolen),
+		   SFPF(in_journal_hint), SFPF(in_journal_nohint));
 
 	return 0;
 }
@@ -264,46 +237,42 @@ static int show_bitmap(struct seq_file *m, struct super_block *sb)
 static int show_on_disk_super(struct seq_file *m, struct super_block *sb)
 {
 	struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
-	struct reiserfs_super_block *rs = sb_info -> s_rs;
-	int hash_code = DFL( s_hash_function_code );
-	__u32 flags = DJF( s_flags );
-
-	seq_printf(m,	"block_count: \t%i\n"
-			"free_blocks: \t%i\n"
-			"root_block: \t%i\n"
-			"blocksize: \t%i\n"
-			"oid_maxsize: \t%i\n"
-			"oid_cursize: \t%i\n"
-			"umount_state: \t%i\n"
-			"magic: \t%10.10s\n"
-			"fs_state: \t%i\n"
-			"hash: \t%s\n"
-			"tree_height: \t%i\n"
-			"bmap_nr: \t%i\n"
-			"version: \t%i\n"
-			"flags: \t%x[%s]\n"
-			"reserved_for_journal: \t%i\n",
-
-			DFL( s_block_count ),
-			DFL( s_free_blocks ),
-			DFL( s_root_block ),
-			DF( s_blocksize ),
-			DF( s_oid_maxsize ),
-			DF( s_oid_cursize ),
-			DF( s_umount_state ),
-			rs -> s_v1.s_magic,
-			DF( s_fs_state ),
-			hash_code == TEA_HASH ? "tea" :
-			( hash_code == YURA_HASH ) ? "rupasov" :
-			( hash_code == R5_HASH ) ? "r5" :
-			( hash_code == UNSET_HASH ) ? "unset" : "unknown",
-			DF( s_tree_height ),
-			DF( s_bmap_nr ),
-			DF( s_version ),
-			flags,
-			( flags & reiserfs_attrs_cleared )
-			? "attrs_cleared" : "",
-			DF (s_reserved_for_journal));
+	struct reiserfs_super_block *rs = sb_info->s_rs;
+	int hash_code = DFL(s_hash_function_code);
+	__u32 flags = DJF(s_flags);
+
+	seq_printf(m, "block_count: \t%i\n"
+		   "free_blocks: \t%i\n"
+		   "root_block: \t%i\n"
+		   "blocksize: \t%i\n"
+		   "oid_maxsize: \t%i\n"
+		   "oid_cursize: \t%i\n"
+		   "umount_state: \t%i\n"
+		   "magic: \t%10.10s\n"
+		   "fs_state: \t%i\n"
+		   "hash: \t%s\n"
+		   "tree_height: \t%i\n"
+		   "bmap_nr: \t%i\n"
+		   "version: \t%i\n"
+		   "flags: \t%x[%s]\n"
+		   "reserved_for_journal: \t%i\n",
+		   DFL(s_block_count),
+		   DFL(s_free_blocks),
+		   DFL(s_root_block),
+		   DF(s_blocksize),
+		   DF(s_oid_maxsize),
+		   DF(s_oid_cursize),
+		   DF(s_umount_state),
+		   rs->s_v1.s_magic,
+		   DF(s_fs_state),
+		   hash_code == TEA_HASH ? "tea" :
+		   (hash_code == YURA_HASH) ? "rupasov" :
+		   (hash_code == R5_HASH) ? "r5" :
+		   (hash_code == UNSET_HASH) ? "unset" : "unknown",
+		   DF(s_tree_height),
+		   DF(s_bmap_nr),
+		   DF(s_version), flags, (flags & reiserfs_attrs_cleared)
+		   ? "attrs_cleared" : "", DF(s_reserved_for_journal));
 
 	return 0;
 }
@@ -311,131 +280,122 @@ static int show_on_disk_super(struct seq_file *m, struct super_block *sb)
 static int show_oidmap(struct seq_file *m, struct super_block *sb)
 {
 	struct reiserfs_sb_info *sb_info = REISERFS_SB(sb);
-	struct reiserfs_super_block *rs = sb_info -> s_rs;
-	unsigned int mapsize = le16_to_cpu( rs -> s_v1.s_oid_cursize );
+	struct reiserfs_super_block *rs = sb_info->s_rs;
+	unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize);
 	unsigned long total_used = 0;
 	int i;
 
-	for( i = 0 ; i < mapsize ; ++i ) {
+	for (i = 0; i < mapsize; ++i) {
 		__u32 right;
 
-		right = ( i == mapsize - 1 ) ? MAX_KEY_OBJECTID : MAP( i + 1 );
+		right = (i == mapsize - 1) ? MAX_KEY_OBJECTID : MAP(i + 1);
 		seq_printf(m, "%s: [ %x .. %x )\n",
-				( i & 1 ) ? "free" : "used", MAP( i ), right );
-		if( ! ( i & 1 ) ) {
-			total_used += right - MAP( i );
+			   (i & 1) ? "free" : "used", MAP(i), right);
+		if (!(i & 1)) {
+			total_used += right - MAP(i);
 		}
 	}
 #if defined( REISERFS_USE_OIDMAPF )
-	if( sb_info -> oidmap.use_file && ( sb_info -> oidmap.mapf != NULL ) ) {
+	if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) {
 		loff_t size = sb_info->oidmap.mapf->f_dentry->d_inode->i_size;
-		total_used += size / sizeof( reiserfs_oidinterval_d_t );
+		total_used += size / sizeof(reiserfs_oidinterval_d_t);
 	}
 #endif
-	seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n", 
-			mapsize, 
-			mapsize, le16_to_cpu( rs -> s_v1.s_oid_maxsize ),
-			total_used);
+	seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n",
+		   mapsize,
+		   mapsize, le16_to_cpu(rs->s_v1.s_oid_maxsize), total_used);
 	return 0;
 }
 
 static int show_journal(struct seq_file *m, struct super_block *sb)
 {
 	struct reiserfs_sb_info *r = REISERFS_SB(sb);
-	struct reiserfs_super_block *rs = r -> s_rs;
+	struct reiserfs_super_block *rs = r->s_rs;
 	struct journal_params *jp = &rs->s_v1.s_journal;
 	char b[BDEVNAME_SIZE];
-    
-
-	seq_printf(m,	 /* on-disk fields */
- 			"jp_journal_1st_block: \t%i\n"
- 			"jp_journal_dev: \t%s[%x]\n"
- 			"jp_journal_size: \t%i\n"
- 			"jp_journal_trans_max: \t%i\n"
- 			"jp_journal_magic: \t%i\n"
- 			"jp_journal_max_batch: \t%i\n"
- 			"jp_journal_max_commit_age: \t%i\n"
- 			"jp_journal_max_trans_age: \t%i\n"
-			/* incore fields */
-			"j_1st_reserved_block: \t%i\n"	  
-			"j_state: \t%li\n"			
-			"j_trans_id: \t%lu\n"
-			"j_mount_id: \t%lu\n"
-			"j_start: \t%lu\n"
-			"j_len: \t%lu\n"
-			"j_len_alloc: \t%lu\n"
-			"j_wcount: \t%i\n"
-			"j_bcount: \t%lu\n"
-			"j_first_unflushed_offset: \t%lu\n"
-			"j_last_flush_trans_id: \t%lu\n"
-			"j_trans_start_time: \t%li\n"
-			"j_list_bitmap_index: \t%i\n"
-			"j_must_wait: \t%i\n"
-			"j_next_full_flush: \t%i\n"
-			"j_next_async_flush: \t%i\n"
-			"j_cnode_used: \t%i\n"
-			"j_cnode_free: \t%i\n"
-			"\n"
-			/* reiserfs_proc_info_data_t.journal fields */
-			"in_journal: \t%12lu\n"
-			"in_journal_bitmap: \t%12lu\n"
-			"in_journal_reusable: \t%12lu\n"
-			"lock_journal: \t%12lu\n"
-			"lock_journal_wait: \t%12lu\n"
-			"journal_begin: \t%12lu\n"
-			"journal_relock_writers: \t%12lu\n"
-			"journal_relock_wcount: \t%12lu\n"
-			"mark_dirty: \t%12lu\n"
-			"mark_dirty_already: \t%12lu\n"
-			"mark_dirty_notjournal: \t%12lu\n"
-			"restore_prepared: \t%12lu\n"
-			"prepare: \t%12lu\n"
-			"prepare_retry: \t%12lu\n",
-
-                        DJP( jp_journal_1st_block ),
-                        bdevname(SB_JOURNAL(sb)->j_dev_bd, b),
-                        DJP( jp_journal_dev ),
-                        DJP( jp_journal_size ),
-                        DJP( jp_journal_trans_max ),
-                        DJP( jp_journal_magic ),
-                        DJP( jp_journal_max_batch ),
-			SB_JOURNAL(sb)->j_max_commit_age,
-                        DJP( jp_journal_max_trans_age ),
-
-			JF( j_1st_reserved_block ),			
-			JF( j_state ),			
-			JF( j_trans_id ),
-			JF( j_mount_id ),
-			JF( j_start ),
-			JF( j_len ),
-			JF( j_len_alloc ),
-			atomic_read( & r -> s_journal -> j_wcount ),
-			JF( j_bcount ),
-			JF( j_first_unflushed_offset ),
-			JF( j_last_flush_trans_id ),
-			JF( j_trans_start_time ),
-			JF( j_list_bitmap_index ),
-			JF( j_must_wait ),
-			JF( j_next_full_flush ),
-			JF( j_next_async_flush ),
-			JF( j_cnode_used ),
-			JF( j_cnode_free ),
-
-			SFPJ( in_journal ),
-			SFPJ( in_journal_bitmap ),
-			SFPJ( in_journal_reusable ),
-			SFPJ( lock_journal ),
-			SFPJ( lock_journal_wait ),
-			SFPJ( journal_being ),
-			SFPJ( journal_relock_writers ),
-			SFPJ( journal_relock_wcount ),
-			SFPJ( mark_dirty ),
-			SFPJ( mark_dirty_already ),
-			SFPJ( mark_dirty_notjournal ),
-			SFPJ( restore_prepared ),
-			SFPJ( prepare ),
-			SFPJ( prepare_retry )
-		);
+
+	seq_printf(m,		/* on-disk fields */
+		   "jp_journal_1st_block: \t%i\n"
+		   "jp_journal_dev: \t%s[%x]\n"
+		   "jp_journal_size: \t%i\n"
+		   "jp_journal_trans_max: \t%i\n"
+		   "jp_journal_magic: \t%i\n"
+		   "jp_journal_max_batch: \t%i\n"
+		   "jp_journal_max_commit_age: \t%i\n"
+		   "jp_journal_max_trans_age: \t%i\n"
+		   /* incore fields */
+		   "j_1st_reserved_block: \t%i\n"
+		   "j_state: \t%li\n"
+		   "j_trans_id: \t%lu\n"
+		   "j_mount_id: \t%lu\n"
+		   "j_start: \t%lu\n"
+		   "j_len: \t%lu\n"
+		   "j_len_alloc: \t%lu\n"
+		   "j_wcount: \t%i\n"
+		   "j_bcount: \t%lu\n"
+		   "j_first_unflushed_offset: \t%lu\n"
+		   "j_last_flush_trans_id: \t%lu\n"
+		   "j_trans_start_time: \t%li\n"
+		   "j_list_bitmap_index: \t%i\n"
+		   "j_must_wait: \t%i\n"
+		   "j_next_full_flush: \t%i\n"
+		   "j_next_async_flush: \t%i\n"
+		   "j_cnode_used: \t%i\n" "j_cnode_free: \t%i\n" "\n"
+		   /* reiserfs_proc_info_data_t.journal fields */
+		   "in_journal: \t%12lu\n"
+		   "in_journal_bitmap: \t%12lu\n"
+		   "in_journal_reusable: \t%12lu\n"
+		   "lock_journal: \t%12lu\n"
+		   "lock_journal_wait: \t%12lu\n"
+		   "journal_begin: \t%12lu\n"
+		   "journal_relock_writers: \t%12lu\n"
+		   "journal_relock_wcount: \t%12lu\n"
+		   "mark_dirty: \t%12lu\n"
+		   "mark_dirty_already: \t%12lu\n"
+		   "mark_dirty_notjournal: \t%12lu\n"
+		   "restore_prepared: \t%12lu\n"
+		   "prepare: \t%12lu\n"
+		   "prepare_retry: \t%12lu\n",
+		   DJP(jp_journal_1st_block),
+		   bdevname(SB_JOURNAL(sb)->j_dev_bd, b),
+		   DJP(jp_journal_dev),
+		   DJP(jp_journal_size),
+		   DJP(jp_journal_trans_max),
+		   DJP(jp_journal_magic),
+		   DJP(jp_journal_max_batch),
+		   SB_JOURNAL(sb)->j_max_commit_age,
+		   DJP(jp_journal_max_trans_age),
+		   JF(j_1st_reserved_block),
+		   JF(j_state),
+		   JF(j_trans_id),
+		   JF(j_mount_id),
+		   JF(j_start),
+		   JF(j_len),
+		   JF(j_len_alloc),
+		   atomic_read(&r->s_journal->j_wcount),
+		   JF(j_bcount),
+		   JF(j_first_unflushed_offset),
+		   JF(j_last_flush_trans_id),
+		   JF(j_trans_start_time),
+		   JF(j_list_bitmap_index),
+		   JF(j_must_wait),
+		   JF(j_next_full_flush),
+		   JF(j_next_async_flush),
+		   JF(j_cnode_used),
+		   JF(j_cnode_free),
+		   SFPJ(in_journal),
+		   SFPJ(in_journal_bitmap),
+		   SFPJ(in_journal_reusable),
+		   SFPJ(lock_journal),
+		   SFPJ(lock_journal_wait),
+		   SFPJ(journal_being),
+		   SFPJ(journal_relock_writers),
+		   SFPJ(journal_relock_wcount),
+		   SFPJ(mark_dirty),
+		   SFPJ(mark_dirty_already),
+		   SFPJ(mark_dirty_notjournal),
+		   SFPJ(restore_prepared), SFPJ(prepare), SFPJ(prepare_retry)
+	    );
 	return 0;
 }
 
@@ -450,7 +410,7 @@ static int set_sb(struct super_block *sb, void *data)
 	return -ENOENT;
 }
 
-static void *r_start(struct seq_file *m, loff_t *pos)
+static void *r_start(struct seq_file *m, loff_t * pos)
 {
 	struct proc_dir_entry *de = m->private;
 	struct super_block *s = de->parent->data;
@@ -472,7 +432,7 @@ static void *r_start(struct seq_file *m, loff_t *pos)
 	return s;
 }
 
-static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+static void *r_next(struct seq_file *m, void *v, loff_t * pos)
 {
 	++*pos;
 	if (v)
@@ -489,7 +449,7 @@ static void r_stop(struct seq_file *m, void *v)
 static int r_show(struct seq_file *m, void *v)
 {
 	struct proc_dir_entry *de = m->private;
-	int (*show)(struct seq_file *, struct super_block *) = de->data;
+	int (*show) (struct seq_file *, struct super_block *) = de->data;
 	return show(m, v);
 }
 
@@ -512,17 +472,17 @@ static int r_open(struct inode *inode, struct file *file)
 }
 
 static struct file_operations r_file_operations = {
-	.open		= r_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.open = r_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
 };
 
 static struct proc_dir_entry *proc_info_root = NULL;
 static const char proc_info_root_name[] = "fs/reiserfs";
 
 static void add_file(struct super_block *sb, char *name,
-	int (*func)(struct seq_file *, struct super_block *))
+		     int (*func) (struct seq_file *, struct super_block *))
 {
 	struct proc_dir_entry *de;
 	de = create_proc_entry(name, 0, REISERFS_SB(sb)->procdir);
@@ -532,11 +492,12 @@ static void add_file(struct super_block *sb, char *name,
 	}
 }
 
-int reiserfs_proc_info_init( struct super_block *sb )
+int reiserfs_proc_info_init(struct super_block *sb)
 {
-	spin_lock_init( & __PINFO( sb ).lock );
-	REISERFS_SB(sb)->procdir = proc_mkdir(reiserfs_bdevname (sb), proc_info_root);
-	if( REISERFS_SB(sb)->procdir ) {
+	spin_lock_init(&__PINFO(sb).lock);
+	REISERFS_SB(sb)->procdir =
+	    proc_mkdir(reiserfs_bdevname(sb), proc_info_root);
+	if (REISERFS_SB(sb)->procdir) {
 		REISERFS_SB(sb)->procdir->owner = THIS_MODULE;
 		REISERFS_SB(sb)->procdir->data = sb;
 		add_file(sb, "version", show_version);
@@ -549,11 +510,11 @@ int reiserfs_proc_info_init( struct super_block *sb )
 		return 0;
 	}
 	reiserfs_warning(sb, "reiserfs: cannot create /proc/%s/%s",
-			 proc_info_root_name, reiserfs_bdevname (sb) );
+			 proc_info_root_name, reiserfs_bdevname(sb));
 	return 1;
 }
 
-int reiserfs_proc_info_done( struct super_block *sb )
+int reiserfs_proc_info_done(struct super_block *sb)
 {
 	struct proc_dir_entry *de = REISERFS_SB(sb)->procdir;
 	if (de) {
@@ -565,48 +526,48 @@ int reiserfs_proc_info_done( struct super_block *sb )
 		remove_proc_entry("super", de);
 		remove_proc_entry("version", de);
 	}
-	spin_lock( & __PINFO( sb ).lock );
-	__PINFO( sb ).exiting = 1;
-	spin_unlock( & __PINFO( sb ).lock );
-	if ( proc_info_root ) {
-		remove_proc_entry( reiserfs_bdevname (sb), proc_info_root );
+	spin_lock(&__PINFO(sb).lock);
+	__PINFO(sb).exiting = 1;
+	spin_unlock(&__PINFO(sb).lock);
+	if (proc_info_root) {
+		remove_proc_entry(reiserfs_bdevname(sb), proc_info_root);
 		REISERFS_SB(sb)->procdir = NULL;
 	}
 	return 0;
 }
 
-struct proc_dir_entry *reiserfs_proc_register_global( char *name, 
-						      read_proc_t *func )
+struct proc_dir_entry *reiserfs_proc_register_global(char *name,
+						     read_proc_t * func)
 {
-	return ( proc_info_root ) ? create_proc_read_entry( name, 0, 
-							    proc_info_root, 
-							    func, NULL ) : NULL;
+	return (proc_info_root) ? create_proc_read_entry(name, 0,
+							 proc_info_root,
+							 func, NULL) : NULL;
 }
 
-void reiserfs_proc_unregister_global( const char *name )
+void reiserfs_proc_unregister_global(const char *name)
 {
-	remove_proc_entry( name, proc_info_root );
+	remove_proc_entry(name, proc_info_root);
 }
 
-int reiserfs_proc_info_global_init( void )
+int reiserfs_proc_info_global_init(void)
 {
-	if( proc_info_root == NULL ) {
+	if (proc_info_root == NULL) {
 		proc_info_root = proc_mkdir(proc_info_root_name, NULL);
-		if( proc_info_root ) {
-			proc_info_root -> owner = THIS_MODULE;
+		if (proc_info_root) {
+			proc_info_root->owner = THIS_MODULE;
 		} else {
-			reiserfs_warning (NULL,
-					  "reiserfs: cannot create /proc/%s",
-					  proc_info_root_name );
+			reiserfs_warning(NULL,
+					 "reiserfs: cannot create /proc/%s",
+					 proc_info_root_name);
 			return 1;
 		}
 	}
 	return 0;
 }
 
-int reiserfs_proc_info_global_done( void )
+int reiserfs_proc_info_global_done(void)
 {
-	if ( proc_info_root != NULL ) {
+	if (proc_info_root != NULL) {
 		proc_info_root = NULL;
 		remove_proc_entry(proc_info_root_name, NULL);
 	}
@@ -616,22 +577,40 @@ int reiserfs_proc_info_global_done( void )
 /* REISERFS_PROC_INFO */
 #else
 
-int reiserfs_proc_info_init( struct super_block *sb ) { return 0; }
-int reiserfs_proc_info_done( struct super_block *sb ) { return 0; }
+int reiserfs_proc_info_init(struct super_block *sb)
+{
+	return 0;
+}
+int reiserfs_proc_info_done(struct super_block *sb)
+{
+	return 0;
+}
 
-struct proc_dir_entry *reiserfs_proc_register_global( char *name, 
-						      read_proc_t *func )
-{ return NULL; }
+struct proc_dir_entry *reiserfs_proc_register_global(char *name,
+						     read_proc_t * func)
+{
+	return NULL;
+}
 
-void reiserfs_proc_unregister_global( const char *name ) {;}
+void reiserfs_proc_unregister_global(const char *name)
+{				/* stub: the register stub returns NULL, nothing to remove */
+}
 
-int reiserfs_proc_info_global_init( void ) { return 0; }
-int reiserfs_proc_info_global_done( void ) { return 0; }
+int reiserfs_proc_info_global_init(void)
+{
+	return 0;
+}
+int reiserfs_proc_info_global_done(void)
+{
+	return 0;
+}
 
-int reiserfs_global_version_in_proc( char *buffer, char **start, 
-				     off_t offset,
-				     int count, int *eof, void *data )
-{ return 0; }
+int reiserfs_global_version_in_proc(char *buffer, char **start,
+				    off_t offset,
+				    int count, int *eof, void *data)
+{
+	return 0;
+}
 
 /* REISERFS_PROC_INFO */
 #endif
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 170012078b76..39cc7f47f5dc 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -1,7 +1,7 @@
 /* 
  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
  */
- 
+
 /* 
  * Written by Alexander Zarochentcev.
  *
@@ -17,23 +17,23 @@
 #include <linux/reiserfs_fs_sb.h>
 #include <linux/buffer_head.h>
 
-int reiserfs_resize (struct super_block * s, unsigned long block_count_new)
+int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
 {
-        int err = 0;
-	struct reiserfs_super_block * sb;
-        struct reiserfs_bitmap_info *bitmap;
+	int err = 0;
+	struct reiserfs_super_block *sb;
+	struct reiserfs_bitmap_info *bitmap;
 	struct reiserfs_bitmap_info *old_bitmap = SB_AP_BITMAP(s);
-	struct buffer_head * bh;
+	struct buffer_head *bh;
 	struct reiserfs_transaction_handle th;
 	unsigned int bmap_nr_new, bmap_nr;
 	unsigned int block_r_new, block_r;
-	
-	struct reiserfs_list_bitmap * jb;
+
+	struct reiserfs_list_bitmap *jb;
 	struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS];
-	
+
 	unsigned long int block_count, free_blocks;
 	int i;
-	int copy_size ;
+	int copy_size;
 
 	sb = SB_DISK_SUPER_BLOCK(s);
 
@@ -47,136 +47,145 @@ int reiserfs_resize (struct super_block * s, unsigned long block_count_new)
 	if (!bh) {
 		printk("reiserfs_resize: can\'t read last block\n");
 		return -EINVAL;
-	}	
+	}
 	bforget(bh);
 
 	/* old disk layout detection; those partitions can be mounted, but
 	 * cannot be resized */
-	if (SB_BUFFER_WITH_SB(s)->b_blocknr *	SB_BUFFER_WITH_SB(s)->b_size 
-		!= REISERFS_DISK_OFFSET_IN_BYTES ) {
-		printk("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n");
+	if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size
+	    != REISERFS_DISK_OFFSET_IN_BYTES) {
+		printk
+		    ("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n");
 		return -ENOTSUPP;
 	}
-       
+
 	/* count used bits in last bitmap block */
-	block_r = SB_BLOCK_COUNT(s) -
-	        (SB_BMAP_NR(s) - 1) * s->s_blocksize * 8;
-	
+	block_r = SB_BLOCK_COUNT(s) - (SB_BMAP_NR(s) - 1) * s->s_blocksize * 8;
+
 	/* count bitmap blocks in new fs */
-	bmap_nr_new = block_count_new / ( s->s_blocksize * 8 );
+	bmap_nr_new = block_count_new / (s->s_blocksize * 8);
 	block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8;
-	if (block_r_new) 
+	if (block_r_new)
 		bmap_nr_new++;
 	else
 		block_r_new = s->s_blocksize * 8;
 
 	/* save old values */
 	block_count = SB_BLOCK_COUNT(s);
-	bmap_nr     = SB_BMAP_NR(s);
+	bmap_nr = SB_BMAP_NR(s);
 
 	/* resizing of reiserfs bitmaps (journal and real), if needed */
-	if (bmap_nr_new > bmap_nr) {	    
-	    /* reallocate journal bitmaps */
-	    if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) {
-		printk("reiserfs_resize: unable to allocate memory for journal bitmaps\n");
-		unlock_super(s) ;
-		return -ENOMEM ;
-	    }
-	    /* the new journal bitmaps are zero filled, now we copy in the bitmap
-	    ** node pointers from the old journal bitmap structs, and then
-	    ** transfer the new data structures into the journal struct.
-	    **
-	    ** using the copy_size var below allows this code to work for
-	    ** both shrinking and expanding the FS.
-	    */
-	    copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr ;
-	    copy_size = copy_size * sizeof(struct reiserfs_list_bitmap_node *) ;
-	    for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) {
-		struct reiserfs_bitmap_node **node_tmp ;
-		jb = SB_JOURNAL(s)->j_list_bitmap + i ;
-		memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size) ;
-
-		/* just in case vfree schedules on us, copy the new
-		** pointer into the journal struct before freeing the 
-		** old one
-		*/
-		node_tmp = jb->bitmaps ;
-		jb->bitmaps = jbitmap[i].bitmaps ;
-		vfree(node_tmp) ;
-	    }	
-	
-	    /* allocate additional bitmap blocks, reallocate array of bitmap
-	     * block pointers */
-	    bitmap = vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
-	    if (!bitmap) {
-		/* Journal bitmaps are still supersized, but the memory isn't
-		 * leaked, so I guess it's ok */
-		printk("reiserfs_resize: unable to allocate memory.\n");
-		return -ENOMEM;
-	    }
-	    memset (bitmap, 0, sizeof (struct reiserfs_bitmap_info) * SB_BMAP_NR(s));
-	    for (i = 0; i < bmap_nr; i++)
-		bitmap[i] = old_bitmap[i];
-
-	    /* This doesn't go through the journal, but it doesn't have to.
-	     * The changes are still atomic: We're synced up when the journal
-	     * transaction begins, and the new bitmaps don't matter if the
-	     * transaction fails. */
-	    for (i = bmap_nr; i < bmap_nr_new; i++) {
-		bitmap[i].bh = sb_getblk(s, i * s->s_blocksize * 8);
-		memset(bitmap[i].bh->b_data, 0, sb_blocksize(sb));
-		reiserfs_test_and_set_le_bit(0, bitmap[i].bh->b_data);
-
-		set_buffer_uptodate(bitmap[i].bh);
-		mark_buffer_dirty(bitmap[i].bh) ;
-		sync_dirty_buffer(bitmap[i].bh);
-		// update bitmap_info stuff
-		bitmap[i].first_zero_hint=1;
-		bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
-	    }	
-	    /* free old bitmap blocks array */
-	    SB_AP_BITMAP(s) = bitmap;
-	    vfree (old_bitmap);
+	if (bmap_nr_new > bmap_nr) {
+		/* reallocate journal bitmaps */
+		if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) {
+			printk
+			    ("reiserfs_resize: unable to allocate memory for journal bitmaps\n");
+			unlock_super(s);
+			return -ENOMEM;
+		}
+		/* the new journal bitmaps are zero filled, now we copy in the bitmap
+		 ** node pointers from the old journal bitmap structs, and then
+		 ** transfer the new data structures into the journal struct.
+		 **
+		 ** using the copy_size var below allows this code to work for
+		 ** both shrinking and expanding the FS.
+		 */
+		copy_size = bmap_nr_new < bmap_nr ? bmap_nr_new : bmap_nr;
+		copy_size =
+		    copy_size * sizeof(struct reiserfs_list_bitmap_node *);
+		for (i = 0; i < JOURNAL_NUM_BITMAPS; i++) {
+			struct reiserfs_bitmap_node **node_tmp;
+			jb = SB_JOURNAL(s)->j_list_bitmap + i;
+			memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size);
+
+			/* just in case vfree schedules on us, copy the new
+			 ** pointer into the journal struct before freeing the 
+			 ** old one
+			 */
+			node_tmp = jb->bitmaps;
+			jb->bitmaps = jbitmap[i].bitmaps;
+			vfree(node_tmp);
+		}
+
+		/* allocate additional bitmap blocks, reallocate array of bitmap
+		 * block pointers */
+		bitmap =
+		    vmalloc(sizeof(struct reiserfs_bitmap_info) * bmap_nr_new);
+		if (!bitmap) {
+			/* Journal bitmaps are still supersized, but the memory isn't
+			 * leaked, so I guess it's ok */
+			printk("reiserfs_resize: unable to allocate memory.\n");
+			return -ENOMEM;
+		}
+		memset(bitmap, 0,
+		       sizeof(struct reiserfs_bitmap_info) * SB_BMAP_NR(s));
+		for (i = 0; i < bmap_nr; i++)
+			bitmap[i] = old_bitmap[i];
+
+		/* This doesn't go through the journal, but it doesn't have to.
+		 * The changes are still atomic: We're synced up when the journal
+		 * transaction begins, and the new bitmaps don't matter if the
+		 * transaction fails. */
+		for (i = bmap_nr; i < bmap_nr_new; i++) {
+			bitmap[i].bh = sb_getblk(s, i * s->s_blocksize * 8);
+			memset(bitmap[i].bh->b_data, 0, sb_blocksize(sb));
+			reiserfs_test_and_set_le_bit(0, bitmap[i].bh->b_data);
+
+			set_buffer_uptodate(bitmap[i].bh);
+			mark_buffer_dirty(bitmap[i].bh);
+			sync_dirty_buffer(bitmap[i].bh);
+			// update bitmap_info stuff
+			bitmap[i].first_zero_hint = 1;
+			bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
+		}
+		/* free old bitmap blocks array */
+		SB_AP_BITMAP(s) = bitmap;
+		vfree(old_bitmap);
 	}
-	
+
 	/* begin transaction, if there was an error, it's fine. Yes, we have
 	 * incorrect bitmaps now, but none of it is ever going to touch the
 	 * disk anyway. */
 	err = journal_begin(&th, s, 10);
 	if (err)
-	    return err;
+		return err;
 
 	/* correct last bitmap blocks in old and new disk layout */
 	reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[bmap_nr - 1].bh, 1);
 	for (i = block_r; i < s->s_blocksize * 8; i++)
-	    reiserfs_test_and_clear_le_bit(i, 
-					   SB_AP_BITMAP(s)[bmap_nr - 1].bh->b_data);
+		reiserfs_test_and_clear_le_bit(i,
+					       SB_AP_BITMAP(s)[bmap_nr -
+							       1].bh->b_data);
 	SB_AP_BITMAP(s)[bmap_nr - 1].free_count += s->s_blocksize * 8 - block_r;
-	if ( !SB_AP_BITMAP(s)[bmap_nr - 1].first_zero_hint)
-	    SB_AP_BITMAP(s)[bmap_nr - 1].first_zero_hint = block_r;
+	if (!SB_AP_BITMAP(s)[bmap_nr - 1].first_zero_hint)
+		SB_AP_BITMAP(s)[bmap_nr - 1].first_zero_hint = block_r;
 
 	journal_mark_dirty(&th, s, SB_AP_BITMAP(s)[bmap_nr - 1].bh);
 
 	reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[bmap_nr_new - 1].bh, 1);
 	for (i = block_r_new; i < s->s_blocksize * 8; i++)
-	    reiserfs_test_and_set_le_bit(i,
-					 SB_AP_BITMAP(s)[bmap_nr_new - 1].bh->b_data);
+		reiserfs_test_and_set_le_bit(i,
+					     SB_AP_BITMAP(s)[bmap_nr_new -
+							     1].bh->b_data);
 	journal_mark_dirty(&th, s, SB_AP_BITMAP(s)[bmap_nr_new - 1].bh);
- 
-	SB_AP_BITMAP(s)[bmap_nr_new - 1].free_count -= s->s_blocksize * 8 - block_r_new;
+
+	SB_AP_BITMAP(s)[bmap_nr_new - 1].free_count -=
+	    s->s_blocksize * 8 - block_r_new;
 	/* Extreme case where last bitmap is the only valid block in itself. */
-	if ( !SB_AP_BITMAP(s)[bmap_nr_new - 1].free_count )
-	    SB_AP_BITMAP(s)[bmap_nr_new - 1].first_zero_hint = 0;
- 	/* update super */
-	reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
+	if (!SB_AP_BITMAP(s)[bmap_nr_new - 1].free_count)
+		SB_AP_BITMAP(s)[bmap_nr_new - 1].first_zero_hint = 0;
+	/* update super */
+	reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
 	free_blocks = SB_FREE_BLOCKS(s);
-	PUT_SB_FREE_BLOCKS(s, free_blocks + (block_count_new - block_count - (bmap_nr_new - bmap_nr)));
+	PUT_SB_FREE_BLOCKS(s,
+			   free_blocks + (block_count_new - block_count -
+					  (bmap_nr_new - bmap_nr)));
 	PUT_SB_BLOCK_COUNT(s, block_count_new);
 	PUT_SB_BMAP_NR(s, bmap_nr_new);
 	s->s_dirt = 1;
 
 	journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
-	
+
 	SB_JOURNAL(s)->j_must_wait = 1;
 	return journal_end(&th, s, 10);
 }
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 63158491e152..e2d08d7bcffc 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -59,46 +59,45 @@
 #include <linux/quotaops.h>
 
 /* Does the buffer contain a disk block which is in the tree. */
-inline int B_IS_IN_TREE (const struct buffer_head * p_s_bh)
+inline int B_IS_IN_TREE(const struct buffer_head *p_s_bh)
 {
 
-  RFALSE( B_LEVEL (p_s_bh) > MAX_HEIGHT,
-	  "PAP-1010: block (%b) has too big level (%z)", p_s_bh, p_s_bh);
+	RFALSE(B_LEVEL(p_s_bh) > MAX_HEIGHT,
+	       "PAP-1010: block (%b) has too big level (%z)", p_s_bh, p_s_bh);
 
-  return ( B_LEVEL (p_s_bh) != FREE_LEVEL );
+	return (B_LEVEL(p_s_bh) != FREE_LEVEL);
 }
 
 //
 // to gets item head in le form
 //
-inline void copy_item_head(struct item_head * p_v_to, 
-			   const struct item_head * p_v_from)
+inline void copy_item_head(struct item_head *p_v_to,
+			   const struct item_head *p_v_from)
 {
-  memcpy (p_v_to, p_v_from, IH_SIZE);
+	memcpy(p_v_to, p_v_from, IH_SIZE);
 }
 
-
 /* k1 is pointer to on-disk structure which is stored in little-endian
    form. k2 is pointer to cpu variable. For key of items of the same
    object this returns 0.
    Returns: -1 if key1 < key2 
    0 if key1 == key2
    1 if key1 > key2 */
-inline int  comp_short_keys (const struct reiserfs_key * le_key,
-			     const struct cpu_key * cpu_key)
+inline int comp_short_keys(const struct reiserfs_key *le_key,
+			   const struct cpu_key *cpu_key)
 {
-  __u32 n;
-  n = le32_to_cpu(le_key->k_dir_id);
-  if (n < cpu_key->on_disk_key.k_dir_id)
-      return -1;
-  if (n > cpu_key->on_disk_key.k_dir_id)
-      return 1;
-  n = le32_to_cpu(le_key->k_objectid);
-  if (n < cpu_key->on_disk_key.k_objectid)
-      return -1;
-  if (n > cpu_key->on_disk_key.k_objectid)
-      return 1;
-  return 0;
+	__u32 n;
+	n = le32_to_cpu(le_key->k_dir_id);
+	if (n < cpu_key->on_disk_key.k_dir_id)
+		return -1;
+	if (n > cpu_key->on_disk_key.k_dir_id)
+		return 1;
+	n = le32_to_cpu(le_key->k_objectid);
+	if (n < cpu_key->on_disk_key.k_objectid)
+		return -1;
+	if (n > cpu_key->on_disk_key.k_objectid)
+		return 1;
+	return 0;
 }
 
 /* k1 is pointer to on-disk structure which is stored in little-endian
@@ -106,68 +105,72 @@ inline int  comp_short_keys (const struct reiserfs_key * le_key,
    Compare keys using all 4 key fields.
    Returns: -1 if key1 < key2 0
    if key1 = key2 1 if key1 > key2 */
-static inline int  comp_keys (const struct reiserfs_key * le_key, const struct cpu_key * cpu_key)
+static inline int comp_keys(const struct reiserfs_key *le_key,
+			    const struct cpu_key *cpu_key)
 {
-  int retval;
-
-  retval = comp_short_keys (le_key, cpu_key);
-  if (retval)
-      return retval;
-  if (le_key_k_offset (le_key_version(le_key), le_key) < cpu_key_k_offset (cpu_key))
-      return -1;
-  if (le_key_k_offset (le_key_version(le_key), le_key) > cpu_key_k_offset (cpu_key))
-      return 1;
-
-  if (cpu_key->key_length == 3)
-      return 0;
-
-  /* this part is needed only when tail conversion is in progress */
-  if (le_key_k_type (le_key_version(le_key), le_key) < cpu_key_k_type (cpu_key))
-    return -1;
+	int retval;
+
+	retval = comp_short_keys(le_key, cpu_key);
+	if (retval)
+		return retval;
+	if (le_key_k_offset(le_key_version(le_key), le_key) <
+	    cpu_key_k_offset(cpu_key))
+		return -1;
+	if (le_key_k_offset(le_key_version(le_key), le_key) >
+	    cpu_key_k_offset(cpu_key))
+		return 1;
+
+	if (cpu_key->key_length == 3)
+		return 0;
+
+	/* this part is needed only when tail conversion is in progress */
+	if (le_key_k_type(le_key_version(le_key), le_key) <
+	    cpu_key_k_type(cpu_key))
+		return -1;
+
+	if (le_key_k_type(le_key_version(le_key), le_key) >
+	    cpu_key_k_type(cpu_key))
+		return 1;
 
-  if (le_key_k_type (le_key_version(le_key), le_key) > cpu_key_k_type (cpu_key))
-    return 1;
-
-  return 0;
+	return 0;
 }
 
-
-inline int comp_short_le_keys (const struct reiserfs_key * key1, const struct reiserfs_key * key2)
+inline int comp_short_le_keys(const struct reiserfs_key *key1,
+			      const struct reiserfs_key *key2)
 {
-  __u32 * p_s_1_u32, * p_s_2_u32;
-  int n_key_length = REISERFS_SHORT_KEY_LEN;
-
-  p_s_1_u32 = (__u32 *)key1;
-  p_s_2_u32 = (__u32 *)key2;
-  for( ; n_key_length--; ++p_s_1_u32, ++p_s_2_u32 ) {
-    if ( le32_to_cpu (*p_s_1_u32) < le32_to_cpu (*p_s_2_u32) )
-      return -1;
-    if ( le32_to_cpu (*p_s_1_u32) > le32_to_cpu (*p_s_2_u32) )
-      return 1;
-  }
-  return 0;
+	__u32 *p_s_1_u32, *p_s_2_u32;
+	int n_key_length = REISERFS_SHORT_KEY_LEN;
+
+	p_s_1_u32 = (__u32 *) key1;
+	p_s_2_u32 = (__u32 *) key2;
+	for (; n_key_length--; ++p_s_1_u32, ++p_s_2_u32) {
+		if (le32_to_cpu(*p_s_1_u32) < le32_to_cpu(*p_s_2_u32))
+			return -1;
+		if (le32_to_cpu(*p_s_1_u32) > le32_to_cpu(*p_s_2_u32))
+			return 1;
+	}
+	return 0;
 }
 
-inline void le_key2cpu_key (struct cpu_key * to, const struct reiserfs_key * from)
+inline void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from)
 {
-    int version;
-    to->on_disk_key.k_dir_id = le32_to_cpu (from->k_dir_id);
-    to->on_disk_key.k_objectid = le32_to_cpu (from->k_objectid);
-    
-    // find out version of the key
-    version = le_key_version (from);
-    to->version = version;
-    to->on_disk_key.k_offset = le_key_k_offset(version, from);
-    to->on_disk_key.k_type = le_key_k_type(version, from);
+	int version;
+	to->on_disk_key.k_dir_id = le32_to_cpu(from->k_dir_id);
+	to->on_disk_key.k_objectid = le32_to_cpu(from->k_objectid);
+
+	// find out version of the key
+	version = le_key_version(from);
+	to->version = version;
+	to->on_disk_key.k_offset = le_key_k_offset(version, from);
+	to->on_disk_key.k_type = le_key_k_type(version, from);
 }
 
-
-
 // this does not say which one is bigger, it only returns 1 if keys
 // are not equal, 0 otherwise
-inline int comp_le_keys (const struct reiserfs_key * k1, const struct reiserfs_key * k2)
+inline int comp_le_keys(const struct reiserfs_key *k1,
+			const struct reiserfs_key *k2)
 {
-    return memcmp (k1, k2, sizeof (struct reiserfs_key));
+	return memcmp(k1, k2, sizeof(struct reiserfs_key));
 }
 
 /**************************************************************************
@@ -184,373 +187,396 @@ inline int comp_le_keys (const struct reiserfs_key * k1, const struct reiserfs_k
  there are no possible items, and we have not found it. With each examination we
  cut the number of possible items it could be by one more than half rounded down,
  or we find it. */
-static inline	int bin_search (
-              const void * p_v_key, /* Key to search for.                   */
-	      const void * p_v_base,/* First item in the array.             */
-	      int       p_n_num,    /* Number of items in the array.        */
-	      int       p_n_width,  /* Item size in the array.
-				       searched. Lest the reader be
-				       confused, note that this is crafted
-				       as a general function, and when it
-				       is applied specifically to the array
-				       of item headers in a node, p_n_width
-				       is actually the item header size not
-				       the item size.                      */
-	      int     * p_n_pos     /* Number of the searched for element. */
-            ) {
-    int   n_rbound, n_lbound, n_j;
-
-   for ( n_j = ((n_rbound = p_n_num - 1) + (n_lbound = 0))/2; n_lbound <= n_rbound; n_j = (n_rbound + n_lbound)/2 )
-     switch( comp_keys((struct reiserfs_key *)((char * )p_v_base + n_j * p_n_width), (struct cpu_key *)p_v_key) )  {
-     case -1: n_lbound = n_j + 1; continue;
-     case  1: n_rbound = n_j - 1; continue;
-     case  0: *p_n_pos = n_j;     return ITEM_FOUND; /* Key found in the array.  */
-        }
-
-    /* bin_search did not find given key, it returns position of key,
-        that is minimal and greater than the given one. */
-    *p_n_pos = n_lbound;
-    return ITEM_NOT_FOUND;
+static inline int bin_search(const void *p_v_key,	/* Key to search for.                   */
+			     const void *p_v_base,	/* First item in the array.             */
+			     int p_n_num,	/* Number of items in the array.        */
+			     int p_n_width,	/* Item size in the array.
+						   searched. Lest the reader be
+						   confused, note that this is crafted
+						   as a general function, and when it
+						   is applied specifically to the array
+						   of item headers in a node, p_n_width
+						   is actually the item header size not
+						   the item size.                      */
+			     int *p_n_pos	/* Number of the searched for element. */
+    )
+{
+	int n_rbound, n_lbound, n_j;
+
+	for (n_j = ((n_rbound = p_n_num - 1) + (n_lbound = 0)) / 2;
+	     n_lbound <= n_rbound; n_j = (n_rbound + n_lbound) / 2)
+		switch (comp_keys
+			((struct reiserfs_key *)((char *)p_v_base +
+						 n_j * p_n_width),
+			 (struct cpu_key *)p_v_key)) {
+		case -1:
+			n_lbound = n_j + 1;
+			continue;
+		case 1:
+			n_rbound = n_j - 1;
+			continue;
+		case 0:
+			*p_n_pos = n_j;
+			return ITEM_FOUND;	/* Key found in the array.  */
+		}
+
+	/* bin_search did not find given key, it returns position of key,
+	   that is minimal and greater than the given one. */
+	*p_n_pos = n_lbound;
+	return ITEM_NOT_FOUND;
 }
 
 #ifdef CONFIG_REISERFS_CHECK
-extern struct tree_balance * cur_tb;
+extern struct tree_balance *cur_tb;
 #endif
 
-
-
 /* Minimal possible key. It is never in the tree. */
-const struct reiserfs_key  MIN_KEY = {0, 0, {{0, 0},}};
+const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
 
 /* Maximal possible key. It is never in the tree. */
-static const struct reiserfs_key  MAX_KEY = {
+static const struct reiserfs_key MAX_KEY = {
 	__constant_cpu_to_le32(0xffffffff),
 	__constant_cpu_to_le32(0xffffffff),
 	{{__constant_cpu_to_le32(0xffffffff),
-	__constant_cpu_to_le32(0xffffffff)},}
+	  __constant_cpu_to_le32(0xffffffff)},}
 };
 
-
 /* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom
    of the path, and going upwards.  We must check the path's validity at each step.  If the key is not in
    the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this
    case we return a special key, either MIN_KEY or MAX_KEY. */
-static inline	const struct  reiserfs_key * get_lkey  (
-	                const struct path         * p_s_chk_path,
-                        const struct super_block  * p_s_sb
-                      ) {
-  int                   n_position, n_path_offset = p_s_chk_path->path_length;
-  struct buffer_head  * p_s_parent;
-  
-  RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET, 
-	  "PAP-5010: invalid offset in the path");
-
-  /* While not higher in path than first element. */
-  while ( n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET ) {
-
-    RFALSE( ! buffer_uptodate(PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)),
-	    "PAP-5020: parent is not uptodate");
-
-    /* Parent at the path is not in the tree now. */
-    if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)) )
-      return &MAX_KEY;
-    /* Check whether position in the parent is correct. */
-    if ( (n_position = PATH_OFFSET_POSITION(p_s_chk_path, n_path_offset)) > B_NR_ITEMS(p_s_parent) )
-       return &MAX_KEY;
-    /* Check whether parent at the path really points to the child. */
-    if ( B_N_CHILD_NUM(p_s_parent, n_position) !=
-	 PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset + 1)->b_blocknr )
-      return &MAX_KEY;
-    /* Return delimiting key if position in the parent is not equal to zero. */
-    if ( n_position )
-      return B_N_PDELIM_KEY(p_s_parent, n_position - 1);
-  }
-  /* Return MIN_KEY if we are in the root of the buffer tree. */
-  if ( PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
-       SB_ROOT_BLOCK (p_s_sb) )
-    return &MIN_KEY;
-  return  &MAX_KEY;
+static inline const struct reiserfs_key *get_lkey(const struct path
+						  *p_s_chk_path,
+						  const struct super_block
+						  *p_s_sb)
+{
+	int n_position, n_path_offset = p_s_chk_path->path_length;
+	struct buffer_head *p_s_parent;
+
+	RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET,
+	       "PAP-5010: invalid offset in the path");
+
+	/* While not higher in path than first element. */
+	while (n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
+
+		RFALSE(!buffer_uptodate
+		       (PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)),
+		       "PAP-5020: parent is not uptodate");
+
+		/* Parent at the path is not in the tree now. */
+		if (!B_IS_IN_TREE
+		    (p_s_parent =
+		     PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)))
+			return &MAX_KEY;
+		/* Check whether position in the parent is correct. */
+		if ((n_position =
+		     PATH_OFFSET_POSITION(p_s_chk_path,
+					  n_path_offset)) >
+		    B_NR_ITEMS(p_s_parent))
+			return &MAX_KEY;
+		/* Check whether parent at the path really points to the child. */
+		if (B_N_CHILD_NUM(p_s_parent, n_position) !=
+		    PATH_OFFSET_PBUFFER(p_s_chk_path,
+					n_path_offset + 1)->b_blocknr)
+			return &MAX_KEY;
+		/* Return delimiting key if position in the parent is not equal to zero. */
+		if (n_position)
+			return B_N_PDELIM_KEY(p_s_parent, n_position - 1);
+	}
+	/* Return MIN_KEY if we are in the root of the buffer tree. */
+	if (PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->
+	    b_blocknr == SB_ROOT_BLOCK(p_s_sb))
+		return &MIN_KEY;
+	return &MAX_KEY;
 }
 
-
 /* Get delimiting key of the buffer at the path and its right neighbor. */
-inline	const struct  reiserfs_key * get_rkey  (
-	                const struct path         * p_s_chk_path,
-                        const struct super_block  * p_s_sb
-                      ) {
-  int                   n_position,
-    			n_path_offset = p_s_chk_path->path_length;
-  struct buffer_head  * p_s_parent;
-
-  RFALSE( n_path_offset < FIRST_PATH_ELEMENT_OFFSET,
-	  "PAP-5030: invalid offset in the path");
-
-  while ( n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET ) {
-
-    RFALSE( ! buffer_uptodate(PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)),
-	    "PAP-5040: parent is not uptodate");
-
-    /* Parent at the path is not in the tree now. */
-    if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)) )
-      return &MIN_KEY;
-    /* Check whether position in the parent is correct. */
-    if ( (n_position = PATH_OFFSET_POSITION(p_s_chk_path, n_path_offset)) > B_NR_ITEMS(p_s_parent) )
-      return &MIN_KEY;
-    /* Check whether parent at the path really points to the child. */
-    if ( B_N_CHILD_NUM(p_s_parent, n_position) !=
-                                        PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset + 1)->b_blocknr )
-      return &MIN_KEY;
-    /* Return delimiting key if position in the parent is not the last one. */
-    if ( n_position != B_NR_ITEMS(p_s_parent) )
-      return B_N_PDELIM_KEY(p_s_parent, n_position);
-  }
-  /* Return MAX_KEY if we are in the root of the buffer tree. */
-  if ( PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
-       SB_ROOT_BLOCK (p_s_sb) )
-    return &MAX_KEY;
-  return  &MIN_KEY;
+inline const struct reiserfs_key *get_rkey(const struct path *p_s_chk_path,
+					   const struct super_block *p_s_sb)
+{
+	int n_position, n_path_offset = p_s_chk_path->path_length;
+	struct buffer_head *p_s_parent;
+
+	RFALSE(n_path_offset < FIRST_PATH_ELEMENT_OFFSET,
+	       "PAP-5030: invalid offset in the path");
+
+	while (n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET) {
+
+		RFALSE(!buffer_uptodate
+		       (PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)),
+		       "PAP-5040: parent is not uptodate");
+
+		/* Parent at the path is not in the tree now. */
+		if (!B_IS_IN_TREE
+		    (p_s_parent =
+		     PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)))
+			return &MIN_KEY;
+		/* Check whether position in the parent is correct. */
+		if ((n_position =
+		     PATH_OFFSET_POSITION(p_s_chk_path,
+					  n_path_offset)) >
+		    B_NR_ITEMS(p_s_parent))
+			return &MIN_KEY;
+		/* Check whether parent at the path really points to the child. */
+		if (B_N_CHILD_NUM(p_s_parent, n_position) !=
+		    PATH_OFFSET_PBUFFER(p_s_chk_path,
+					n_path_offset + 1)->b_blocknr)
+			return &MIN_KEY;
+		/* Return delimiting key if position in the parent is not the last one. */
+		if (n_position != B_NR_ITEMS(p_s_parent))
+			return B_N_PDELIM_KEY(p_s_parent, n_position);
+	}
+	/* Return MAX_KEY if we are in the root of the buffer tree. */
+	if (PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->
+	    b_blocknr == SB_ROOT_BLOCK(p_s_sb))
+		return &MAX_KEY;
+	return &MIN_KEY;
 }
 
-
 /* Check whether a key is contained in the tree rooted from a buffer at a path. */
 /* This works by looking at the left and right delimiting keys for the buffer in the last path_element in
    the path.  These delimiting keys are stored at least one level above that buffer in the tree. If the
    buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in
    this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */
-static  inline  int key_in_buffer (
-                      struct path         * p_s_chk_path, /* Path which should be checked.  */
-                      const struct cpu_key      * p_s_key,      /* Key which should be checked.   */
-                      struct super_block  * p_s_sb        /* Super block pointer.           */
-		      ) {
-
-  RFALSE( ! p_s_key || p_s_chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET ||
-	  p_s_chk_path->path_length > MAX_HEIGHT,
-	  "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)",
-	  p_s_key, p_s_chk_path->path_length);
-  RFALSE( !PATH_PLAST_BUFFER(p_s_chk_path)->b_bdev,
-	  "PAP-5060: device must not be NODEV");
-
-  if ( comp_keys(get_lkey(p_s_chk_path, p_s_sb), p_s_key) == 1 )
-    /* left delimiting key is bigger, that the key we look for */
-    return 0;
-  //  if ( comp_keys(p_s_key, get_rkey(p_s_chk_path, p_s_sb)) != -1 )
-  if ( comp_keys(get_rkey(p_s_chk_path, p_s_sb), p_s_key) != 1 )
-    /* p_s_key must be less than right delimitiing key */
-    return 0;
-  return 1;
-}
-
+static inline int key_in_buffer(struct path *p_s_chk_path,	/* Path which should be checked.  */
+				const struct cpu_key *p_s_key,	/* Key which should be checked.   */
+				struct super_block *p_s_sb	/* Super block pointer.           */
+    )
+{
 
-inline void decrement_bcount(
-              struct buffer_head  * p_s_bh
-            ) { 
-  if ( p_s_bh ) {
-    if ( atomic_read (&(p_s_bh->b_count)) ) {
-      put_bh(p_s_bh) ;
-      return;
-    }
-    reiserfs_panic(NULL, "PAP-5070: decrement_bcount: trying to free free buffer %b", p_s_bh);
-  }
+	RFALSE(!p_s_key || p_s_chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET
+	       || p_s_chk_path->path_length > MAX_HEIGHT,
+	       "PAP-5050: pointer to the key(%p) is NULL or invalid path length(%d)",
+	       p_s_key, p_s_chk_path->path_length);
+	RFALSE(!PATH_PLAST_BUFFER(p_s_chk_path)->b_bdev,
+	       "PAP-5060: device must not be NODEV");
+
+	if (comp_keys(get_lkey(p_s_chk_path, p_s_sb), p_s_key) == 1)
+		/* left delimiting key is bigger, that the key we look for */
+		return 0;
+	//  if ( comp_keys(p_s_key, get_rkey(p_s_chk_path, p_s_sb)) != -1 )
+	if (comp_keys(get_rkey(p_s_chk_path, p_s_sb), p_s_key) != 1)
+		/* p_s_key must be less than right delimitiing key */
+		return 0;
+	return 1;
 }
 
+inline void decrement_bcount(struct buffer_head *p_s_bh)
+{
+	if (p_s_bh) {
+		if (atomic_read(&(p_s_bh->b_count))) {
+			put_bh(p_s_bh);
+			return;
+		}
+		reiserfs_panic(NULL,
+			       "PAP-5070: decrement_bcount: trying to free free buffer %b",
+			       p_s_bh);
+	}
+}
 
 /* Decrement b_count field of the all buffers in the path. */
-void decrement_counters_in_path (
-              struct path * p_s_search_path
-            ) {
-  int n_path_offset = p_s_search_path->path_length;
-
-  RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET ||
-	  n_path_offset > EXTENDED_MAX_HEIGHT - 1,
-	  "PAP-5080: invalid path offset of %d", n_path_offset);
-
-  while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET ) {
-    struct buffer_head * bh;
-
-    bh = PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--);
-    decrement_bcount (bh);
-  }
-  p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
-}
+void decrement_counters_in_path(struct path *p_s_search_path)
+{
+	int n_path_offset = p_s_search_path->path_length;
+
+	RFALSE(n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET ||
+	       n_path_offset > EXTENDED_MAX_HEIGHT - 1,
+	       "PAP-5080: invalid path offset of %d", n_path_offset);
 
+	while (n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) {
+		struct buffer_head *bh;
 
-int reiserfs_check_path(struct path *p) {
-  RFALSE( p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET,
-	  "path not properly relsed") ;
-  return 0 ;
+		bh = PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--);
+		decrement_bcount(bh);
+	}
+	p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
 }
 
+int reiserfs_check_path(struct path *p)
+{
+	RFALSE(p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET,
+	       "path not properly relsed");
+	return 0;
+}
 
 /* Release all buffers in the path. Restore dirty bits clean
 ** when preparing the buffer for the log
 **
 ** only called from fix_nodes()
 */
-void  pathrelse_and_restore (
-	struct super_block *s, 
-        struct path * p_s_search_path
-      ) {
-  int n_path_offset = p_s_search_path->path_length;
-
-  RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET, 
-	  "clm-4000: invalid path offset");
-  
-  while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET )  {
-    reiserfs_restore_prepared_buffer(s, PATH_OFFSET_PBUFFER(p_s_search_path, 
-                                     n_path_offset));
-    brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--));
-  }
-  p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
+void pathrelse_and_restore(struct super_block *s, struct path *p_s_search_path)
+{
+	int n_path_offset = p_s_search_path->path_length;
+
+	RFALSE(n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
+	       "clm-4000: invalid path offset");
+
+	while (n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET) {
+		reiserfs_restore_prepared_buffer(s,
+						 PATH_OFFSET_PBUFFER
+						 (p_s_search_path,
+						  n_path_offset));
+		brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--));
+	}
+	p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
 }
 
 /* Release all buffers in the path. */
-void  pathrelse (
-        struct path * p_s_search_path
-      ) {
-  int n_path_offset = p_s_search_path->path_length;
-
-  RFALSE( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
-	  "PAP-5090: invalid path offset");
-  
-  while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET )  
-    brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--));
-
-  p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
-}
+void pathrelse(struct path *p_s_search_path)
+{
+	int n_path_offset = p_s_search_path->path_length;
 
+	RFALSE(n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET,
+	       "PAP-5090: invalid path offset");
 
+	while (n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET)
+		brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--));
 
-static int is_leaf (char * buf, int blocksize, struct buffer_head * bh)
-{
-    struct block_head * blkh;
-    struct item_head * ih;
-    int used_space;
-    int prev_location;
-    int i;
-    int nr;
-
-    blkh = (struct block_head *)buf;
-    if ( blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) {
-	reiserfs_warning (NULL, "is_leaf: this should be caught earlier");
-	return 0;
-    }
+	p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET;
+}
 
-    nr = blkh_nr_item(blkh);
-    if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) {
-	/* item number is too big or too small */
-	reiserfs_warning (NULL, "is_leaf: nr_item seems wrong: %z", bh);
-	return 0;
-    }
-    ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1;
-    used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location (ih));
-    if (used_space != blocksize - blkh_free_space(blkh)) {
-	/* free space does not match to calculated amount of use space */
-	reiserfs_warning (NULL, "is_leaf: free space seems wrong: %z", bh);
-	return 0;
-    }
-
-    // FIXME: it is_leaf will hit performance too much - we may have
-    // return 1 here
-
-    /* check tables of item heads */
-    ih = (struct item_head *)(buf + BLKH_SIZE);
-    prev_location = blocksize;
-    for (i = 0; i < nr; i ++, ih ++) {
-	if ( le_ih_k_type(ih) == TYPE_ANY) {
-	    reiserfs_warning (NULL, "is_leaf: wrong item type for item %h",ih);
-	    return 0;
+static int is_leaf(char *buf, int blocksize, struct buffer_head *bh)
+{
+	struct block_head *blkh;
+	struct item_head *ih;
+	int used_space;
+	int prev_location;
+	int i;
+	int nr;
+
+	blkh = (struct block_head *)buf;
+	if (blkh_level(blkh) != DISK_LEAF_NODE_LEVEL) {
+		reiserfs_warning(NULL,
+				 "is_leaf: this should be caught earlier");
+		return 0;
 	}
-	if (ih_location (ih) >= blocksize || ih_location (ih) < IH_SIZE * nr) {
-	    reiserfs_warning (NULL, "is_leaf: item location seems wrong: %h", ih);
-	    return 0;
+
+	nr = blkh_nr_item(blkh);
+	if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) {
+		/* item number is too big or too small */
+		reiserfs_warning(NULL, "is_leaf: nr_item seems wrong: %z", bh);
+		return 0;
 	}
-	if (ih_item_len (ih) < 1 || ih_item_len (ih) > MAX_ITEM_LEN (blocksize)) {
-	    reiserfs_warning (NULL, "is_leaf: item length seems wrong: %h", ih);
-	    return 0;
+	ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1;
+	used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location(ih));
+	if (used_space != blocksize - blkh_free_space(blkh)) {
+		/* free space does not match to calculated amount of use space */
+		reiserfs_warning(NULL, "is_leaf: free space seems wrong: %z",
+				 bh);
+		return 0;
 	}
-	if (prev_location - ih_location (ih) != ih_item_len (ih)) {
-	    reiserfs_warning (NULL, "is_leaf: item location seems wrong (second one): %h", ih);
-	    return 0;
+	// FIXME: it is_leaf will hit performance too much - we may have
+	// return 1 here
+
+	/* check tables of item heads */
+	ih = (struct item_head *)(buf + BLKH_SIZE);
+	prev_location = blocksize;
+	for (i = 0; i < nr; i++, ih++) {
+		if (le_ih_k_type(ih) == TYPE_ANY) {
+			reiserfs_warning(NULL,
+					 "is_leaf: wrong item type for item %h",
+					 ih);
+			return 0;
+		}
+		if (ih_location(ih) >= blocksize
+		    || ih_location(ih) < IH_SIZE * nr) {
+			reiserfs_warning(NULL,
+					 "is_leaf: item location seems wrong: %h",
+					 ih);
+			return 0;
+		}
+		if (ih_item_len(ih) < 1
+		    || ih_item_len(ih) > MAX_ITEM_LEN(blocksize)) {
+			reiserfs_warning(NULL,
+					 "is_leaf: item length seems wrong: %h",
+					 ih);
+			return 0;
+		}
+		if (prev_location - ih_location(ih) != ih_item_len(ih)) {
+			reiserfs_warning(NULL,
+					 "is_leaf: item location seems wrong (second one): %h",
+					 ih);
+			return 0;
+		}
+		prev_location = ih_location(ih);
 	}
-	prev_location = ih_location (ih);
-    }
 
-    // one may imagine much more checks
-    return 1;
+	// one may imagine much more checks
+	return 1;
 }
 
-
 /* returns 1 if buf looks like an internal node, 0 otherwise */
-static int is_internal (char * buf, int blocksize, struct buffer_head * bh)
+static int is_internal(char *buf, int blocksize, struct buffer_head *bh)
 {
-    struct block_head * blkh;
-    int nr;
-    int used_space;
-
-    blkh = (struct block_head *)buf;
-    nr = blkh_level(blkh);
-    if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) {
-	/* this level is not possible for internal nodes */
-	reiserfs_warning (NULL, "is_internal: this should be caught earlier");
-	return 0;
-    }
-    
-    nr = blkh_nr_item(blkh);
-    if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) {
-	/* for internal which is not root we might check min number of keys */
-	reiserfs_warning (NULL, "is_internal: number of key seems wrong: %z", bh);
-	return 0;
-    }
+	struct block_head *blkh;
+	int nr;
+	int used_space;
+
+	blkh = (struct block_head *)buf;
+	nr = blkh_level(blkh);
+	if (nr <= DISK_LEAF_NODE_LEVEL || nr > MAX_HEIGHT) {
+		/* this level is not possible for internal nodes */
+		reiserfs_warning(NULL,
+				 "is_internal: this should be caught earlier");
+		return 0;
+	}
 
-    used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1);
-    if (used_space != blocksize - blkh_free_space(blkh)) {
-	reiserfs_warning (NULL, "is_internal: free space seems wrong: %z", bh);
-	return 0;
-    }
+	nr = blkh_nr_item(blkh);
+	if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) {
+		/* for internal which is not root we might check min number of keys */
+		reiserfs_warning(NULL,
+				 "is_internal: number of key seems wrong: %z",
+				 bh);
+		return 0;
+	}
 
-    // one may imagine much more checks
-    return 1;
+	used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1);
+	if (used_space != blocksize - blkh_free_space(blkh)) {
+		reiserfs_warning(NULL,
+				 "is_internal: free space seems wrong: %z", bh);
+		return 0;
+	}
+	// one may imagine much more checks
+	return 1;
 }
 
-
 // make sure that bh contains formatted node of reiserfs tree of
 // 'level'-th level
-static int is_tree_node (struct buffer_head * bh, int level)
+static int is_tree_node(struct buffer_head *bh, int level)
 {
-    if (B_LEVEL (bh) != level) {
-	reiserfs_warning (NULL, "is_tree_node: node level %d does not match to the expected one %d",
-		B_LEVEL (bh), level);
-	return 0;
-    }
-    if (level == DISK_LEAF_NODE_LEVEL)
-	return is_leaf (bh->b_data, bh->b_size, bh);
+	if (B_LEVEL(bh) != level) {
+		reiserfs_warning(NULL,
+				 "is_tree_node: node level %d does not match to the expected one %d",
+				 B_LEVEL(bh), level);
+		return 0;
+	}
+	if (level == DISK_LEAF_NODE_LEVEL)
+		return is_leaf(bh->b_data, bh->b_size, bh);
 
-    return is_internal (bh->b_data, bh->b_size, bh);
+	return is_internal(bh->b_data, bh->b_size, bh);
 }
 
-
-
 #define SEARCH_BY_KEY_READA 16
 
 /* The function is NOT SCHEDULE-SAFE! */
-static void search_by_key_reada (struct super_block * s,
-                                 struct buffer_head **bh,
-				 unsigned long *b, int num)
+static void search_by_key_reada(struct super_block *s,
+				struct buffer_head **bh,
+				unsigned long *b, int num)
 {
-    int i,j;
-  
-    for (i = 0 ; i < num ; i++) {
-	bh[i] = sb_getblk (s, b[i]);
-    }
-    for (j = 0 ; j < i ; j++) {
-	/*
-	 * note, this needs attention if we are getting rid of the BKL
-	 * you have to make sure the prepared bit isn't set on this buffer
-	 */
-	if (!buffer_uptodate(bh[j]))
-	    ll_rw_block(READA, 1, bh + j);
-    	brelse(bh[j]);
-    }
+	int i, j;
+
+	for (i = 0; i < num; i++) {
+		bh[i] = sb_getblk(s, b[i]);
+	}
+	for (j = 0; j < i; j++) {
+		/*
+		 * note, this needs attention if we are getting rid of the BKL
+		 * you have to make sure the prepared bit isn't set on this buffer
+		 */
+		if (!buffer_uptodate(bh[j]))
+			ll_rw_block(READA, 1, bh + j);
+		brelse(bh[j]);
+	}
 }
 
 /**************************************************************************
@@ -576,194 +602,200 @@ static void search_by_key_reada (struct super_block * s,
    correctness of the top of the path but need not be checked for the
    correctness of the bottom of the path */
 /* The function is NOT SCHEDULE-SAFE! */
-int search_by_key (struct super_block * p_s_sb,
-		   const struct cpu_key * p_s_key, /* Key to search. */
-		   struct path * p_s_search_path, /* This structure was
-						     allocated and initialized
-						     by the calling
-						     function. It is filled up
-						     by this function.  */
-		   int n_stop_level /* How far down the tree to search. To
-                                       stop at leaf level - set to
-                                       DISK_LEAF_NODE_LEVEL */
-    ) {
-    int  n_block_number;
-    int  expected_level;
-    struct buffer_head  *       p_s_bh;
-    struct path_element *       p_s_last_element;
-    int				n_node_level, n_retval;
-    int 			right_neighbor_of_leaf_node;
-    int				fs_gen;
-    struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
-    unsigned long      reada_blocks[SEARCH_BY_KEY_READA];
-    int reada_count = 0;
+int search_by_key(struct super_block *p_s_sb, const struct cpu_key *p_s_key,	/* Key to search. */
+		  struct path *p_s_search_path,	/* This structure was
+						   allocated and initialized
+						   by the calling
+						   function. It is filled up
+						   by this function.  */
+		  int n_stop_level	/* How far down the tree to search. To
+					   stop at leaf level - set to
+					   DISK_LEAF_NODE_LEVEL */
+    )
+{
+	int n_block_number;
+	int expected_level;
+	struct buffer_head *p_s_bh;
+	struct path_element *p_s_last_element;
+	int n_node_level, n_retval;
+	int right_neighbor_of_leaf_node;
+	int fs_gen;
+	struct buffer_head *reada_bh[SEARCH_BY_KEY_READA];
+	unsigned long reada_blocks[SEARCH_BY_KEY_READA];
+	int reada_count = 0;
 
 #ifdef CONFIG_REISERFS_CHECK
-    int n_repeat_counter = 0;
+	int n_repeat_counter = 0;
 #endif
-    
-    PROC_INFO_INC( p_s_sb, search_by_key );
-    
-    /* As we add each node to a path we increase its count.  This means that
-       we must be careful to release all nodes in a path before we either
-       discard the path struct or re-use the path struct, as we do here. */
 
-    decrement_counters_in_path(p_s_search_path);
+	PROC_INFO_INC(p_s_sb, search_by_key);
+
+	/* As we add each node to a path we increase its count.  This means that
+	   we must be careful to release all nodes in a path before we either
+	   discard the path struct or re-use the path struct, as we do here. */
 
-    right_neighbor_of_leaf_node = 0;
+	decrement_counters_in_path(p_s_search_path);
 
-    /* With each iteration of this loop we search through the items in the
-       current node, and calculate the next current node(next path element)
-       for the next iteration of this loop.. */
-    n_block_number = SB_ROOT_BLOCK (p_s_sb);
-    expected_level = -1;
-    while ( 1 ) {
+	right_neighbor_of_leaf_node = 0;
+
+	/* With each iteration of this loop we search through the items in the
+	   current node, and calculate the next current node (next path element)
+	   for the next iteration of this loop. */
+	n_block_number = SB_ROOT_BLOCK(p_s_sb);
+	expected_level = -1;
+	while (1) {
 
 #ifdef CONFIG_REISERFS_CHECK
-	if ( !(++n_repeat_counter % 50000) )
-	    reiserfs_warning (p_s_sb, "PAP-5100: search_by_key: %s:"
-			      "there were %d iterations of while loop "
-			      "looking for key %K",
-			      current->comm, n_repeat_counter, p_s_key);
+		if (!(++n_repeat_counter % 50000))
+			reiserfs_warning(p_s_sb, "PAP-5100: search_by_key: %s:"
+					 "there were %d iterations of while loop "
+					 "looking for key %K",
+					 current->comm, n_repeat_counter,
+					 p_s_key);
 #endif
 
-	/* prep path to have another element added to it. */
-	p_s_last_element = PATH_OFFSET_PELEMENT(p_s_search_path, ++p_s_search_path->path_length);
-	fs_gen = get_generation (p_s_sb);
-
-	/* Read the next tree node, and set the last element in the path to
-           have a pointer to it. */
-	if ((p_s_bh = p_s_last_element->pe_buffer =
-	     sb_getblk(p_s_sb, n_block_number)) ) {
-	    if (!buffer_uptodate(p_s_bh) && reada_count > 1) {
-		search_by_key_reada (p_s_sb, reada_bh,
-		                     reada_blocks, reada_count);
-	    }
-	    ll_rw_block(READ, 1, &p_s_bh);
-	    wait_on_buffer(p_s_bh);
-	    if (!buffer_uptodate(p_s_bh))
-	        goto io_error;
-	} else {
-io_error:
-	    p_s_search_path->path_length --;
-	    pathrelse(p_s_search_path);
-	    return IO_ERROR;
-	}
-	reada_count = 0;
-	if (expected_level == -1)
-		expected_level = SB_TREE_HEIGHT (p_s_sb);
-	expected_level --;
-
-	/* It is possible that schedule occurred. We must check whether the key
-	   to search is still in the tree rooted from the current buffer. If
-	   not then repeat search from the root. */
-	if ( fs_changed (fs_gen, p_s_sb) && 
-	    (!B_IS_IN_TREE (p_s_bh) ||
-	     B_LEVEL(p_s_bh) != expected_level ||
-	     !key_in_buffer(p_s_search_path, p_s_key, p_s_sb))) {
-	    PROC_INFO_INC( p_s_sb, search_by_key_fs_changed );
-	    PROC_INFO_INC( p_s_sb, search_by_key_restarted );
-	    PROC_INFO_INC( p_s_sb, sbk_restarted[ expected_level - 1 ] );
-	    decrement_counters_in_path(p_s_search_path);
-	    
-	    /* Get the root block number so that we can repeat the search
-	       starting from the root. */
-	    n_block_number = SB_ROOT_BLOCK (p_s_sb);
-	    expected_level = -1;
-	    right_neighbor_of_leaf_node = 0;
-	    
-	    /* repeat search from the root */
-	    continue;
-	}
+		/* prep path to have another element added to it. */
+		p_s_last_element =
+		    PATH_OFFSET_PELEMENT(p_s_search_path,
+					 ++p_s_search_path->path_length);
+		fs_gen = get_generation(p_s_sb);
+
+		/* Read the next tree node, and set the last element in the path to
+		   have a pointer to it. */
+		if ((p_s_bh = p_s_last_element->pe_buffer =
+		     sb_getblk(p_s_sb, n_block_number))) {
+			if (!buffer_uptodate(p_s_bh) && reada_count > 1) {
+				search_by_key_reada(p_s_sb, reada_bh,
+						    reada_blocks, reada_count);
+			}
+			ll_rw_block(READ, 1, &p_s_bh);
+			wait_on_buffer(p_s_bh);
+			if (!buffer_uptodate(p_s_bh))
+				goto io_error;
+		} else {
+		      io_error:
+			p_s_search_path->path_length--;
+			pathrelse(p_s_search_path);
+			return IO_ERROR;
+		}
+		reada_count = 0;
+		if (expected_level == -1)
+			expected_level = SB_TREE_HEIGHT(p_s_sb);
+		expected_level--;
+
+		/* It is possible that a schedule occurred. We must check whether the
+		   key being searched for is still in the tree rooted at the current
+		   buffer. If not, repeat the search from the root. */
+		if (fs_changed(fs_gen, p_s_sb) &&
+		    (!B_IS_IN_TREE(p_s_bh) ||
+		     B_LEVEL(p_s_bh) != expected_level ||
+		     !key_in_buffer(p_s_search_path, p_s_key, p_s_sb))) {
+			PROC_INFO_INC(p_s_sb, search_by_key_fs_changed);
+			PROC_INFO_INC(p_s_sb, search_by_key_restarted);
+			PROC_INFO_INC(p_s_sb,
+				      sbk_restarted[expected_level - 1]);
+			decrement_counters_in_path(p_s_search_path);
+
+			/* Get the root block number so that we can repeat the search
+			   starting from the root. */
+			n_block_number = SB_ROOT_BLOCK(p_s_sb);
+			expected_level = -1;
+			right_neighbor_of_leaf_node = 0;
+
+			/* repeat search from the root */
+			continue;
+		}
 
-        /* only check that the key is in the buffer if p_s_key is not
-           equal to the MAX_KEY. Latter case is only possible in
-           "finish_unfinished()" processing during mount. */
-        RFALSE( comp_keys( &MAX_KEY, p_s_key ) &&
-                ! key_in_buffer(p_s_search_path, p_s_key, p_s_sb),
-		"PAP-5130: key is not in the buffer");
+		/* only check that the key is in the buffer if p_s_key is not
+		   equal to the MAX_KEY. Latter case is only possible in
+		   "finish_unfinished()" processing during mount. */
+		RFALSE(comp_keys(&MAX_KEY, p_s_key) &&
+		       !key_in_buffer(p_s_search_path, p_s_key, p_s_sb),
+		       "PAP-5130: key is not in the buffer");
 #ifdef CONFIG_REISERFS_CHECK
-	if ( cur_tb ) {
-	    print_cur_tb ("5140");
-	    reiserfs_panic(p_s_sb, "PAP-5140: search_by_key: schedule occurred in do_balance!");
-	}
+		if (cur_tb) {
+			print_cur_tb("5140");
+			reiserfs_panic(p_s_sb,
+				       "PAP-5140: search_by_key: schedule occurred in do_balance!");
+		}
 #endif
 
-	// make sure, that the node contents look like a node of
-	// certain level
-	if (!is_tree_node (p_s_bh, expected_level)) {
-	    reiserfs_warning (p_s_sb, "vs-5150: search_by_key: "
-			      "invalid format found in block %ld. Fsck?",
-			      p_s_bh->b_blocknr);
-	    pathrelse (p_s_search_path);
-	    return IO_ERROR;
-	}
-	
-	/* ok, we have acquired next formatted node in the tree */
-	n_node_level = B_LEVEL (p_s_bh);
-
-	PROC_INFO_BH_STAT( p_s_sb, p_s_bh, n_node_level - 1 );
-
-	RFALSE( n_node_level < n_stop_level,
-		"vs-5152: tree level (%d) is less than stop level (%d)",
-		n_node_level, n_stop_level);
-
-	n_retval = bin_search( p_s_key, B_N_PITEM_HEAD(p_s_bh, 0),
-                B_NR_ITEMS(p_s_bh),
-                ( n_node_level == DISK_LEAF_NODE_LEVEL ) ? IH_SIZE : KEY_SIZE,
-                &(p_s_last_element->pe_position));
-	if (n_node_level == n_stop_level) {
-	    return n_retval;
-	}
+		// make sure that the node contents look like a node of
+		// the expected level
+		if (!is_tree_node(p_s_bh, expected_level)) {
+			reiserfs_warning(p_s_sb, "vs-5150: search_by_key: "
+					 "invalid format found in block %ld. Fsck?",
+					 p_s_bh->b_blocknr);
+			pathrelse(p_s_search_path);
+			return IO_ERROR;
+		}
 
-	/* we are not in the stop level */
-	if (n_retval == ITEM_FOUND)
-	    /* item has been found, so we choose the pointer which is to the right of the found one */
-	    p_s_last_element->pe_position++;
+		/* ok, we have acquired next formatted node in the tree */
+		n_node_level = B_LEVEL(p_s_bh);
 
-	/* if item was not found we choose the position which is to
-	   the left of the found item. This requires no code,
-	   bin_search did it already.*/
+		PROC_INFO_BH_STAT(p_s_sb, p_s_bh, n_node_level - 1);
 
-	/* So we have chosen a position in the current node which is
-	   an internal node.  Now we calculate child block number by
-	   position in the node. */
-	n_block_number = B_N_CHILD_NUM(p_s_bh, p_s_last_element->pe_position);
+		RFALSE(n_node_level < n_stop_level,
+		       "vs-5152: tree level (%d) is less than stop level (%d)",
+		       n_node_level, n_stop_level);
 
-	/* if we are going to read leaf nodes, try for read ahead as well */
-	if ((p_s_search_path->reada & PATH_READA) &&
-	    n_node_level == DISK_LEAF_NODE_LEVEL + 1)
-	{
-	    int pos = p_s_last_element->pe_position;
-	    int limit = B_NR_ITEMS(p_s_bh);
-	    struct reiserfs_key *le_key;
-
-	    if (p_s_search_path->reada & PATH_READA_BACK)
-		limit = 0;
-	    while(reada_count < SEARCH_BY_KEY_READA) {
-		if (pos == limit)
-		    break;
-	        reada_blocks[reada_count++] = B_N_CHILD_NUM(p_s_bh, pos);
-		if (p_s_search_path->reada & PATH_READA_BACK)
-		    pos--;
-		else
-		    pos++;
+		n_retval = bin_search(p_s_key, B_N_PITEM_HEAD(p_s_bh, 0),
+				      B_NR_ITEMS(p_s_bh),
+				      (n_node_level ==
+				       DISK_LEAF_NODE_LEVEL) ? IH_SIZE :
+				      KEY_SIZE,
+				      &(p_s_last_element->pe_position));
+		if (n_node_level == n_stop_level) {
+			return n_retval;
+		}
 
-		/*
-		 * check to make sure we're in the same object
-		 */
-		le_key = B_N_PDELIM_KEY(p_s_bh, pos);
-		if (le32_to_cpu(le_key->k_objectid) !=
-		    p_s_key->on_disk_key.k_objectid)
-		{
-		    break;
+		/* we are not in the stop level */
+		if (n_retval == ITEM_FOUND)
+			/* item has been found, so we choose the pointer which is to the right of the found one */
+			p_s_last_element->pe_position++;
+
+		/* if item was not found we choose the position which is to
+		   the left of the found item. This requires no code,
+		   bin_search did it already. */
+
+		/* So we have chosen a position in the current node which is
+		   an internal node.  Now we calculate child block number by
+		   position in the node. */
+		n_block_number =
+		    B_N_CHILD_NUM(p_s_bh, p_s_last_element->pe_position);
+
+		/* if we are going to read leaf nodes, try for read ahead as well */
+		if ((p_s_search_path->reada & PATH_READA) &&
+		    n_node_level == DISK_LEAF_NODE_LEVEL + 1) {
+			int pos = p_s_last_element->pe_position;
+			int limit = B_NR_ITEMS(p_s_bh);
+			struct reiserfs_key *le_key;
+
+			if (p_s_search_path->reada & PATH_READA_BACK)
+				limit = 0;
+			while (reada_count < SEARCH_BY_KEY_READA) {
+				if (pos == limit)
+					break;
+				reada_blocks[reada_count++] =
+				    B_N_CHILD_NUM(p_s_bh, pos);
+				if (p_s_search_path->reada & PATH_READA_BACK)
+					pos--;
+				else
+					pos++;
+
+				/*
+				 * check to make sure we're in the same object
+				 */
+				le_key = B_N_PDELIM_KEY(p_s_bh, pos);
+				if (le32_to_cpu(le_key->k_objectid) !=
+				    p_s_key->on_disk_key.k_objectid) {
+					break;
+				}
+			}
 		}
-	    }
-        }
-    }
+	}
 }
 
-
 /* Form the path to an item and position in this item which contains
    file byte defined by p_s_key. If there is no such item
    corresponding to the key, we point the path to the item with
@@ -780,94 +812,97 @@ io_error:
    units of directory entries.  */
 
 /* The function is NOT SCHEDULE-SAFE! */
-int search_for_position_by_key (struct super_block  * p_s_sb,         /* Pointer to the super block.          */
-				const struct cpu_key  * p_cpu_key,      /* Key to search (cpu variable)         */
-				struct path         * p_s_search_path /* Filled up by this function.          */
-    ) {
-    struct item_head    * p_le_ih; /* pointer to on-disk structure */
-    int                   n_blk_size;
-    loff_t item_offset, offset;
-    struct reiserfs_dir_entry de;
-    int retval;
-
-    /* If searching for directory entry. */
-    if ( is_direntry_cpu_key (p_cpu_key) )
-	return  search_by_entry_key (p_s_sb, p_cpu_key, p_s_search_path, &de);
-
-    /* If not searching for directory entry. */
-    
-    /* If item is found. */
-    retval = search_item (p_s_sb, p_cpu_key, p_s_search_path);
-    if (retval == IO_ERROR)
-	return retval;
-    if ( retval == ITEM_FOUND )  {
-
-	RFALSE( ! ih_item_len(
-                B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path),
-			       PATH_LAST_POSITION(p_s_search_path))),
-	        "PAP-5165: item length equals zero");
+int search_for_position_by_key(struct super_block *p_s_sb,	/* Pointer to the super block.          */
+			       const struct cpu_key *p_cpu_key,	/* Key to search (cpu variable)         */
+			       struct path *p_s_search_path	/* Filled up by this function.          */
+    )
+{
+	struct item_head *p_le_ih;	/* pointer to on-disk structure */
+	int n_blk_size;
+	loff_t item_offset, offset;
+	struct reiserfs_dir_entry de;
+	int retval;
+
+	/* If searching for directory entry. */
+	if (is_direntry_cpu_key(p_cpu_key))
+		return search_by_entry_key(p_s_sb, p_cpu_key, p_s_search_path,
+					   &de);
+
+	/* If not searching for directory entry. */
+
+	/* If item is found. */
+	retval = search_item(p_s_sb, p_cpu_key, p_s_search_path);
+	if (retval == IO_ERROR)
+		return retval;
+	if (retval == ITEM_FOUND) {
 
-	pos_in_item(p_s_search_path) = 0;
-	return POSITION_FOUND;
-    }
+		RFALSE(!ih_item_len
+		       (B_N_PITEM_HEAD
+			(PATH_PLAST_BUFFER(p_s_search_path),
+			 PATH_LAST_POSITION(p_s_search_path))),
+		       "PAP-5165: item length equals zero");
 
-    RFALSE( ! PATH_LAST_POSITION(p_s_search_path),
-	    "PAP-5170: position equals zero");
+		pos_in_item(p_s_search_path) = 0;
+		return POSITION_FOUND;
+	}
 
-    /* Item is not found. Set path to the previous item. */
-    p_le_ih = B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path), --PATH_LAST_POSITION(p_s_search_path));
-    n_blk_size = p_s_sb->s_blocksize;
+	RFALSE(!PATH_LAST_POSITION(p_s_search_path),
+	       "PAP-5170: position equals zero");
 
-    if (comp_short_keys (&(p_le_ih->ih_key), p_cpu_key)) {
-	return FILE_NOT_FOUND;
-    }
+	/* Item is not found. Set path to the previous item. */
+	p_le_ih =
+	    B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path),
+			   --PATH_LAST_POSITION(p_s_search_path));
+	n_blk_size = p_s_sb->s_blocksize;
 
-    // FIXME: quite ugly this far
+	if (comp_short_keys(&(p_le_ih->ih_key), p_cpu_key)) {
+		return FILE_NOT_FOUND;
+	}
+	// FIXME: quite ugly this far
 
-    item_offset = le_ih_k_offset (p_le_ih);
-    offset = cpu_key_k_offset (p_cpu_key);
+	item_offset = le_ih_k_offset(p_le_ih);
+	offset = cpu_key_k_offset(p_cpu_key);
 
-    /* Needed byte is contained in the item pointed to by the path.*/
-    if (item_offset <= offset &&
-	item_offset + op_bytes_number (p_le_ih, n_blk_size) > offset) {
-	pos_in_item (p_s_search_path) = offset - item_offset;
-	if ( is_indirect_le_ih(p_le_ih) ) {
-	    pos_in_item (p_s_search_path) /= n_blk_size;
+	/* Needed byte is contained in the item pointed to by the path. */
+	if (item_offset <= offset &&
+	    item_offset + op_bytes_number(p_le_ih, n_blk_size) > offset) {
+		pos_in_item(p_s_search_path) = offset - item_offset;
+		if (is_indirect_le_ih(p_le_ih)) {
+			pos_in_item(p_s_search_path) /= n_blk_size;
+		}
+		return POSITION_FOUND;
 	}
-	return POSITION_FOUND;
-    }
-
-    /* Needed byte is not contained in the item pointed to by the
-     path. Set pos_in_item out of the item. */
-    if ( is_indirect_le_ih (p_le_ih) )
-	pos_in_item (p_s_search_path) = ih_item_len(p_le_ih) / UNFM_P_SIZE;
-    else
-        pos_in_item (p_s_search_path) = ih_item_len( p_le_ih );
-  
-    return POSITION_NOT_FOUND;
-}
 
+	/* Needed byte is not contained in the item pointed to by the
+	   path. Set pos_in_item out of the item. */
+	if (is_indirect_le_ih(p_le_ih))
+		pos_in_item(p_s_search_path) =
+		    ih_item_len(p_le_ih) / UNFM_P_SIZE;
+	else
+		pos_in_item(p_s_search_path) = ih_item_len(p_le_ih);
+
+	return POSITION_NOT_FOUND;
+}
 
 /* Compare given item and item pointed to by the path. */
-int comp_items (const struct item_head * stored_ih, const struct path * p_s_path)
+int comp_items(const struct item_head *stored_ih, const struct path *p_s_path)
 {
-    struct buffer_head  * p_s_bh;
-    struct item_head    * ih;
+	struct buffer_head *p_s_bh;
+	struct item_head *ih;
 
-    /* Last buffer at the path is not in the tree. */
-    if ( ! B_IS_IN_TREE(p_s_bh = PATH_PLAST_BUFFER(p_s_path)) )
-	return 1;
+	/* Last buffer at the path is not in the tree. */
+	if (!B_IS_IN_TREE(p_s_bh = PATH_PLAST_BUFFER(p_s_path)))
+		return 1;
 
-    /* Last path position is invalid. */
-    if ( PATH_LAST_POSITION(p_s_path) >= B_NR_ITEMS(p_s_bh) )
-	return 1;
+	/* Last path position is invalid. */
+	if (PATH_LAST_POSITION(p_s_path) >= B_NR_ITEMS(p_s_bh))
+		return 1;
 
-    /* we need only to know, whether it is the same item */
-    ih = get_ih (p_s_path);
-    return memcmp (stored_ih, ih, IH_SIZE);
+	/* we need only to know, whether it is the same item */
+	ih = get_ih(p_s_path);
+	return memcmp(stored_ih, ih, IH_SIZE);
 }
 
-
 /* unformatted nodes are not logged anymore, ever.  This is safe
 ** now
 */
@@ -876,461 +911,466 @@ int comp_items (const struct item_head * stored_ih, const struct path * p_s_path
 // block can not be forgotten as it is in I/O or held by someone
 #define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh)))
 
-
-
 // prepare for delete or cut of direct item
-static inline int prepare_for_direct_item (struct path * path,
-					   struct item_head * le_ih,
-					   struct inode * inode,
-					   loff_t new_file_length,
-					   int * cut_size)
+static inline int prepare_for_direct_item(struct path *path,
+					  struct item_head *le_ih,
+					  struct inode *inode,
+					  loff_t new_file_length, int *cut_size)
 {
-    loff_t round_len;
-
-
-    if ( new_file_length == max_reiserfs_offset (inode) ) {
-	/* item has to be deleted */
-	*cut_size = -(IH_SIZE + ih_item_len(le_ih));
-	return M_DELETE;
-    }
-	
-    // new file gets truncated
-    if (get_inode_item_key_version (inode) == KEY_FORMAT_3_6) {
-	// 
-	round_len = ROUND_UP (new_file_length); 
-	/* this was n_new_file_length < le_ih ... */
-	if ( round_len < le_ih_k_offset (le_ih) )  {
-	    *cut_size = -(IH_SIZE + ih_item_len(le_ih));
-	    return M_DELETE; /* Delete this item. */
+	loff_t round_len;
+
+	if (new_file_length == max_reiserfs_offset(inode)) {
+		/* item has to be deleted */
+		*cut_size = -(IH_SIZE + ih_item_len(le_ih));
+		return M_DELETE;
+	}
+	// new file gets truncated
+	if (get_inode_item_key_version(inode) == KEY_FORMAT_3_6) {
+		// 
+		round_len = ROUND_UP(new_file_length);
+		/* this was n_new_file_length < le_ih ... */
+		if (round_len < le_ih_k_offset(le_ih)) {
+			*cut_size = -(IH_SIZE + ih_item_len(le_ih));
+			return M_DELETE;	/* Delete this item. */
+		}
+		/* Calculate first position and size for cutting from item. */
+		pos_in_item(path) = round_len - (le_ih_k_offset(le_ih) - 1);
+		*cut_size = -(ih_item_len(le_ih) - pos_in_item(path));
+
+		return M_CUT;	/* Cut from this item. */
+	}
+
+	// old file: items may have any length
+
+	if (new_file_length < le_ih_k_offset(le_ih)) {
+		*cut_size = -(IH_SIZE + ih_item_len(le_ih));
+		return M_DELETE;	/* Delete this item. */
 	}
 	/* Calculate first position and size for cutting from item. */
-	pos_in_item (path) = round_len - (le_ih_k_offset (le_ih) - 1);
-	*cut_size = -(ih_item_len(le_ih) - pos_in_item(path));
-	
-	return M_CUT; /* Cut from this item. */
-    }
-
-
-    // old file: items may have any length
-
-    if ( new_file_length < le_ih_k_offset (le_ih) )  {
-	*cut_size = -(IH_SIZE + ih_item_len(le_ih));
-	return M_DELETE; /* Delete this item. */
-    }
-    /* Calculate first position and size for cutting from item. */
-    *cut_size = -(ih_item_len(le_ih) -
-		      (pos_in_item (path) = new_file_length + 1 - le_ih_k_offset (le_ih)));
-    return M_CUT; /* Cut from this item. */
+	*cut_size = -(ih_item_len(le_ih) -
+		      (pos_in_item(path) =
+		       new_file_length + 1 - le_ih_k_offset(le_ih)));
+	return M_CUT;		/* Cut from this item. */
 }
 
-
-static inline int prepare_for_direntry_item (struct path * path,
-					     struct item_head * le_ih,
-					     struct inode * inode,
-					     loff_t new_file_length,
-					     int * cut_size)
+static inline int prepare_for_direntry_item(struct path *path,
+					    struct item_head *le_ih,
+					    struct inode *inode,
+					    loff_t new_file_length,
+					    int *cut_size)
 {
-    if (le_ih_k_offset (le_ih) == DOT_OFFSET && 
-	new_file_length == max_reiserfs_offset (inode)) {
-	RFALSE( ih_entry_count (le_ih) != 2,
-	        "PAP-5220: incorrect empty directory item (%h)", le_ih);
-	*cut_size = -(IH_SIZE + ih_item_len(le_ih));
-	return M_DELETE; /* Delete the directory item containing "." and ".." entry. */
-    }
-    
-    if ( ih_entry_count (le_ih) == 1 )  {
-	/* Delete the directory item such as there is one record only
-	   in this item*/
-	*cut_size = -(IH_SIZE + ih_item_len(le_ih));
-	return M_DELETE;
-    }
-    
-    /* Cut one record from the directory item. */
-    *cut_size = -(DEH_SIZE + entry_length (get_last_bh (path), le_ih, pos_in_item (path)));
-    return M_CUT; 
-}
+	if (le_ih_k_offset(le_ih) == DOT_OFFSET &&
+	    new_file_length == max_reiserfs_offset(inode)) {
+		RFALSE(ih_entry_count(le_ih) != 2,
+		       "PAP-5220: incorrect empty directory item (%h)", le_ih);
+		*cut_size = -(IH_SIZE + ih_item_len(le_ih));
+		return M_DELETE;	/* Delete the directory item containing "." and ".." entry. */
+	}
 
+	if (ih_entry_count(le_ih) == 1) {
+		/* Delete the whole directory item, since there is only one
+		   record in this item */
+		*cut_size = -(IH_SIZE + ih_item_len(le_ih));
+		return M_DELETE;
+	}
+
+	/* Cut one record from the directory item. */
+	*cut_size =
+	    -(DEH_SIZE +
+	      entry_length(get_last_bh(path), le_ih, pos_in_item(path)));
+	return M_CUT;
+}
 
 /*  If the path points to a directory or direct item, calculate mode and the size cut, for balance.
     If the path points to an indirect item, remove some number of its unformatted nodes.
     In case of file truncate calculate whether this item must be deleted/truncated or last
     unformatted node of this item will be converted to a direct item.
     This function returns a determination of what balance mode the calling function should employ. */
-static char  prepare_for_delete_or_cut(
-				       struct reiserfs_transaction_handle *th, 
-				       struct inode * inode,
-				       struct path         * p_s_path,
-				       const struct cpu_key      * p_s_item_key,
-				       int                 * p_n_removed,      /* Number of unformatted nodes which were removed
-										  from end of the file. */
-				       int                 * p_n_cut_size,
-				       unsigned long long    n_new_file_length /* MAX_KEY_OFFSET in case of delete. */
-    ) {
-    struct super_block  * p_s_sb = inode->i_sb;
-    struct item_head    * p_le_ih = PATH_PITEM_HEAD(p_s_path);
-    struct buffer_head  * p_s_bh = PATH_PLAST_BUFFER(p_s_path);
-
-    BUG_ON (!th->t_trans_id);
-
-    /* Stat_data item. */
-    if ( is_statdata_le_ih (p_le_ih) ) {
-
-	RFALSE( n_new_file_length != max_reiserfs_offset (inode),
-		"PAP-5210: mode must be M_DELETE");
-
-	*p_n_cut_size = -(IH_SIZE + ih_item_len(p_le_ih));
-	return M_DELETE;
-    }
-
-
-    /* Directory item. */
-    if ( is_direntry_le_ih (p_le_ih) )
-	return prepare_for_direntry_item (p_s_path, p_le_ih, inode, n_new_file_length, p_n_cut_size);
-
-    /* Direct item. */
-    if ( is_direct_le_ih (p_le_ih) )
-	return prepare_for_direct_item (p_s_path, p_le_ih, inode, n_new_file_length, p_n_cut_size);
-
-
-    /* Case of an indirect item. */
-    {
-	int                   n_unfm_number,    /* Number of the item unformatted nodes. */
-	    n_counter,
-	    n_blk_size;
-	__le32               * p_n_unfm_pointer; /* Pointer to the unformatted node number. */
-	__u32 tmp;
-	struct item_head      s_ih;           /* Item header. */
-	char                  c_mode;           /* Returned mode of the balance. */
-	int need_research;
+static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, struct inode *inode, struct path *p_s_path, const struct cpu_key *p_s_item_key, int *p_n_removed,	/* Number of unformatted nodes which were removed
+																						   from end of the file. */
+				      int *p_n_cut_size, unsigned long long n_new_file_length	/* MAX_KEY_OFFSET in case of delete. */
+    )
+{
+	struct super_block *p_s_sb = inode->i_sb;
+	struct item_head *p_le_ih = PATH_PITEM_HEAD(p_s_path);
+	struct buffer_head *p_s_bh = PATH_PLAST_BUFFER(p_s_path);
 
+	BUG_ON(!th->t_trans_id);
 
-	n_blk_size = p_s_sb->s_blocksize;
+	/* Stat_data item. */
+	if (is_statdata_le_ih(p_le_ih)) {
 
-	/* Search for the needed object indirect item until there are no unformatted nodes to be removed. */
-	do  {
-	    need_research = 0;
-            p_s_bh = PATH_PLAST_BUFFER(p_s_path);
-	    /* Copy indirect item header to a temp variable. */
-	    copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
-	    /* Calculate number of unformatted nodes in this item. */
-	    n_unfm_number = I_UNFM_NUM(&s_ih);
-
-	    RFALSE( ! is_indirect_le_ih(&s_ih) || ! n_unfm_number ||
-		    pos_in_item (p_s_path) + 1 !=  n_unfm_number,
-		    "PAP-5240: invalid item %h "
-		    "n_unfm_number = %d *p_n_pos_in_item = %d", 
-		    &s_ih, n_unfm_number, pos_in_item (p_s_path));
-
-	    /* Calculate balance mode and position in the item to remove unformatted nodes. */
-	    if ( n_new_file_length == max_reiserfs_offset (inode) ) {/* Case of delete. */
-		pos_in_item (p_s_path) = 0;
-		*p_n_cut_size = -(IH_SIZE + ih_item_len(&s_ih));
-		c_mode = M_DELETE;
-	    }
-	    else  { /* Case of truncate. */
-		if ( n_new_file_length < le_ih_k_offset (&s_ih) )  {
-		    pos_in_item (p_s_path) = 0;
-		    *p_n_cut_size = -(IH_SIZE + ih_item_len(&s_ih));
-		    c_mode = M_DELETE; /* Delete this item. */
-		}
-		else  {
-		    /* indirect item must be truncated starting from *p_n_pos_in_item-th position */
-		    pos_in_item (p_s_path) = (n_new_file_length + n_blk_size - le_ih_k_offset (&s_ih) ) >> p_s_sb->s_blocksize_bits;
-
-		    RFALSE( pos_in_item (p_s_path) > n_unfm_number,
-			    "PAP-5250: invalid position in the item");
-
-		    /* Either convert last unformatted node of indirect item to direct item or increase
-		       its free space.  */
-		    if ( pos_in_item (p_s_path) == n_unfm_number )  {
-			*p_n_cut_size = 0; /* Nothing to cut. */
-			return M_CONVERT; /* Maybe convert last unformatted node to the direct item. */
-		    }
-		    /* Calculate size to cut. */
-		    *p_n_cut_size = -(ih_item_len(&s_ih) - pos_in_item(p_s_path) * UNFM_P_SIZE);
-
-		    c_mode = M_CUT;     /* Cut from this indirect item. */
-		}
-	    }
-
-	    RFALSE( n_unfm_number <= pos_in_item (p_s_path),
-		    "PAP-5260: invalid position in the indirect item");
-
-	    /* pointers to be cut */
-	    n_unfm_number -= pos_in_item (p_s_path);
-	    /* Set pointer to the last unformatted node pointer that is to be cut. */
-	    p_n_unfm_pointer = (__le32 *)B_I_PITEM(p_s_bh, &s_ih) + I_UNFM_NUM(&s_ih) - 1 - *p_n_removed;
-
-
-	    /* We go through the unformatted nodes pointers of the indirect
-	       item and look for the unformatted nodes in the cache. If we
-	       found some of them we free it, zero corresponding indirect item
-	       entry and log buffer containing that indirect item. For this we
-	       need to prepare last path element for logging. If some
-	       unformatted node has b_count > 1 we must not free this
-	       unformatted node since it is in use. */
-	    reiserfs_prepare_for_journal(p_s_sb, p_s_bh, 1);
-	    // note: path could be changed, first line in for loop takes care
-	    // of it
-
-	    for (n_counter = *p_n_removed;
-		 n_counter < n_unfm_number; n_counter++, p_n_unfm_pointer-- ) {
-
-		cond_resched();
-		if (item_moved (&s_ih, p_s_path)) {
-		    need_research = 1 ;
-		    break;
-		}
-		RFALSE( p_n_unfm_pointer < (__le32 *)B_I_PITEM(p_s_bh, &s_ih) ||
-			p_n_unfm_pointer > (__le32 *)B_I_PITEM(p_s_bh, &s_ih) + I_UNFM_NUM(&s_ih) - 1,
-			"vs-5265: pointer out of range");
+		RFALSE(n_new_file_length != max_reiserfs_offset(inode),
+		       "PAP-5210: mode must be M_DELETE");
 
-		/* Hole, nothing to remove. */
-		if ( ! get_block_num(p_n_unfm_pointer,0) )  {
-			(*p_n_removed)++;
-			continue;
-		}
+		*p_n_cut_size = -(IH_SIZE + ih_item_len(p_le_ih));
+		return M_DELETE;
+	}
 
-		(*p_n_removed)++;
+	/* Directory item. */
+	if (is_direntry_le_ih(p_le_ih))
+		return prepare_for_direntry_item(p_s_path, p_le_ih, inode,
+						 n_new_file_length,
+						 p_n_cut_size);
 
-		tmp = get_block_num(p_n_unfm_pointer,0);
-		put_block_num(p_n_unfm_pointer, 0, 0);
-		journal_mark_dirty (th, p_s_sb, p_s_bh);
-		reiserfs_free_block(th, inode, tmp, 1);
-		if ( item_moved (&s_ih, p_s_path) )  {
-			need_research = 1;
-			break ;
-		}
-	    }
-
-	    /* a trick.  If the buffer has been logged, this
-	    ** will do nothing.  If we've broken the loop without
-	    ** logging it, it will restore the buffer
-	    **
-	    */
-	    reiserfs_restore_prepared_buffer(p_s_sb, p_s_bh);
-
-	    /* This loop can be optimized. */
-	} while ( (*p_n_removed < n_unfm_number || need_research) &&
-		  search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_FOUND );
-
-	RFALSE( *p_n_removed < n_unfm_number, 
-		"PAP-5310: indirect item is not found");
-	RFALSE( item_moved (&s_ih, p_s_path), 
-		"after while, comp failed, retry") ;
-
-	if (c_mode == M_CUT)
-	    pos_in_item (p_s_path) *= UNFM_P_SIZE;
-	return c_mode;
-    }
+	/* Direct item. */
+	if (is_direct_le_ih(p_le_ih))
+		return prepare_for_direct_item(p_s_path, p_le_ih, inode,
+					       n_new_file_length, p_n_cut_size);
+
+	/* Case of an indirect item. */
+	{
+		int n_unfm_number,	/* Number of the item unformatted nodes. */
+		 n_counter, n_blk_size;
+		__le32 *p_n_unfm_pointer;	/* Pointer to the unformatted node number. */
+		__u32 tmp;
+		struct item_head s_ih;	/* Item header. */
+		char c_mode;	/* Returned mode of the balance. */
+		int need_research;
+
+		n_blk_size = p_s_sb->s_blocksize;
+
+		/* Search for the needed object indirect item until there are no unformatted nodes to be removed. */
+		do {
+			need_research = 0;
+			p_s_bh = PATH_PLAST_BUFFER(p_s_path);
+			/* Copy indirect item header to a temp variable. */
+			copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
+			/* Calculate number of unformatted nodes in this item. */
+			n_unfm_number = I_UNFM_NUM(&s_ih);
+
+			RFALSE(!is_indirect_le_ih(&s_ih) || !n_unfm_number ||
+			       pos_in_item(p_s_path) + 1 != n_unfm_number,
+			       "PAP-5240: invalid item %h "
+			       "n_unfm_number = %d *p_n_pos_in_item = %d",
+			       &s_ih, n_unfm_number, pos_in_item(p_s_path));
+
+			/* Calculate balance mode and position in the item to remove unformatted nodes. */
+			if (n_new_file_length == max_reiserfs_offset(inode)) {	/* Case of delete. */
+				pos_in_item(p_s_path) = 0;
+				*p_n_cut_size = -(IH_SIZE + ih_item_len(&s_ih));
+				c_mode = M_DELETE;
+			} else {	/* Case of truncate. */
+				if (n_new_file_length < le_ih_k_offset(&s_ih)) {
+					pos_in_item(p_s_path) = 0;
+					*p_n_cut_size =
+					    -(IH_SIZE + ih_item_len(&s_ih));
+					c_mode = M_DELETE;	/* Delete this item. */
+				} else {
+					/* indirect item must be truncated starting from *p_n_pos_in_item-th position */
+					pos_in_item(p_s_path) =
+					    (n_new_file_length + n_blk_size -
+					     le_ih_k_offset(&s_ih)) >> p_s_sb->
+					    s_blocksize_bits;
+
+					RFALSE(pos_in_item(p_s_path) >
+					       n_unfm_number,
+					       "PAP-5250: invalid position in the item");
+
+					/* Either convert last unformatted node of indirect item to direct item or increase
+					   its free space.  */
+					if (pos_in_item(p_s_path) ==
+					    n_unfm_number) {
+						*p_n_cut_size = 0;	/* Nothing to cut. */
+						return M_CONVERT;	/* Maybe convert last unformatted node to the direct item. */
+					}
+					/* Calculate size to cut. */
+					*p_n_cut_size =
+					    -(ih_item_len(&s_ih) -
+					      pos_in_item(p_s_path) *
+					      UNFM_P_SIZE);
+
+					c_mode = M_CUT;	/* Cut from this indirect item. */
+				}
+			}
+
+			RFALSE(n_unfm_number <= pos_in_item(p_s_path),
+			       "PAP-5260: invalid position in the indirect item");
+
+			/* pointers to be cut */
+			n_unfm_number -= pos_in_item(p_s_path);
+			/* Set pointer to the last unformatted node pointer that is to be cut. */
+			p_n_unfm_pointer =
+			    (__le32 *) B_I_PITEM(p_s_bh,
+						 &s_ih) + I_UNFM_NUM(&s_ih) -
+			    1 - *p_n_removed;
+
+			/* We go through the unformatted nodes pointers of the indirect
+			   item and look for the unformatted nodes in the cache. If we
+			   found some of them we free it, zero corresponding indirect item
+			   entry and log buffer containing that indirect item. For this we
+			   need to prepare last path element for logging. If some
+			   unformatted node has b_count > 1 we must not free this
+			   unformatted node since it is in use. */
+			reiserfs_prepare_for_journal(p_s_sb, p_s_bh, 1);
+			// note: path could be changed, first line in for loop takes care
+			// of it
+
+			for (n_counter = *p_n_removed;
+			     n_counter < n_unfm_number;
+			     n_counter++, p_n_unfm_pointer--) {
+
+				cond_resched();
+				if (item_moved(&s_ih, p_s_path)) {
+					need_research = 1;
+					break;
+				}
+				RFALSE(p_n_unfm_pointer <
+				       (__le32 *) B_I_PITEM(p_s_bh, &s_ih)
+				       || p_n_unfm_pointer >
+				       (__le32 *) B_I_PITEM(p_s_bh,
+							    &s_ih) +
+				       I_UNFM_NUM(&s_ih) - 1,
+				       "vs-5265: pointer out of range");
+
+				/* Hole, nothing to remove. */
+				if (!get_block_num(p_n_unfm_pointer, 0)) {
+					(*p_n_removed)++;
+					continue;
+				}
+
+				(*p_n_removed)++;
+
+				tmp = get_block_num(p_n_unfm_pointer, 0);
+				put_block_num(p_n_unfm_pointer, 0, 0);
+				journal_mark_dirty(th, p_s_sb, p_s_bh);
+				reiserfs_free_block(th, inode, tmp, 1);
+				if (item_moved(&s_ih, p_s_path)) {
+					need_research = 1;
+					break;
+				}
+			}
+
+			/* a trick.  If the buffer has been logged, this
+			 ** will do nothing.  If we've broken the loop without
+			 ** logging it, it will restore the buffer
+			 **
+			 */
+			reiserfs_restore_prepared_buffer(p_s_sb, p_s_bh);
+
+			/* This loop can be optimized. */
+		} while ((*p_n_removed < n_unfm_number || need_research) &&
+			 search_for_position_by_key(p_s_sb, p_s_item_key,
+						    p_s_path) ==
+			 POSITION_FOUND);
+
+		RFALSE(*p_n_removed < n_unfm_number,
+		       "PAP-5310: indirect item is not found");
+		RFALSE(item_moved(&s_ih, p_s_path),
+		       "after while, comp failed, retry");
+
+		if (c_mode == M_CUT)
+			pos_in_item(p_s_path) *= UNFM_P_SIZE;
+		return c_mode;
+	}
 }
 
 /* Calculate number of bytes which will be deleted or cut during balance */
-static int calc_deleted_bytes_number(
-    struct  tree_balance  * p_s_tb,
-    char                    c_mode
-    ) {
-    int                     n_del_size;
-    struct  item_head     * p_le_ih = PATH_PITEM_HEAD(p_s_tb->tb_path);
-
-    if ( is_statdata_le_ih (p_le_ih) )
-	return 0;
+static int calc_deleted_bytes_number(struct tree_balance *p_s_tb, char c_mode)
+{
+	int n_del_size;
+	struct item_head *p_le_ih = PATH_PITEM_HEAD(p_s_tb->tb_path);
+
+	if (is_statdata_le_ih(p_le_ih))
+		return 0;
+
+	n_del_size =
+	    (c_mode ==
+	     M_DELETE) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0];
+	if (is_direntry_le_ih(p_le_ih)) {
+		// return EMPTY_DIR_SIZE; /* We delete empty directories only. */
+		// we can't use EMPTY_DIR_SIZE, as old format dirs have a different
+		// empty size.  ick. FIXME, is this right?
+		//
+		return n_del_size;
+	}
 
-    n_del_size = ( c_mode == M_DELETE ) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0];
-    if ( is_direntry_le_ih (p_le_ih) ) {
-	// return EMPTY_DIR_SIZE; /* We delete emty directoris only. */
-	// we can't use EMPTY_DIR_SIZE, as old format dirs have a different
-	// empty size.  ick. FIXME, is this right?
-	//
-	return n_del_size ;
-    }
-
-    if ( is_indirect_le_ih (p_le_ih) )
-	n_del_size = (n_del_size/UNFM_P_SIZE)*
-	  (PATH_PLAST_BUFFER(p_s_tb->tb_path)->b_size);// - get_ih_free_space (p_le_ih);
-    return n_del_size;
+	if (is_indirect_le_ih(p_le_ih))
+		n_del_size = (n_del_size / UNFM_P_SIZE) * (PATH_PLAST_BUFFER(p_s_tb->tb_path)->b_size);	// - get_ih_free_space (p_le_ih);
+	return n_del_size;
 }
 
-static void init_tb_struct(
-    struct reiserfs_transaction_handle *th,
-    struct tree_balance * p_s_tb,
-    struct super_block  * p_s_sb,
-    struct path         * p_s_path,
-    int                   n_size
-    ) {
-
-    BUG_ON (!th->t_trans_id);
-
-    memset (p_s_tb,'\0',sizeof(struct tree_balance));
-    p_s_tb->transaction_handle = th ;
-    p_s_tb->tb_sb = p_s_sb;
-    p_s_tb->tb_path = p_s_path;
-    PATH_OFFSET_PBUFFER(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL;
-    PATH_OFFSET_POSITION(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0;
-    p_s_tb->insert_size[0] = n_size;
-}
+static void init_tb_struct(struct reiserfs_transaction_handle *th,
+			   struct tree_balance *p_s_tb,
+			   struct super_block *p_s_sb,
+			   struct path *p_s_path, int n_size)
+{
 
+	BUG_ON(!th->t_trans_id);
 
+	memset(p_s_tb, '\0', sizeof(struct tree_balance));
+	p_s_tb->transaction_handle = th;
+	p_s_tb->tb_sb = p_s_sb;
+	p_s_tb->tb_path = p_s_path;
+	PATH_OFFSET_PBUFFER(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL;
+	PATH_OFFSET_POSITION(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0;
+	p_s_tb->insert_size[0] = n_size;
+}
 
-void padd_item (char * item, int total_length, int length)
+void padd_item(char *item, int total_length, int length)
 {
-    int i;
+	int i;
 
-    for (i = total_length; i > length; )
-	item [--i] = 0;
+	for (i = total_length; i > length;)
+		item[--i] = 0;
 }
 
 #ifdef REISERQUOTA_DEBUG
 char key2type(struct reiserfs_key *ih)
 {
-  if (is_direntry_le_key(2, ih))
-    return 'd';
-  if (is_direct_le_key(2, ih))
-    return 'D';
-  if (is_indirect_le_key(2, ih))
-    return 'i';
-  if (is_statdata_le_key(2, ih))
-    return 's';
-  return 'u';
+	if (is_direntry_le_key(2, ih))
+		return 'd';
+	if (is_direct_le_key(2, ih))
+		return 'D';
+	if (is_indirect_le_key(2, ih))
+		return 'i';
+	if (is_statdata_le_key(2, ih))
+		return 's';
+	return 'u';
 }
 
 char head2type(struct item_head *ih)
 {
-  if (is_direntry_le_ih(ih))
-    return 'd';
-  if (is_direct_le_ih(ih))
-    return 'D';
-  if (is_indirect_le_ih(ih))
-    return 'i';
-  if (is_statdata_le_ih(ih))
-    return 's';
-  return 'u';
+	if (is_direntry_le_ih(ih))
+		return 'd';
+	if (is_direct_le_ih(ih))
+		return 'D';
+	if (is_indirect_le_ih(ih))
+		return 'i';
+	if (is_statdata_le_ih(ih))
+		return 's';
+	return 'u';
 }
 #endif
 
 /* Delete object item. */
-int reiserfs_delete_item (struct reiserfs_transaction_handle *th, 
-			  struct path * p_s_path, /* Path to the deleted item. */
-			  const struct cpu_key * p_s_item_key, /* Key to search for the deleted item.  */
-			  struct inode * p_s_inode,/* inode is here just to update i_blocks and quotas */
-			  struct buffer_head  * p_s_un_bh)    /* NULL or unformatted node pointer.    */
-{
-    struct super_block * p_s_sb = p_s_inode->i_sb;
-    struct tree_balance   s_del_balance;
-    struct item_head      s_ih;
-    struct item_head      *q_ih;
-    int			  quota_cut_bytes;
-    int                   n_ret_value,
-	n_del_size,
-	n_removed;
+int reiserfs_delete_item(struct reiserfs_transaction_handle *th, struct path *p_s_path,	/* Path to the deleted item. */
+			 const struct cpu_key *p_s_item_key,	/* Key to search for the deleted item.  */
+			 struct inode *p_s_inode,	/* inode is here just to update i_blocks and quotas */
+			 struct buffer_head *p_s_un_bh)
+{				/* NULL or unformatted node pointer.    */
+	struct super_block *p_s_sb = p_s_inode->i_sb;
+	struct tree_balance s_del_balance;
+	struct item_head s_ih;
+	struct item_head *q_ih;
+	int quota_cut_bytes;
+	int n_ret_value, n_del_size, n_removed;
 
 #ifdef CONFIG_REISERFS_CHECK
-    char                  c_mode;
-    int			n_iter = 0;
+	char c_mode;
+	int n_iter = 0;
 #endif
 
-    BUG_ON (!th->t_trans_id);
+	BUG_ON(!th->t_trans_id);
 
-    init_tb_struct(th, &s_del_balance, p_s_sb, p_s_path, 0/*size is unknown*/);
+	init_tb_struct(th, &s_del_balance, p_s_sb, p_s_path,
+		       0 /*size is unknown */ );
 
-    while ( 1 ) {
-	n_removed = 0;
+	while (1) {
+		n_removed = 0;
 
 #ifdef CONFIG_REISERFS_CHECK
-	n_iter++;
-	c_mode =
+		n_iter++;
+		c_mode =
 #endif
-	    prepare_for_delete_or_cut(th, p_s_inode, p_s_path, p_s_item_key, &n_removed, &n_del_size, max_reiserfs_offset (p_s_inode));
-
-	RFALSE( c_mode != M_DELETE, "PAP-5320: mode must be M_DELETE");
-
-	copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
-	s_del_balance.insert_size[0] = n_del_size;
-
-	n_ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
-	if ( n_ret_value != REPEAT_SEARCH )
-	    break;
-
-	PROC_INFO_INC( p_s_sb, delete_item_restarted );
+		    prepare_for_delete_or_cut(th, p_s_inode, p_s_path,
+					      p_s_item_key, &n_removed,
+					      &n_del_size,
+					      max_reiserfs_offset(p_s_inode));
+
+		RFALSE(c_mode != M_DELETE, "PAP-5320: mode must be M_DELETE");
+
+		copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
+		s_del_balance.insert_size[0] = n_del_size;
+
+		n_ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, NULL);
+		if (n_ret_value != REPEAT_SEARCH)
+			break;
+
+		PROC_INFO_INC(p_s_sb, delete_item_restarted);
+
+		// file system changed, repeat search
+		n_ret_value =
+		    search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path);
+		if (n_ret_value == IO_ERROR)
+			break;
+		if (n_ret_value == FILE_NOT_FOUND) {
+			reiserfs_warning(p_s_sb,
+					 "vs-5340: reiserfs_delete_item: "
+					 "no items of the file %K found",
+					 p_s_item_key);
+			break;
+		}
+	}			/* while (1) */
 
-	// file system changed, repeat search
-	n_ret_value = search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path);
-	if (n_ret_value == IO_ERROR)
-	    break;
-	if (n_ret_value == FILE_NOT_FOUND) {
-	    reiserfs_warning (p_s_sb, "vs-5340: reiserfs_delete_item: "
-			      "no items of the file %K found", p_s_item_key);
-	    break;
+	if (n_ret_value != CARRY_ON) {
+		unfix_nodes(&s_del_balance);
+		return 0;
+	}
+	// reiserfs_delete_item returns item length when success
+	n_ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
+	q_ih = get_ih(p_s_path);
+	quota_cut_bytes = ih_item_len(q_ih);
+
+	/* hack so the quota code doesn't have to guess if the file
+	 ** has a tail.  On tail insert, we allocate quota for 1 unformatted node.
+	 ** We test the offset because the tail might have been
+	 ** split into multiple items, and we only want to decrement for
+	 ** the unfm node once
+	 */
+	if (!S_ISLNK(p_s_inode->i_mode) && is_direct_le_ih(q_ih)) {
+		if ((le_ih_k_offset(q_ih) & (p_s_sb->s_blocksize - 1)) == 1) {
+			quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE;
+		} else {
+			quota_cut_bytes = 0;
+		}
 	}
-    } /* while (1) */
 
-    if ( n_ret_value != CARRY_ON ) {
-	unfix_nodes(&s_del_balance);
-	return 0;
-    }
-
-    // reiserfs_delete_item returns item length when success
-    n_ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE);
-    q_ih = get_ih(p_s_path) ;
-    quota_cut_bytes = ih_item_len(q_ih) ;
-
-    /* hack so the quota code doesn't have to guess if the file
-    ** has a tail.  On tail insert, we allocate quota for 1 unformatted node.
-    ** We test the offset because the tail might have been
-    ** split into multiple items, and we only want to decrement for
-    ** the unfm node once
-    */
-    if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(q_ih)) {
-        if ((le_ih_k_offset(q_ih) & (p_s_sb->s_blocksize - 1)) == 1) {
-            quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE;
-        } else {
-	    quota_cut_bytes = 0 ;
+	if (p_s_un_bh) {
+		int off;
+		char *data;
+
+		/* We are in direct2indirect conversion, so move tail contents
+		   to the unformatted node */
+		/* note, we do the copy before preparing the buffer because we
+		 ** don't care about the contents of the unformatted node yet.
+		 ** the only thing we really care about is the direct item's data
+		 ** is in the unformatted node.
+		 **
+		 ** Otherwise, we would have to call reiserfs_prepare_for_journal on
+		 ** the unformatted node, which might schedule, meaning we'd have to
+		 ** loop all the way back up to the start of the while loop.
+		 **
+		 ** The unformatted node must be dirtied later on.  We can't be
+		 ** sure here if the entire tail has been deleted yet.
+		 **
+		 ** p_s_un_bh is from the page cache (all unformatted nodes are
+		 ** from the page cache) and might be a highmem page.  So, we
+		 ** can't use p_s_un_bh->b_data.
+		 ** -clm
+		 */
+
+		data = kmap_atomic(p_s_un_bh->b_page, KM_USER0);
+		off = ((le_ih_k_offset(&s_ih) - 1) & (PAGE_CACHE_SIZE - 1));
+		memcpy(data + off,
+		       B_I_PITEM(PATH_PLAST_BUFFER(p_s_path), &s_ih),
+		       n_ret_value);
+		kunmap_atomic(data, KM_USER0);
 	}
-    }
-
-    if ( p_s_un_bh )  {
-	int off;
-        char *data ;
-
-	/* We are in direct2indirect conversion, so move tail contents
-           to the unformatted node */
-	/* note, we do the copy before preparing the buffer because we
-	** don't care about the contents of the unformatted node yet.
-	** the only thing we really care about is the direct item's data
-	** is in the unformatted node.
-	**
-	** Otherwise, we would have to call reiserfs_prepare_for_journal on
-	** the unformatted node, which might schedule, meaning we'd have to
-	** loop all the way back up to the start of the while loop.
-	**
-	** The unformatted node must be dirtied later on.  We can't be
-	** sure here if the entire tail has been deleted yet.
-        **
-        ** p_s_un_bh is from the page cache (all unformatted nodes are
-        ** from the page cache) and might be a highmem page.  So, we
-        ** can't use p_s_un_bh->b_data.
-	** -clm
-	*/
-
-        data = kmap_atomic(p_s_un_bh->b_page, KM_USER0);
-	off = ((le_ih_k_offset (&s_ih) - 1) & (PAGE_CACHE_SIZE - 1));
-	memcpy(data + off,
-	       B_I_PITEM(PATH_PLAST_BUFFER(p_s_path), &s_ih), n_ret_value);
-	kunmap_atomic(data, KM_USER0);
-    }
-    /* Perform balancing after all resources have been collected at once. */ 
-    do_balance(&s_del_balance, NULL, NULL, M_DELETE);
+	/* Perform balancing after all resources have been collected at once. */
+	do_balance(&s_del_balance, NULL, NULL, M_DELETE);
 
 #ifdef REISERQUOTA_DEBUG
-    reiserfs_debug (p_s_sb, REISERFS_DEBUG_CODE, "reiserquota delete_item(): freeing %u, id=%u type=%c", quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih));
+	reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE,
+		       "reiserquota delete_item(): freeing %u, id=%u type=%c",
+		       quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih));
 #endif
-    DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
+	DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
 
-    /* Return deleted body length */
-    return n_ret_value;
+	/* Return deleted body length */
+	return n_ret_value;
 }
 
-
 /* Summary Of Mechanisms For Handling Collisions Between Processes:
 
  deletion of the body of the object is performed by iput(), with the
@@ -1347,727 +1387,804 @@ int reiserfs_delete_item (struct reiserfs_transaction_handle *th,
  - Hans
 */
 
-
 /* this deletes item which never gets split */
-void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th,
-				 struct inode *inode,
-				 struct reiserfs_key * key)
+void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
+				struct inode *inode, struct reiserfs_key *key)
 {
-    struct tree_balance tb;
-    INITIALIZE_PATH (path);
-    int item_len = 0;
-    int tb_init = 0 ;
-    struct cpu_key cpu_key;
-    int retval;
-    int quota_cut_bytes = 0;
-
-    BUG_ON (!th->t_trans_id);
-    
-    le_key2cpu_key (&cpu_key, key);
-    
-    while (1) {
-	retval = search_item (th->t_super, &cpu_key, &path);
-	if (retval == IO_ERROR) {
-	    reiserfs_warning (th->t_super,
-			      "vs-5350: reiserfs_delete_solid_item: "
-			      "i/o failure occurred trying to delete %K",
-			      &cpu_key);
-	    break;
-	}
-	if (retval != ITEM_FOUND) {
-	    pathrelse (&path);
-	    // No need for a warning, if there is just no free space to insert '..' item into the newly-created subdir
-	    if ( !( (unsigned long long) GET_HASH_VALUE (le_key_k_offset (le_key_version (key), key)) == 0 && \
-		 (unsigned long long) GET_GENERATION_NUMBER (le_key_k_offset (le_key_version (key), key)) == 1 ) )
-		reiserfs_warning (th->t_super, "vs-5355: reiserfs_delete_solid_item: %k not found", key);
-	    break;
-	}
-	if (!tb_init) {
-	    tb_init = 1 ;
-	    item_len = ih_item_len( PATH_PITEM_HEAD(&path) );
-	    init_tb_struct (th, &tb, th->t_super, &path, - (IH_SIZE + item_len));
-	}
-	quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path)) ;
+	struct tree_balance tb;
+	INITIALIZE_PATH(path);
+	int item_len = 0;
+	int tb_init = 0;
+	struct cpu_key cpu_key;
+	int retval;
+	int quota_cut_bytes = 0;
+
+	BUG_ON(!th->t_trans_id);
+
+	le_key2cpu_key(&cpu_key, key);
+
+	while (1) {
+		retval = search_item(th->t_super, &cpu_key, &path);
+		if (retval == IO_ERROR) {
+			reiserfs_warning(th->t_super,
+					 "vs-5350: reiserfs_delete_solid_item: "
+					 "i/o failure occurred trying to delete %K",
+					 &cpu_key);
+			break;
+		}
+		if (retval != ITEM_FOUND) {
+			pathrelse(&path);
+			// No need for a warning, if there is just no free space to insert '..' item into the newly-created subdir
+			if (!
+			    ((unsigned long long)
+			     GET_HASH_VALUE(le_key_k_offset
+					    (le_key_version(key), key)) == 0
+			     && (unsigned long long)
+			     GET_GENERATION_NUMBER(le_key_k_offset
+						   (le_key_version(key),
+						    key)) == 1))
+				reiserfs_warning(th->t_super,
+						 "vs-5355: reiserfs_delete_solid_item: %k not found",
+						 key);
+			break;
+		}
+		if (!tb_init) {
+			tb_init = 1;
+			item_len = ih_item_len(PATH_PITEM_HEAD(&path));
+			init_tb_struct(th, &tb, th->t_super, &path,
+				       -(IH_SIZE + item_len));
+		}
+		quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path));
 
-	retval = fix_nodes (M_DELETE, &tb, NULL, NULL);
-	if (retval == REPEAT_SEARCH) {
-	    PROC_INFO_INC( th -> t_super, delete_solid_item_restarted );
-	    continue;
-	}
+		retval = fix_nodes(M_DELETE, &tb, NULL, NULL);
+		if (retval == REPEAT_SEARCH) {
+			PROC_INFO_INC(th->t_super, delete_solid_item_restarted);
+			continue;
+		}
 
-	if (retval == CARRY_ON) {
-	    do_balance (&tb, NULL, NULL, M_DELETE);
-	    if (inode) {	/* Should we count quota for item? (we don't count quotas for save-links) */
+		if (retval == CARRY_ON) {
+			do_balance(&tb, NULL, NULL, M_DELETE);
+			if (inode) {	/* Should we count quota for item? (we don't count quotas for save-links) */
 #ifdef REISERQUOTA_DEBUG
-		reiserfs_debug (th->t_super, REISERFS_DEBUG_CODE, "reiserquota delete_solid_item(): freeing %u id=%u type=%c", quota_cut_bytes, inode->i_uid, key2type(key));
+				reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
+					       "reiserquota delete_solid_item(): freeing %u id=%u type=%c",
+					       quota_cut_bytes, inode->i_uid,
+					       key2type(key));
 #endif
-		DQUOT_FREE_SPACE_NODIRTY(inode, quota_cut_bytes);
-	    }
-	    break;
+				DQUOT_FREE_SPACE_NODIRTY(inode,
+							 quota_cut_bytes);
+			}
+			break;
+		}
+		// IO_ERROR, NO_DISK_SPACE, etc
+		reiserfs_warning(th->t_super,
+				 "vs-5360: reiserfs_delete_solid_item: "
+				 "could not delete %K due to fix_nodes failure",
+				 &cpu_key);
+		unfix_nodes(&tb);
+		break;
 	}
 
-	// IO_ERROR, NO_DISK_SPACE, etc
-	reiserfs_warning (th->t_super, "vs-5360: reiserfs_delete_solid_item: "
-			  "could not delete %K due to fix_nodes failure", &cpu_key);
-	unfix_nodes (&tb);
-	break;
-    }
-
-    reiserfs_check_path(&path) ;
+	reiserfs_check_path(&path);
 }
 
-
-int reiserfs_delete_object (struct reiserfs_transaction_handle *th, struct inode * inode)
+int reiserfs_delete_object(struct reiserfs_transaction_handle *th,
+			   struct inode *inode)
 {
-    int err;
-    inode->i_size = 0;
-    BUG_ON (!th->t_trans_id);
-
-    /* for directory this deletes item containing "." and ".." */
-    err = reiserfs_do_truncate (th, inode, NULL, 0/*no timestamp updates*/);
-    if (err)
-        return err;
-    
+	int err;
+	inode->i_size = 0;
+	BUG_ON(!th->t_trans_id);
+
+	/* for directory this deletes item containing "." and ".." */
+	err =
+	    reiserfs_do_truncate(th, inode, NULL, 0 /*no timestamp updates */ );
+	if (err)
+		return err;
+
 #if defined( USE_INODE_GENERATION_COUNTER )
-    if( !old_format_only ( th -> t_super ) )
-      {
-       __le32 *inode_generation;
-       
-       inode_generation = 
-         &REISERFS_SB(th -> t_super) -> s_rs -> s_inode_generation;
-       *inode_generation = cpu_to_le32( le32_to_cpu( *inode_generation ) + 1 );
-      }
+	if (!old_format_only(th->t_super)) {
+		__le32 *inode_generation;
+
+		inode_generation =
+		    &REISERFS_SB(th->t_super)->s_rs->s_inode_generation;
+		*inode_generation =
+		    cpu_to_le32(le32_to_cpu(*inode_generation) + 1);
+	}
 /* USE_INODE_GENERATION_COUNTER */
 #endif
-    reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode));
+	reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
 
-    return err;
+	return err;
 }
 
-static void
-unmap_buffers(struct page *page, loff_t pos) {
-    struct buffer_head *bh ;
-    struct buffer_head *head ;
-    struct buffer_head *next ;
-    unsigned long tail_index ;
-    unsigned long cur_index ;
-
-    if (page) {
-	if (page_has_buffers(page)) {
-	    tail_index = pos & (PAGE_CACHE_SIZE - 1) ;
-	    cur_index = 0 ;
-	    head = page_buffers(page) ;
-	    bh = head ;
-	    do {
-		next = bh->b_this_page ;
-
-		/* we want to unmap the buffers that contain the tail, and
-		** all the buffers after it (since the tail must be at the
-		** end of the file).  We don't want to unmap file data
-		** before the tail, since it might be dirty and waiting to
-		** reach disk
-		*/
-		cur_index += bh->b_size ;
-		if (cur_index > tail_index) {
-		    reiserfs_unmap_buffer(bh) ;
+static void unmap_buffers(struct page *page, loff_t pos)
+{
+	struct buffer_head *bh;
+	struct buffer_head *head;
+	struct buffer_head *next;
+	unsigned long tail_index;
+	unsigned long cur_index;
+
+	if (page) {
+		if (page_has_buffers(page)) {
+			tail_index = pos & (PAGE_CACHE_SIZE - 1);
+			cur_index = 0;
+			head = page_buffers(page);
+			bh = head;
+			do {
+				next = bh->b_this_page;
+
+				/* we want to unmap the buffers that contain the tail, and
+				 ** all the buffers after it (since the tail must be at the
+				 ** end of the file).  We don't want to unmap file data
+				 ** before the tail, since it might be dirty and waiting to
+				 ** reach disk
+				 */
+				cur_index += bh->b_size;
+				if (cur_index > tail_index) {
+					reiserfs_unmap_buffer(bh);
+				}
+				bh = next;
+			} while (bh != head);
+			if (PAGE_SIZE == bh->b_size) {
+				clear_page_dirty(page);
+			}
 		}
-		bh = next ;
-	    } while (bh != head) ;
-	    if ( PAGE_SIZE == bh->b_size ) {
-		clear_page_dirty(page);
-	    }
 	}
-    }
 }
 
-static int maybe_indirect_to_direct (struct reiserfs_transaction_handle *th, 
-			      struct inode * p_s_inode,
-			      struct page *page, 
-			      struct path         * p_s_path,
-			      const struct cpu_key      * p_s_item_key,
-			      loff_t         n_new_file_size,
-			      char                * p_c_mode
-			      ) {
-    struct super_block * p_s_sb = p_s_inode->i_sb;
-    int n_block_size = p_s_sb->s_blocksize;
-    int cut_bytes;
-    BUG_ON (!th->t_trans_id);
-
-    if (n_new_file_size != p_s_inode->i_size)
-	BUG ();
-
-    /* the page being sent in could be NULL if there was an i/o error
-    ** reading in the last block.  The user will hit problems trying to
-    ** read the file, but for now we just skip the indirect2direct
-    */
-    if (atomic_read(&p_s_inode->i_count) > 1 || 
-        !tail_has_to_be_packed (p_s_inode) || 
-	!page || (REISERFS_I(p_s_inode)->i_flags & i_nopack_mask)) {
-	// leave tail in an unformatted node	
-	*p_c_mode = M_SKIP_BALANCING;
-	cut_bytes = n_block_size - (n_new_file_size & (n_block_size - 1));
-	pathrelse(p_s_path);
-	return cut_bytes;
-    }
-    /* Permorm the conversion to a direct_item. */
-    /*return indirect_to_direct (p_s_inode, p_s_path, p_s_item_key, n_new_file_size, p_c_mode);*/
-    return indirect2direct (th, p_s_inode, page, p_s_path, p_s_item_key, n_new_file_size, p_c_mode);
-}
+static int maybe_indirect_to_direct(struct reiserfs_transaction_handle *th,
+				    struct inode *p_s_inode,
+				    struct page *page,
+				    struct path *p_s_path,
+				    const struct cpu_key *p_s_item_key,
+				    loff_t n_new_file_size, char *p_c_mode)
+{
+	struct super_block *p_s_sb = p_s_inode->i_sb;
+	int n_block_size = p_s_sb->s_blocksize;
+	int cut_bytes;
+	BUG_ON(!th->t_trans_id);
+
+	if (n_new_file_size != p_s_inode->i_size)
+		BUG();
 
+	/* the page being sent in could be NULL if there was an i/o error
+	 ** reading in the last block.  The user will hit problems trying to
+	 ** read the file, but for now we just skip the indirect2direct
+	 */
+	if (atomic_read(&p_s_inode->i_count) > 1 ||
+	    !tail_has_to_be_packed(p_s_inode) ||
+	    !page || (REISERFS_I(p_s_inode)->i_flags & i_nopack_mask)) {
+		// leave tail in an unformatted node    
+		*p_c_mode = M_SKIP_BALANCING;
+		cut_bytes =
+		    n_block_size - (n_new_file_size & (n_block_size - 1));
+		pathrelse(p_s_path);
+		return cut_bytes;
+	}
+	/* Perform the conversion to a direct_item. */
+	/*return indirect_to_direct (p_s_inode, p_s_path, p_s_item_key, n_new_file_size, p_c_mode); */
+	return indirect2direct(th, p_s_inode, page, p_s_path, p_s_item_key,
+			       n_new_file_size, p_c_mode);
+}
 
 /* we did indirect_to_direct conversion. And we have inserted direct
    item successesfully, but there were no disk space to cut unfm
    pointer being converted. Therefore we have to delete inserted
    direct item(s) */
-static void indirect_to_direct_roll_back (struct reiserfs_transaction_handle *th, struct inode * inode, struct path * path)
+static void indirect_to_direct_roll_back(struct reiserfs_transaction_handle *th,
+					 struct inode *inode, struct path *path)
 {
-    struct cpu_key tail_key;
-    int tail_len;
-    int removed;
-    BUG_ON (!th->t_trans_id);
-
-    make_cpu_key (&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);// !!!!
-    tail_key.key_length = 4;
-
-    tail_len = (cpu_key_k_offset (&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1;
-    while (tail_len) {
-	/* look for the last byte of the tail */
-	if (search_for_position_by_key (inode->i_sb, &tail_key, path) == POSITION_NOT_FOUND)
-	    reiserfs_panic (inode->i_sb, "vs-5615: indirect_to_direct_roll_back: found invalid item");
-	RFALSE( path->pos_in_item != ih_item_len(PATH_PITEM_HEAD (path)) - 1,
-	        "vs-5616: appended bytes found");
-	PATH_LAST_POSITION (path) --;
-	
-	removed = reiserfs_delete_item (th, path, &tail_key, inode, NULL/*unbh not needed*/);
-	RFALSE( removed <= 0 || removed > tail_len,
-	        "vs-5617: there was tail %d bytes, removed item length %d bytes",
-                tail_len, removed);
-	tail_len -= removed;
-	set_cpu_key_k_offset (&tail_key, cpu_key_k_offset (&tail_key) - removed);
-    }
-    reiserfs_warning (inode->i_sb, "indirect_to_direct_roll_back: indirect_to_direct conversion has been rolled back due to lack of disk space");
-    //mark_file_without_tail (inode);
-    mark_inode_dirty (inode);
+	struct cpu_key tail_key;
+	int tail_len;
+	int removed;
+	BUG_ON(!th->t_trans_id);
+
+	make_cpu_key(&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);	// !!!!
+	tail_key.key_length = 4;
+
+	tail_len =
+	    (cpu_key_k_offset(&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1;
+	while (tail_len) {
+		/* look for the last byte of the tail */
+		if (search_for_position_by_key(inode->i_sb, &tail_key, path) ==
+		    POSITION_NOT_FOUND)
+			reiserfs_panic(inode->i_sb,
+				       "vs-5615: indirect_to_direct_roll_back: found invalid item");
+		RFALSE(path->pos_in_item !=
+		       ih_item_len(PATH_PITEM_HEAD(path)) - 1,
+		       "vs-5616: appended bytes found");
+		PATH_LAST_POSITION(path)--;
+
+		removed =
+		    reiserfs_delete_item(th, path, &tail_key, inode,
+					 NULL /*unbh not needed */ );
+		RFALSE(removed <= 0
+		       || removed > tail_len,
+		       "vs-5617: there was tail %d bytes, removed item length %d bytes",
+		       tail_len, removed);
+		tail_len -= removed;
+		set_cpu_key_k_offset(&tail_key,
+				     cpu_key_k_offset(&tail_key) - removed);
+	}
+	reiserfs_warning(inode->i_sb,
+			 "indirect_to_direct_roll_back: indirect_to_direct conversion has been rolled back due to lack of disk space");
+	//mark_file_without_tail (inode);
+	mark_inode_dirty(inode);
 }
 
-
 /* (Truncate or cut entry) or delete object item. Returns < 0 on failure */
-int reiserfs_cut_from_item (struct reiserfs_transaction_handle *th, 
-			    struct path * p_s_path,
-			    struct cpu_key * p_s_item_key,
-			    struct inode * p_s_inode,
-			    struct page *page, 
-			    loff_t n_new_file_size)
+int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
+			   struct path *p_s_path,
+			   struct cpu_key *p_s_item_key,
+			   struct inode *p_s_inode,
+			   struct page *page, loff_t n_new_file_size)
 {
-    struct super_block * p_s_sb = p_s_inode->i_sb;
-    /* Every function which is going to call do_balance must first
-       create a tree_balance structure.  Then it must fill up this
-       structure by using the init_tb_struct and fix_nodes functions.
-       After that we can make tree balancing. */
-    struct tree_balance s_cut_balance;
-    struct item_head *p_le_ih;
-    int n_cut_size = 0,        /* Amount to be cut. */
-	n_ret_value = CARRY_ON,
-	n_removed = 0,     /* Number of the removed unformatted nodes. */
-	n_is_inode_locked = 0;
-    char                c_mode;            /* Mode of the balance. */
-    int retval2 = -1;
-    int quota_cut_bytes;
-    loff_t tail_pos = 0;
-
-    BUG_ON (!th->t_trans_id);
-    
-    init_tb_struct(th, &s_cut_balance, p_s_inode->i_sb, p_s_path, n_cut_size);
-
-
-    /* Repeat this loop until we either cut the item without needing
-       to balance, or we fix_nodes without schedule occurring */
-    while ( 1 ) {
-	/* Determine the balance mode, position of the first byte to
-	   be cut, and size to be cut.  In case of the indirect item
-	   free unformatted nodes which are pointed to by the cut
-	   pointers. */
-      
-	c_mode = prepare_for_delete_or_cut(th, p_s_inode, p_s_path, p_s_item_key, &n_removed, 
-					   &n_cut_size, n_new_file_size);
-	if ( c_mode == M_CONVERT )  {
-	    /* convert last unformatted node to direct item or leave
-               tail in the unformatted node */
-	    RFALSE( n_ret_value != CARRY_ON, "PAP-5570: can not convert twice");
-
-	    n_ret_value = maybe_indirect_to_direct (th, p_s_inode, page, p_s_path, p_s_item_key,
-						    n_new_file_size, &c_mode);
-	    if ( c_mode == M_SKIP_BALANCING )
-		/* tail has been left in the unformatted node */
-		return n_ret_value;
-
-	    n_is_inode_locked = 1;
-	  
-	    /* removing of last unformatted node will change value we
-               have to return to truncate. Save it */
-	    retval2 = n_ret_value;
-	    /*retval2 = p_s_sb->s_blocksize - (n_new_file_size & (p_s_sb->s_blocksize - 1));*/
-	  
-	    /* So, we have performed the first part of the conversion:
-	       inserting the new direct item.  Now we are removing the
-	       last unformatted node pointer. Set key to search for
-	       it. */
-      	    set_cpu_key_k_type (p_s_item_key, TYPE_INDIRECT);
-	    p_s_item_key->key_length = 4;
-	    n_new_file_size -= (n_new_file_size & (p_s_sb->s_blocksize - 1));
-	    tail_pos = n_new_file_size;
-	    set_cpu_key_k_offset (p_s_item_key, n_new_file_size + 1);
-	    if ( search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND ){
-		print_block (PATH_PLAST_BUFFER (p_s_path), 3, PATH_LAST_POSITION (p_s_path) - 1, PATH_LAST_POSITION (p_s_path) + 1);
-		reiserfs_panic(p_s_sb, "PAP-5580: reiserfs_cut_from_item: item to convert does not exist (%K)", p_s_item_key);
-	    }
-	    continue;
-	}
-	if (n_cut_size == 0) {
-	    pathrelse (p_s_path);
-	    return 0;
-	}
+	struct super_block *p_s_sb = p_s_inode->i_sb;
+	/* Every function which is going to call do_balance must first
+	   create a tree_balance structure.  Then it must fill up this
+	   structure by using the init_tb_struct and fix_nodes functions.
+	   After that we can make tree balancing. */
+	struct tree_balance s_cut_balance;
+	struct item_head *p_le_ih;
+	int n_cut_size = 0,	/* Amount to be cut. */
+	    n_ret_value = CARRY_ON, n_removed = 0,	/* Number of the removed unformatted nodes. */
+	    n_is_inode_locked = 0;
+	char c_mode;		/* Mode of the balance. */
+	int retval2 = -1;
+	int quota_cut_bytes;
+	loff_t tail_pos = 0;
+
+	BUG_ON(!th->t_trans_id);
+
+	init_tb_struct(th, &s_cut_balance, p_s_inode->i_sb, p_s_path,
+		       n_cut_size);
+
+	/* Repeat this loop until we either cut the item without needing
+	   to balance, or we fix_nodes without schedule occurring */
+	while (1) {
+		/* Determine the balance mode, position of the first byte to
+		   be cut, and size to be cut.  In case of the indirect item
+		   free unformatted nodes which are pointed to by the cut
+		   pointers. */
+
+		c_mode =
+		    prepare_for_delete_or_cut(th, p_s_inode, p_s_path,
+					      p_s_item_key, &n_removed,
+					      &n_cut_size, n_new_file_size);
+		if (c_mode == M_CONVERT) {
+			/* convert last unformatted node to direct item or leave
+			   tail in the unformatted node */
+			RFALSE(n_ret_value != CARRY_ON,
+			       "PAP-5570: can not convert twice");
+
+			n_ret_value =
+			    maybe_indirect_to_direct(th, p_s_inode, page,
+						     p_s_path, p_s_item_key,
+						     n_new_file_size, &c_mode);
+			if (c_mode == M_SKIP_BALANCING)
+				/* tail has been left in the unformatted node */
+				return n_ret_value;
+
+			n_is_inode_locked = 1;
+
+			/* removing of last unformatted node will change value we
+			   have to return to truncate. Save it */
+			retval2 = n_ret_value;
+			/*retval2 = p_s_sb->s_blocksize - (n_new_file_size & (p_s_sb->s_blocksize - 1)); */
+
+			/* So, we have performed the first part of the conversion:
+			   inserting the new direct item.  Now we are removing the
+			   last unformatted node pointer. Set key to search for
+			   it. */
+			set_cpu_key_k_type(p_s_item_key, TYPE_INDIRECT);
+			p_s_item_key->key_length = 4;
+			n_new_file_size -=
+			    (n_new_file_size & (p_s_sb->s_blocksize - 1));
+			tail_pos = n_new_file_size;
+			set_cpu_key_k_offset(p_s_item_key, n_new_file_size + 1);
+			if (search_for_position_by_key
+			    (p_s_sb, p_s_item_key,
+			     p_s_path) == POSITION_NOT_FOUND) {
+				print_block(PATH_PLAST_BUFFER(p_s_path), 3,
+					    PATH_LAST_POSITION(p_s_path) - 1,
+					    PATH_LAST_POSITION(p_s_path) + 1);
+				reiserfs_panic(p_s_sb,
+					       "PAP-5580: reiserfs_cut_from_item: item to convert does not exist (%K)",
+					       p_s_item_key);
+			}
+			continue;
+		}
+		if (n_cut_size == 0) {
+			pathrelse(p_s_path);
+			return 0;
+		}
+
+		s_cut_balance.insert_size[0] = n_cut_size;
+
+		n_ret_value = fix_nodes(c_mode, &s_cut_balance, NULL, NULL);
+		if (n_ret_value != REPEAT_SEARCH)
+			break;
+
+		PROC_INFO_INC(p_s_sb, cut_from_item_restarted);
+
+		n_ret_value =
+		    search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path);
+		if (n_ret_value == POSITION_FOUND)
+			continue;
 
-	s_cut_balance.insert_size[0] = n_cut_size;
-	
-	n_ret_value = fix_nodes(c_mode, &s_cut_balance, NULL, NULL);
-      	if ( n_ret_value != REPEAT_SEARCH )
-	    break;
-	
-	PROC_INFO_INC( p_s_sb, cut_from_item_restarted );
-
-	n_ret_value = search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path);
-	if (n_ret_value == POSITION_FOUND)
-	    continue;
-
-	reiserfs_warning (p_s_sb, "PAP-5610: reiserfs_cut_from_item: item %K not found", p_s_item_key);
-	unfix_nodes (&s_cut_balance);
-	return (n_ret_value == IO_ERROR) ? -EIO : -ENOENT;
-    } /* while */
-  
-    // check fix_nodes results (IO_ERROR or NO_DISK_SPACE)
-    if ( n_ret_value != CARRY_ON ) {
-	if ( n_is_inode_locked ) {
-	    // FIXME: this seems to be not needed: we are always able
-	    // to cut item
-	    indirect_to_direct_roll_back (th, p_s_inode, p_s_path);
+		reiserfs_warning(p_s_sb,
+				 "PAP-5610: reiserfs_cut_from_item: item %K not found",
+				 p_s_item_key);
+		unfix_nodes(&s_cut_balance);
+		return (n_ret_value == IO_ERROR) ? -EIO : -ENOENT;
+	}			/* while */
+
+	// check fix_nodes results (IO_ERROR or NO_DISK_SPACE)
+	if (n_ret_value != CARRY_ON) {
+		if (n_is_inode_locked) {
+			// FIXME: this seems to be not needed: we are always able
+			// to cut item
+			indirect_to_direct_roll_back(th, p_s_inode, p_s_path);
+		}
+		if (n_ret_value == NO_DISK_SPACE)
+			reiserfs_warning(p_s_sb, "NO_DISK_SPACE");
+		unfix_nodes(&s_cut_balance);
+		return -EIO;
 	}
-	if (n_ret_value == NO_DISK_SPACE)
-	    reiserfs_warning (p_s_sb, "NO_DISK_SPACE");
-	unfix_nodes (&s_cut_balance);
-	return -EIO;
-    }
-
-    /* go ahead and perform balancing */
-    
-    RFALSE( c_mode == M_PASTE || c_mode == M_INSERT, "invalid mode");
-
-    /* Calculate number of bytes that need to be cut from the item. */
-    quota_cut_bytes = ( c_mode == M_DELETE ) ? ih_item_len(get_ih(p_s_path)) : -s_cut_balance.insert_size[0];
-    if (retval2 == -1)
-	n_ret_value = calc_deleted_bytes_number(&s_cut_balance, c_mode);
-    else
-	n_ret_value = retval2;
-
-
-    /* For direct items, we only change the quota when deleting the last
-    ** item.
-    */
-    p_le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path);
-    if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(p_le_ih)) {
-        if (c_mode == M_DELETE &&
-	   (le_ih_k_offset (p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1 ) {
-	    // FIXME: this is to keep 3.5 happy
-	    REISERFS_I(p_s_inode)->i_first_direct_byte = U32_MAX;
-	    quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE ;
-        } else {
-	    quota_cut_bytes = 0 ;
+
+	/* go ahead and perform balancing */
+
+	RFALSE(c_mode == M_PASTE || c_mode == M_INSERT, "invalid mode");
+
+	/* Calculate number of bytes that need to be cut from the item. */
+	quota_cut_bytes =
+	    (c_mode ==
+	     M_DELETE) ? ih_item_len(get_ih(p_s_path)) : -s_cut_balance.
+	    insert_size[0];
+	if (retval2 == -1)
+		n_ret_value = calc_deleted_bytes_number(&s_cut_balance, c_mode);
+	else
+		n_ret_value = retval2;
+
+	/* For direct items, we only change the quota when deleting the last
+	 ** item.
+	 */
+	p_le_ih = PATH_PITEM_HEAD(s_cut_balance.tb_path);
+	if (!S_ISLNK(p_s_inode->i_mode) && is_direct_le_ih(p_le_ih)) {
+		if (c_mode == M_DELETE &&
+		    (le_ih_k_offset(p_le_ih) & (p_s_sb->s_blocksize - 1)) ==
+		    1) {
+			// FIXME: this is to keep 3.5 happy
+			REISERFS_I(p_s_inode)->i_first_direct_byte = U32_MAX;
+			quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE;
+		} else {
+			quota_cut_bytes = 0;
+		}
 	}
-    }
 #ifdef CONFIG_REISERFS_CHECK
-    if (n_is_inode_locked) {
-	struct item_head * le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path);
-	/* we are going to complete indirect2direct conversion. Make
-           sure, that we exactly remove last unformatted node pointer
-           of the item */
-	if (!is_indirect_le_ih (le_ih))
-	    reiserfs_panic (p_s_sb, "vs-5652: reiserfs_cut_from_item: "
-			    "item must be indirect %h", le_ih);
-
-	if (c_mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE)
-	    reiserfs_panic (p_s_sb, "vs-5653: reiserfs_cut_from_item: "
-			    "completing indirect2direct conversion indirect item %h "
-			    "being deleted must be of 4 byte long", le_ih);
-
-	if (c_mode == M_CUT && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) {
-	    reiserfs_panic (p_s_sb, "vs-5654: reiserfs_cut_from_item: "
-			    "can not complete indirect2direct conversion of %h (CUT, insert_size==%d)",
-			    le_ih, s_cut_balance.insert_size[0]);
+	if (n_is_inode_locked) {
+		struct item_head *le_ih =
+		    PATH_PITEM_HEAD(s_cut_balance.tb_path);
+		/* we are going to complete the indirect2direct conversion. Make
+		   sure that we remove exactly the last unformatted node pointer
+		   of the item */
+		if (!is_indirect_le_ih(le_ih))
+			reiserfs_panic(p_s_sb,
+				       "vs-5652: reiserfs_cut_from_item: "
+				       "item must be indirect %h", le_ih);
+
+		if (c_mode == M_DELETE && ih_item_len(le_ih) != UNFM_P_SIZE)
+			reiserfs_panic(p_s_sb,
+				       "vs-5653: reiserfs_cut_from_item: "
+				       "completing indirect2direct conversion indirect item %h "
+				       "being deleted must be of 4 byte long",
+				       le_ih);
+
+		if (c_mode == M_CUT
+		    && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) {
+			reiserfs_panic(p_s_sb,
+				       "vs-5654: reiserfs_cut_from_item: "
+				       "can not complete indirect2direct conversion of %h (CUT, insert_size==%d)",
+				       le_ih, s_cut_balance.insert_size[0]);
+		}
+		/* it would be useful to make sure that the right neighboring
+		   item is a direct item of this file */
 	}
-	/* it would be useful to make sure, that right neighboring
-           item is direct item of this file */
-    }
 #endif
-    
-    do_balance(&s_cut_balance, NULL, NULL, c_mode);
-    if ( n_is_inode_locked ) {
-	/* we've done an indirect->direct conversion.  when the data block
-	** was freed, it was removed from the list of blocks that must
-	** be flushed before the transaction commits, make sure to
-	** unmap and invalidate it
-	*/
-	unmap_buffers(page, tail_pos);
-	REISERFS_I(p_s_inode)->i_flags &= ~i_pack_on_close_mask ;
-    }
+
+	do_balance(&s_cut_balance, NULL, NULL, c_mode);
+	if (n_is_inode_locked) {
+		/* we've done an indirect->direct conversion.  when the data block
+		 ** was freed, it was removed from the list of blocks that must
+		 ** be flushed before the transaction commits, make sure to
+		 ** unmap and invalidate it
+		 */
+		unmap_buffers(page, tail_pos);
+		REISERFS_I(p_s_inode)->i_flags &= ~i_pack_on_close_mask;
+	}
 #ifdef REISERQUOTA_DEBUG
-    reiserfs_debug (p_s_inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota cut_from_item(): freeing %u id=%u type=%c", quota_cut_bytes, p_s_inode->i_uid, '?');
+	reiserfs_debug(p_s_inode->i_sb, REISERFS_DEBUG_CODE,
+		       "reiserquota cut_from_item(): freeing %u id=%u type=%c",
+		       quota_cut_bytes, p_s_inode->i_uid, '?');
 #endif
-    DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
-    return n_ret_value;
+	DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes);
+	return n_ret_value;
 }
 
-static void truncate_directory (struct reiserfs_transaction_handle *th, struct inode * inode)
+static void truncate_directory(struct reiserfs_transaction_handle *th,
+			       struct inode *inode)
 {
-    BUG_ON (!th->t_trans_id);
-    if (inode->i_nlink)
-	reiserfs_warning (inode->i_sb,
-			  "vs-5655: truncate_directory: link count != 0");
-
-    set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), DOT_OFFSET);
-    set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_DIRENTRY);
-    reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode));
-    reiserfs_update_sd(th, inode) ;
-    set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), SD_OFFSET);
-    set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_STAT_DATA);    
+	BUG_ON(!th->t_trans_id);
+	if (inode->i_nlink)
+		reiserfs_warning(inode->i_sb,
+				 "vs-5655: truncate_directory: link count != 0");
+
+	set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), DOT_OFFSET);
+	set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_DIRENTRY);
+	reiserfs_delete_solid_item(th, inode, INODE_PKEY(inode));
+	reiserfs_update_sd(th, inode);
+	set_le_key_k_offset(KEY_FORMAT_3_5, INODE_PKEY(inode), SD_OFFSET);
+	set_le_key_k_type(KEY_FORMAT_3_5, INODE_PKEY(inode), TYPE_STAT_DATA);
 }
 
+/* Truncate file to the new size. Note, this must be called with a transaction
+   already started */
+int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, struct inode *p_s_inode,	/* ->i_size contains new
+												   size */
+			 struct page *page,	/* up to date for last block */
+			 int update_timestamps	/* when it is called by
+						   file_release to convert
+						   the tail - no timestamps
+						   should be updated */
+    )
+{
+	INITIALIZE_PATH(s_search_path);	/* Path to the current object item. */
+	struct item_head *p_le_ih;	/* Pointer to an item header. */
+	struct cpu_key s_item_key;	/* Key to search for a previous file item. */
+	loff_t n_file_size,	/* Old file size. */
+	 n_new_file_size;	/* New file size. */
+	int n_deleted;		/* Number of deleted or truncated bytes. */
+	int retval;
+	int err = 0;
+
+	BUG_ON(!th->t_trans_id);
+	if (!
+	    (S_ISREG(p_s_inode->i_mode) || S_ISDIR(p_s_inode->i_mode)
+	     || S_ISLNK(p_s_inode->i_mode)))
+		return 0;
+
+	if (S_ISDIR(p_s_inode->i_mode)) {
+		// deletion of directory - no need to update timestamps
+		truncate_directory(th, p_s_inode);
+		return 0;
+	}
 
+	/* Get new file size. */
+	n_new_file_size = p_s_inode->i_size;
 
+	// FIXME: note, that key type is unimportant here
+	make_cpu_key(&s_item_key, p_s_inode, max_reiserfs_offset(p_s_inode),
+		     TYPE_DIRECT, 3);
 
-/* Truncate file to the new size. Note, this must be called with a transaction
-   already started */
-int reiserfs_do_truncate (struct reiserfs_transaction_handle *th,
-			   struct  inode * p_s_inode, /* ->i_size contains new
-                                                         size */
-			   struct page *page, /* up to date for last block */
-			   int update_timestamps  /* when it is called by
-						     file_release to convert
-						     the tail - no timestamps
-						     should be updated */
-    ) {
-    INITIALIZE_PATH (s_search_path);       /* Path to the current object item. */
-    struct item_head    * p_le_ih;         /* Pointer to an item header. */
-    struct cpu_key      s_item_key;     /* Key to search for a previous file item. */
-    loff_t         n_file_size,    /* Old file size. */
-	n_new_file_size;/* New file size. */
-    int                   n_deleted;      /* Number of deleted or truncated bytes. */
-    int retval;
-    int err = 0;
-
-    BUG_ON (!th->t_trans_id);
-    if ( ! (S_ISREG(p_s_inode->i_mode) || S_ISDIR(p_s_inode->i_mode) || S_ISLNK(p_s_inode->i_mode)) )
-	return 0;
+	retval =
+	    search_for_position_by_key(p_s_inode->i_sb, &s_item_key,
+				       &s_search_path);
+	if (retval == IO_ERROR) {
+		reiserfs_warning(p_s_inode->i_sb,
+				 "vs-5657: reiserfs_do_truncate: "
+				 "i/o failure occurred trying to truncate %K",
+				 &s_item_key);
+		err = -EIO;
+		goto out;
+	}
+	if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) {
+		reiserfs_warning(p_s_inode->i_sb,
+				 "PAP-5660: reiserfs_do_truncate: "
+				 "wrong result %d of search for %K", retval,
+				 &s_item_key);
+
+		err = -EIO;
+		goto out;
+	}
 
-    if (S_ISDIR(p_s_inode->i_mode)) {
-	// deletion of directory - no need to update timestamps
-	truncate_directory (th, p_s_inode);
-	return 0;
-    }
-
-    /* Get new file size. */
-    n_new_file_size = p_s_inode->i_size;
-
-    // FIXME: note, that key type is unimportant here
-    make_cpu_key (&s_item_key, p_s_inode, max_reiserfs_offset (p_s_inode), TYPE_DIRECT, 3);
-
-    retval = search_for_position_by_key(p_s_inode->i_sb, &s_item_key, &s_search_path);
-    if (retval == IO_ERROR) {
-	reiserfs_warning (p_s_inode->i_sb, "vs-5657: reiserfs_do_truncate: "
-			  "i/o failure occurred trying to truncate %K", &s_item_key);
-        err = -EIO;
-        goto out;
-    }
-    if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) {
-	reiserfs_warning (p_s_inode->i_sb, "PAP-5660: reiserfs_do_truncate: "
-			  "wrong result %d of search for %K", retval, &s_item_key);
-
-        err = -EIO;
-        goto out;
-    }
-
-    s_search_path.pos_in_item --;
-
-    /* Get real file size (total length of all file items) */
-    p_le_ih = PATH_PITEM_HEAD(&s_search_path);
-    if ( is_statdata_le_ih (p_le_ih) )
-	n_file_size = 0;
-    else {
-	loff_t offset = le_ih_k_offset (p_le_ih);
-	int bytes = op_bytes_number (p_le_ih,p_s_inode->i_sb->s_blocksize);
-
-	/* this may mismatch with real file size: if last direct item
-           had no padding zeros and last unformatted node had no free
-           space, this file would have this file size */
-	n_file_size = offset + bytes - 1;
-    }
-    /*
-     * are we doing a full truncate or delete, if so
-     * kick in the reada code
-     */
-    if (n_new_file_size == 0)
-        s_search_path.reada = PATH_READA | PATH_READA_BACK;
-
-    if ( n_file_size == 0 || n_file_size < n_new_file_size ) {
-	goto update_and_out ;
-    }
-
-    /* Update key to search for the last file item. */
-    set_cpu_key_k_offset (&s_item_key, n_file_size);
-
-    do  {
-	/* Cut or delete file item. */
-	n_deleted = reiserfs_cut_from_item(th, &s_search_path, &s_item_key, p_s_inode,  page, n_new_file_size);
-	if (n_deleted < 0) {
-	    reiserfs_warning (p_s_inode->i_sb, "vs-5665: reiserfs_do_truncate: reiserfs_cut_from_item failed");
-	    reiserfs_check_path(&s_search_path) ;
-	    return 0;
+	s_search_path.pos_in_item--;
+
+	/* Get real file size (total length of all file items) */
+	p_le_ih = PATH_PITEM_HEAD(&s_search_path);
+	if (is_statdata_le_ih(p_le_ih))
+		n_file_size = 0;
+	else {
+		loff_t offset = le_ih_k_offset(p_le_ih);
+		int bytes =
+		    op_bytes_number(p_le_ih, p_s_inode->i_sb->s_blocksize);
+
+		/* this may mismatch with real file size: if last direct item
+		   had no padding zeros and last unformatted node had no free
+		   space, this file would have this file size */
+		n_file_size = offset + bytes - 1;
+	}
+	/*
+	 * are we doing a full truncate or delete, if so
+	 * kick in the reada code
+	 */
+	if (n_new_file_size == 0)
+		s_search_path.reada = PATH_READA | PATH_READA_BACK;
+
+	if (n_file_size == 0 || n_file_size < n_new_file_size) {
+		goto update_and_out;
 	}
 
-	RFALSE( n_deleted > n_file_size,
-		"PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K",
-		n_deleted, n_file_size, &s_item_key);
+	/* Update key to search for the last file item. */
+	set_cpu_key_k_offset(&s_item_key, n_file_size);
+
+	do {
+		/* Cut or delete file item. */
+		n_deleted =
+		    reiserfs_cut_from_item(th, &s_search_path, &s_item_key,
+					   p_s_inode, page, n_new_file_size);
+		if (n_deleted < 0) {
+			reiserfs_warning(p_s_inode->i_sb,
+					 "vs-5665: reiserfs_do_truncate: reiserfs_cut_from_item failed");
+			reiserfs_check_path(&s_search_path);
+			return 0;
+		}
 
-	/* Change key to search the last file item. */
-	n_file_size -= n_deleted;
+		RFALSE(n_deleted > n_file_size,
+		       "PAP-5670: reiserfs_cut_from_item: too many bytes deleted: deleted %d, file_size %lu, item_key %K",
+		       n_deleted, n_file_size, &s_item_key);
 
-	set_cpu_key_k_offset (&s_item_key, n_file_size);
+		/* Change key to search the last file item. */
+		n_file_size -= n_deleted;
 
-	/* While there are bytes to truncate and previous file item is presented in the tree. */
+		set_cpu_key_k_offset(&s_item_key, n_file_size);
 
-	/*
-	** This loop could take a really long time, and could log 
-	** many more blocks than a transaction can hold.  So, we do a polite
-	** journal end here, and if the transaction needs ending, we make
-	** sure the file is consistent before ending the current trans
-	** and starting a new one
-	*/
-        if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
-	  int orig_len_alloc = th->t_blocks_allocated ;
-	  decrement_counters_in_path(&s_search_path) ;
-
-	  if (update_timestamps) {
-	      p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME_SEC;
-	  } 
-	  reiserfs_update_sd(th, p_s_inode) ;
-
-	  err = journal_end(th, p_s_inode->i_sb, orig_len_alloc) ;
-	  if (err)
-	    goto out;
-	  err = journal_begin (th, p_s_inode->i_sb,
-                               JOURNAL_PER_BALANCE_CNT * 6);
-	  if (err)
-	    goto out;
-	  reiserfs_update_inode_transaction(p_s_inode) ;
+		/* While there are bytes to truncate and the previous file item is present in the tree. */
+
+		/*
+		 ** This loop could take a really long time, and could log 
+		 ** many more blocks than a transaction can hold.  So, we do a polite
+		 ** journal end here, and if the transaction needs ending, we make
+		 ** sure the file is consistent before ending the current trans
+		 ** and starting a new one
+		 */
+		if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
+			int orig_len_alloc = th->t_blocks_allocated;
+			decrement_counters_in_path(&s_search_path);
+
+			if (update_timestamps) {
+				p_s_inode->i_mtime = p_s_inode->i_ctime =
+				    CURRENT_TIME_SEC;
+			}
+			reiserfs_update_sd(th, p_s_inode);
+
+			err = journal_end(th, p_s_inode->i_sb, orig_len_alloc);
+			if (err)
+				goto out;
+			err = journal_begin(th, p_s_inode->i_sb,
+					    JOURNAL_PER_BALANCE_CNT * 6);
+			if (err)
+				goto out;
+			reiserfs_update_inode_transaction(p_s_inode);
+		}
+	} while (n_file_size > ROUND_UP(n_new_file_size) &&
+		 search_for_position_by_key(p_s_inode->i_sb, &s_item_key,
+					    &s_search_path) == POSITION_FOUND);
+
+	RFALSE(n_file_size > ROUND_UP(n_new_file_size),
+	       "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d",
+	       n_new_file_size, n_file_size, s_item_key.on_disk_key.k_objectid);
+
+      update_and_out:
+	if (update_timestamps) {
+		// this is truncate, not file closing
+		p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME_SEC;
 	}
-    } while ( n_file_size > ROUND_UP (n_new_file_size) &&
-	      search_for_position_by_key(p_s_inode->i_sb, &s_item_key, &s_search_path) == POSITION_FOUND )  ;
-
-    RFALSE( n_file_size > ROUND_UP (n_new_file_size),
-	    "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d",
-	    n_new_file_size, n_file_size, s_item_key.on_disk_key.k_objectid);
-
-update_and_out:
-    if (update_timestamps) {
-	// this is truncate, not file closing
-	    p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME_SEC;
-    }
-    reiserfs_update_sd (th, p_s_inode);
-
-out:
-    pathrelse(&s_search_path) ;
-    return err;
-}
+	reiserfs_update_sd(th, p_s_inode);
 
+      out:
+	pathrelse(&s_search_path);
+	return err;
+}
 
 #ifdef CONFIG_REISERFS_CHECK
 // this makes sure, that we __append__, not overwrite or add holes
-static void check_research_for_paste (struct path * path, 
-				      const struct cpu_key * p_s_key)
+static void check_research_for_paste(struct path *path,
+				     const struct cpu_key *p_s_key)
 {
-    struct item_head * found_ih = get_ih (path);
-    
-    if (is_direct_le_ih (found_ih)) {
-	if (le_ih_k_offset (found_ih) + op_bytes_number (found_ih, get_last_bh (path)->b_size) !=
-	    cpu_key_k_offset (p_s_key) ||
-	    op_bytes_number (found_ih, get_last_bh (path)->b_size) != pos_in_item (path))
-	    reiserfs_panic (NULL, "PAP-5720: check_research_for_paste: "
-			    "found direct item %h or position (%d) does not match to key %K",
-			    found_ih, pos_in_item (path), p_s_key);
-    }
-    if (is_indirect_le_ih (found_ih)) {
-	if (le_ih_k_offset (found_ih) + op_bytes_number (found_ih, get_last_bh (path)->b_size) != cpu_key_k_offset (p_s_key) || 
-	    I_UNFM_NUM (found_ih) != pos_in_item (path) ||
-	    get_ih_free_space (found_ih) != 0)
-	    reiserfs_panic (NULL, "PAP-5730: check_research_for_paste: "
-			    "found indirect item (%h) or position (%d) does not match to key (%K)",
-			    found_ih, pos_in_item (path), p_s_key);
-    }
+	struct item_head *found_ih = get_ih(path);
+
+	if (is_direct_le_ih(found_ih)) {
+		if (le_ih_k_offset(found_ih) +
+		    op_bytes_number(found_ih,
+				    get_last_bh(path)->b_size) !=
+		    cpu_key_k_offset(p_s_key)
+		    || op_bytes_number(found_ih,
+				       get_last_bh(path)->b_size) !=
+		    pos_in_item(path))
+			reiserfs_panic(NULL,
+				       "PAP-5720: check_research_for_paste: "
+				       "found direct item %h or position (%d) does not match to key %K",
+				       found_ih, pos_in_item(path), p_s_key);
+	}
+	if (is_indirect_le_ih(found_ih)) {
+		if (le_ih_k_offset(found_ih) +
+		    op_bytes_number(found_ih,
+				    get_last_bh(path)->b_size) !=
+		    cpu_key_k_offset(p_s_key)
+		    || I_UNFM_NUM(found_ih) != pos_in_item(path)
+		    || get_ih_free_space(found_ih) != 0)
+			reiserfs_panic(NULL,
+				       "PAP-5730: check_research_for_paste: "
+				       "found indirect item (%h) or position (%d) does not match to key (%K)",
+				       found_ih, pos_in_item(path), p_s_key);
+	}
 }
-#endif /* config reiserfs check */
-
+#endif				/* config reiserfs check */
 
 /* Paste bytes to the existing item. Returns bytes number pasted into the item. */
-int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th, 
-			      struct path         * p_s_search_path,	/* Path to the pasted item.          */
-			      const struct cpu_key      * p_s_key,        	/* Key to search for the needed item.*/
-			      struct inode	  * inode,		/* Inode item belongs to */
-			      const char          * p_c_body,       	/* Pointer to the bytes to paste.    */
-			      int                   n_pasted_size)  	/* Size of pasted bytes.             */
-{
-    struct tree_balance s_paste_balance;
-    int                 retval;
-    int			fs_gen;
+int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct path *p_s_search_path,	/* Path to the pasted item.          */
+			     const struct cpu_key *p_s_key,	/* Key to search for the needed item. */
+			     struct inode *inode,	/* Inode item belongs to */
+			     const char *p_c_body,	/* Pointer to the bytes to paste.    */
+			     int n_pasted_size)
+{				/* Size of pasted bytes.             */
+	struct tree_balance s_paste_balance;
+	int retval;
+	int fs_gen;
+
+	BUG_ON(!th->t_trans_id);
 
-    BUG_ON (!th->t_trans_id);
-
-    fs_gen = get_generation(inode->i_sb) ;
+	fs_gen = get_generation(inode->i_sb);
 
 #ifdef REISERQUOTA_DEBUG
-    reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): allocating %u id=%u type=%c", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key)));
+	reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
+		       "reiserquota paste_into_item(): allocating %u id=%u type=%c",
+		       n_pasted_size, inode->i_uid,
+		       key2type(&(p_s_key->on_disk_key)));
 #endif
 
-    if (DQUOT_ALLOC_SPACE_NODIRTY(inode, n_pasted_size)) {
-	pathrelse(p_s_search_path);
-	return -EDQUOT;
-    }
-    init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path, n_pasted_size);
+	if (DQUOT_ALLOC_SPACE_NODIRTY(inode, n_pasted_size)) {
+		pathrelse(p_s_search_path);
+		return -EDQUOT;
+	}
+	init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path,
+		       n_pasted_size);
 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
-    s_paste_balance.key = p_s_key->on_disk_key;
+	s_paste_balance.key = p_s_key->on_disk_key;
 #endif
 
-    /* DQUOT_* can schedule, must check before the fix_nodes */
-    if (fs_changed(fs_gen, inode->i_sb)) {
-	goto search_again;
-    }
-
-    while ((retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, p_c_body)) ==
-REPEAT_SEARCH ) {
-search_again:
-	/* file system changed while we were in the fix_nodes */
-	PROC_INFO_INC( th -> t_super, paste_into_item_restarted );
-	retval = search_for_position_by_key (th->t_super, p_s_key, p_s_search_path);
-	if (retval == IO_ERROR) {
-	    retval = -EIO ;
-	    goto error_out ;
+	/* DQUOT_* can schedule, must check before the fix_nodes */
+	if (fs_changed(fs_gen, inode->i_sb)) {
+		goto search_again;
 	}
-	if (retval == POSITION_FOUND) {
-	    reiserfs_warning (inode->i_sb, "PAP-5710: reiserfs_paste_into_item: entry or pasted byte (%K) exists", p_s_key);
-	    retval = -EEXIST ;
-	    goto error_out ;
-	}
-	
+
+	while ((retval =
+		fix_nodes(M_PASTE, &s_paste_balance, NULL,
+			  p_c_body)) == REPEAT_SEARCH) {
+	      search_again:
+		/* file system changed while we were in the fix_nodes */
+		PROC_INFO_INC(th->t_super, paste_into_item_restarted);
+		retval =
+		    search_for_position_by_key(th->t_super, p_s_key,
+					       p_s_search_path);
+		if (retval == IO_ERROR) {
+			retval = -EIO;
+			goto error_out;
+		}
+		if (retval == POSITION_FOUND) {
+			reiserfs_warning(inode->i_sb,
+					 "PAP-5710: reiserfs_paste_into_item: entry or pasted byte (%K) exists",
+					 p_s_key);
+			retval = -EEXIST;
+			goto error_out;
+		}
 #ifdef CONFIG_REISERFS_CHECK
-	check_research_for_paste (p_s_search_path, p_s_key);
+		check_research_for_paste(p_s_search_path, p_s_key);
 #endif
-    }
+	}
 
-    /* Perform balancing after all resources are collected by fix_nodes, and
-       accessing them will not risk triggering schedule. */
-    if ( retval == CARRY_ON ) {
-	do_balance(&s_paste_balance, NULL/*ih*/, p_c_body, M_PASTE);
-	return 0;
-    }
-    retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
-error_out:
-    /* this also releases the path */
-    unfix_nodes(&s_paste_balance);
+	/* Perform balancing after all resources are collected by fix_nodes, and
+	   accessing them will not risk triggering schedule. */
+	if (retval == CARRY_ON) {
+		do_balance(&s_paste_balance, NULL /*ih */ , p_c_body, M_PASTE);
+		return 0;
+	}
+	retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
+      error_out:
+	/* this also releases the path */
+	unfix_nodes(&s_paste_balance);
 #ifdef REISERQUOTA_DEBUG
-    reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota paste_into_item(): freeing %u id=%u type=%c", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key)));
+	reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
+		       "reiserquota paste_into_item(): freeing %u id=%u type=%c",
+		       n_pasted_size, inode->i_uid,
+		       key2type(&(p_s_key->on_disk_key)));
 #endif
-    DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size);
-    return retval ;
+	DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size);
+	return retval;
 }
 
-
 /* Insert new item into the buffer at the path. */
-int reiserfs_insert_item(struct reiserfs_transaction_handle *th, 
-			 struct path         * 	p_s_path,         /* Path to the inserteded item.         */
-			 const struct cpu_key      * key,
-			 struct item_head    * 	p_s_ih,           /* Pointer to the item header to insert.*/
-			 struct inode        * inode,
-			 const char          * 	p_c_body)         /* Pointer to the bytes to insert.      */
-{
-    struct tree_balance s_ins_balance;
-    int                 retval;
-    int fs_gen = 0 ;
-    int quota_bytes = 0 ;
-
-    BUG_ON (!th->t_trans_id);
-
-    if (inode) {      /* Do we count quotas for item? */
-	fs_gen = get_generation(inode->i_sb);
-	quota_bytes = ih_item_len(p_s_ih);
-
-	/* hack so the quota code doesn't have to guess if the file has
-	 ** a tail, links are always tails, so there's no guessing needed
-	 */
-	if (!S_ISLNK (inode->i_mode) && is_direct_le_ih(p_s_ih)) {
-	    quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE ;
-	}
+int reiserfs_insert_item(struct reiserfs_transaction_handle *th, struct path *p_s_path,	/* Path to the inserteded item.         */
+			 const struct cpu_key *key, struct item_head *p_s_ih,	/* Pointer to the item header to insert. */
+			 struct inode *inode, const char *p_c_body)
+{				/* Pointer to the bytes to insert.      */
+	struct tree_balance s_ins_balance;
+	int retval;
+	int fs_gen = 0;
+	int quota_bytes = 0;
+
+	BUG_ON(!th->t_trans_id);
+
+	if (inode) {		/* Do we count quotas for item? */
+		fs_gen = get_generation(inode->i_sb);
+		quota_bytes = ih_item_len(p_s_ih);
+
+		/* hack so the quota code doesn't have to guess if the file has
+		 ** a tail, links are always tails, so there's no guessing needed
+		 */
+		if (!S_ISLNK(inode->i_mode) && is_direct_le_ih(p_s_ih)) {
+			quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE;
+		}
 #ifdef REISERQUOTA_DEBUG
-	reiserfs_debug (inode->i_sb, REISERFS_DEBUG_CODE, "reiserquota insert_item(): allocating %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(p_s_ih));
+		reiserfs_debug(inode->i_sb, REISERFS_DEBUG_CODE,
+			       "reiserquota insert_item(): allocating %u id=%u type=%c",
+			       quota_bytes, inode->i_uid, head2type(p_s_ih));
 #endif
-	/* We can't dirty inode here. It would be immediately written but
-	 * appropriate stat item isn't inserted yet... */
-	if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes)) {
-	    pathrelse(p_s_path);
-	    return -EDQUOT;
+		/* We can't dirty inode here. It would be immediately written but
+		 * appropriate stat item isn't inserted yet... */
+		if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes)) {
+			pathrelse(p_s_path);
+			return -EDQUOT;
+		}
 	}
-    }
-    init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path, IH_SIZE + ih_item_len(p_s_ih));
+	init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path,
+		       IH_SIZE + ih_item_len(p_s_ih));
 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
-    s_ins_balance.key = key->on_disk_key;
+	s_ins_balance.key = key->on_disk_key;
 #endif
-    /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */
-    if (inode && fs_changed(fs_gen, inode->i_sb)) {
-	goto search_again;
-    }
-
-    while ( (retval = fix_nodes(M_INSERT, &s_ins_balance, p_s_ih, p_c_body)) == REPEAT_SEARCH) {
-search_again:
-	/* file system changed while we were in the fix_nodes */
-	PROC_INFO_INC( th -> t_super, insert_item_restarted );
-	retval = search_item (th->t_super, key, p_s_path);
-	if (retval == IO_ERROR) {
-	    retval = -EIO;
-	    goto error_out ;
+	/* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */
+	if (inode && fs_changed(fs_gen, inode->i_sb)) {
+		goto search_again;
 	}
-	if (retval == ITEM_FOUND) {
-	    reiserfs_warning (th->t_super, "PAP-5760: reiserfs_insert_item: "
-			      "key %K already exists in the tree", key);
-	    retval = -EEXIST ;
-	    goto error_out; 
+
+	while ((retval =
+		fix_nodes(M_INSERT, &s_ins_balance, p_s_ih,
+			  p_c_body)) == REPEAT_SEARCH) {
+	      search_again:
+		/* file system changed while we were in the fix_nodes */
+		PROC_INFO_INC(th->t_super, insert_item_restarted);
+		retval = search_item(th->t_super, key, p_s_path);
+		if (retval == IO_ERROR) {
+			retval = -EIO;
+			goto error_out;
+		}
+		if (retval == ITEM_FOUND) {
+			reiserfs_warning(th->t_super,
+					 "PAP-5760: reiserfs_insert_item: "
+					 "key %K already exists in the tree",
+					 key);
+			retval = -EEXIST;
+			goto error_out;
+		}
 	}
-    }
 
-    /* make balancing after all resources will be collected at a time */ 
-    if ( retval == CARRY_ON ) {
-	do_balance (&s_ins_balance, p_s_ih, p_c_body, M_INSERT);
-	return 0;
-    }
+	/* make balancing after all resources will be collected at a time */
+	if (retval == CARRY_ON) {
+		do_balance(&s_ins_balance, p_s_ih, p_c_body, M_INSERT);
+		return 0;
+	}
 
-    retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
-error_out:
-    /* also releases the path */
-    unfix_nodes(&s_ins_balance);
+	retval = (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
+      error_out:
+	/* also releases the path */
+	unfix_nodes(&s_ins_balance);
 #ifdef REISERQUOTA_DEBUG
-    reiserfs_debug (th->t_super, REISERFS_DEBUG_CODE, "reiserquota insert_item(): freeing %u id=%u type=%c", quota_bytes, inode->i_uid, head2type(p_s_ih));
+	reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
+		       "reiserquota insert_item(): freeing %u id=%u type=%c",
+		       quota_bytes, inode->i_uid, head2type(p_s_ih));
 #endif
-    if (inode)
-	DQUOT_FREE_SPACE_NODIRTY(inode, quota_bytes) ;
-    return retval; 
+	if (inode)
+		DQUOT_FREE_SPACE_NODIRTY(inode, quota_bytes);
+	return retval;
 }
-
-
-
-
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 4b80ab95d338..6951c35755be 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -35,83 +35,81 @@ static const char reiserfs_3_5_magic_string[] = REISERFS_SUPER_MAGIC_STRING;
 static const char reiserfs_3_6_magic_string[] = REISER2FS_SUPER_MAGIC_STRING;
 static const char reiserfs_jr_magic_string[] = REISER2FS_JR_SUPER_MAGIC_STRING;
 
-int is_reiserfs_3_5 (struct reiserfs_super_block * rs)
+int is_reiserfs_3_5(struct reiserfs_super_block *rs)
 {
-  return !strncmp (rs->s_v1.s_magic, reiserfs_3_5_magic_string,
-		   strlen (reiserfs_3_5_magic_string));
+	return !strncmp(rs->s_v1.s_magic, reiserfs_3_5_magic_string,
+			strlen(reiserfs_3_5_magic_string));
 }
 
-
-int is_reiserfs_3_6 (struct reiserfs_super_block * rs)
+int is_reiserfs_3_6(struct reiserfs_super_block *rs)
 {
-  return !strncmp (rs->s_v1.s_magic, reiserfs_3_6_magic_string,
- 		   strlen (reiserfs_3_6_magic_string));
+	return !strncmp(rs->s_v1.s_magic, reiserfs_3_6_magic_string,
+			strlen(reiserfs_3_6_magic_string));
 }
 
-
-int is_reiserfs_jr (struct reiserfs_super_block * rs)
+int is_reiserfs_jr(struct reiserfs_super_block *rs)
 {
-  return !strncmp (rs->s_v1.s_magic, reiserfs_jr_magic_string,
- 		   strlen (reiserfs_jr_magic_string));
+	return !strncmp(rs->s_v1.s_magic, reiserfs_jr_magic_string,
+			strlen(reiserfs_jr_magic_string));
 }
 
-
-static int is_any_reiserfs_magic_string (struct reiserfs_super_block * rs)
+static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs)
 {
-  return (is_reiserfs_3_5 (rs) || is_reiserfs_3_6 (rs) ||
-	  is_reiserfs_jr (rs));
+	return (is_reiserfs_3_5(rs) || is_reiserfs_3_6(rs) ||
+		is_reiserfs_jr(rs));
 }
 
-static int reiserfs_remount (struct super_block * s, int * flags, char * data);
-static int reiserfs_statfs (struct super_block * s, struct kstatfs * buf);
+static int reiserfs_remount(struct super_block *s, int *flags, char *data);
+static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf);
 
-static int reiserfs_sync_fs (struct super_block * s, int wait)
+static int reiserfs_sync_fs(struct super_block *s, int wait)
 {
-    if (!(s->s_flags & MS_RDONLY)) {
-        struct reiserfs_transaction_handle th;
-	reiserfs_write_lock(s);
-	if (!journal_begin(&th, s, 1))
-            if (!journal_end_sync(&th, s, 1))
-                reiserfs_flush_old_commits(s);
-	s->s_dirt = 0; /* Even if it's not true.
-                        * We'll loop forever in sync_supers otherwise */
-	reiserfs_write_unlock(s);
-    } else {
-        s->s_dirt = 0;
-    }
-    return 0;
+	if (!(s->s_flags & MS_RDONLY)) {
+		struct reiserfs_transaction_handle th;
+		reiserfs_write_lock(s);
+		if (!journal_begin(&th, s, 1))
+			if (!journal_end_sync(&th, s, 1))
+				reiserfs_flush_old_commits(s);
+		s->s_dirt = 0;	/* Even if it's not true.
+				 * We'll loop forever in sync_supers otherwise */
+		reiserfs_write_unlock(s);
+	} else {
+		s->s_dirt = 0;
+	}
+	return 0;
 }
 
 static void reiserfs_write_super(struct super_block *s)
 {
-    reiserfs_sync_fs(s, 1);
+	reiserfs_sync_fs(s, 1);
 }
 
-static void reiserfs_write_super_lockfs (struct super_block * s)
+static void reiserfs_write_super_lockfs(struct super_block *s)
 {
-  struct reiserfs_transaction_handle th ;
-  reiserfs_write_lock(s);
-  if (!(s->s_flags & MS_RDONLY)) {
-    int err = journal_begin(&th, s, 1) ;
-    if (err) {
-        reiserfs_block_writes(&th) ;
-    } else {
-        reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
-        journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
-        reiserfs_block_writes(&th) ;
-        journal_end_sync(&th, s, 1) ;
-    }
-  }
-  s->s_dirt = 0;
-  reiserfs_write_unlock(s);
+	struct reiserfs_transaction_handle th;
+	reiserfs_write_lock(s);
+	if (!(s->s_flags & MS_RDONLY)) {
+		int err = journal_begin(&th, s, 1);
+		if (err) {
+			reiserfs_block_writes(&th);
+		} else {
+			reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
+						     1);
+			journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
+			reiserfs_block_writes(&th);
+			journal_end_sync(&th, s, 1);
+		}
+	}
+	s->s_dirt = 0;
+	reiserfs_write_unlock(s);
 }
 
-static void reiserfs_unlockfs(struct super_block *s) {
-  reiserfs_allow_writes(s) ;
+static void reiserfs_unlockfs(struct super_block *s)
+{
+	reiserfs_allow_writes(s);
 }
 
-extern const struct in_core_key  MAX_IN_CORE_KEY;
-
+extern const struct in_core_key MAX_IN_CORE_KEY;
 
 /* this is used to delete "save link" when there are no items of a
    file it points to. It can either happen if unlink is completed but
@@ -120,364 +118,387 @@ extern const struct in_core_key  MAX_IN_CORE_KEY;
    protecting unlink is bigger that a key lf "save link" which
    protects truncate), so there left no items to make truncate
    completion on */
-static int remove_save_link_only (struct super_block * s, struct reiserfs_key * key, int oid_free)
+static int remove_save_link_only(struct super_block *s,
+				 struct reiserfs_key *key, int oid_free)
 {
-    struct reiserfs_transaction_handle th;
-    int err;
-
-     /* we are going to do one balancing */
-     err = journal_begin (&th, s, JOURNAL_PER_BALANCE_CNT);
-     if (err)
-        return err;
- 
-     reiserfs_delete_solid_item (&th, NULL, key);
-     if (oid_free)
-        /* removals are protected by direct items */
-        reiserfs_release_objectid (&th, le32_to_cpu (key->k_objectid));
-
-     return journal_end (&th, s, JOURNAL_PER_BALANCE_CNT);
+	struct reiserfs_transaction_handle th;
+	int err;
+
+	/* we are going to do one balancing */
+	err = journal_begin(&th, s, JOURNAL_PER_BALANCE_CNT);
+	if (err)
+		return err;
+
+	reiserfs_delete_solid_item(&th, NULL, key);
+	if (oid_free)
+		/* removals are protected by direct items */
+		reiserfs_release_objectid(&th, le32_to_cpu(key->k_objectid));
+
+	return journal_end(&th, s, JOURNAL_PER_BALANCE_CNT);
 }
- 
+
 #ifdef CONFIG_QUOTA
 static int reiserfs_quota_on_mount(struct super_block *, int);
 #endif
- 
+
 /* look for uncompleted unlinks and truncates and complete them */
-static int finish_unfinished (struct super_block * s)
+static int finish_unfinished(struct super_block *s)
 {
-    INITIALIZE_PATH (path);
-    struct cpu_key max_cpu_key, obj_key;
-    struct reiserfs_key save_link_key;
-    int retval = 0;
-    struct item_head * ih;
-    struct buffer_head * bh;
-    int item_pos;
-    char * item;
-    int done;
-    struct inode * inode;
-    int truncate;
+	INITIALIZE_PATH(path);
+	struct cpu_key max_cpu_key, obj_key;
+	struct reiserfs_key save_link_key;
+	int retval = 0;
+	struct item_head *ih;
+	struct buffer_head *bh;
+	int item_pos;
+	char *item;
+	int done;
+	struct inode *inode;
+	int truncate;
 #ifdef CONFIG_QUOTA
-    int i;
-    int ms_active_set;
+	int i;
+	int ms_active_set;
 #endif
- 
- 
-    /* compose key to look for "save" links */
-    max_cpu_key.version = KEY_FORMAT_3_5;
-    max_cpu_key.on_disk_key.k_dir_id = ~0U;
-    max_cpu_key.on_disk_key.k_objectid = ~0U;
-    set_cpu_key_k_offset (&max_cpu_key, ~0U);
-    max_cpu_key.key_length = 3;
+
+	/* compose key to look for "save" links */
+	max_cpu_key.version = KEY_FORMAT_3_5;
+	max_cpu_key.on_disk_key.k_dir_id = ~0U;
+	max_cpu_key.on_disk_key.k_objectid = ~0U;
+	set_cpu_key_k_offset(&max_cpu_key, ~0U);
+	max_cpu_key.key_length = 3;
 
 #ifdef CONFIG_QUOTA
-    /* Needed for iput() to work correctly and not trash data */
-    if (s->s_flags & MS_ACTIVE) {
-	    ms_active_set = 0;
-    } else {
-	    ms_active_set = 1;
-	    s->s_flags |= MS_ACTIVE;
-    }
-    /* Turn on quotas so that they are updated correctly */
-    for (i = 0; i < MAXQUOTAS; i++) {
-	if (REISERFS_SB(s)->s_qf_names[i]) {
-	    int ret = reiserfs_quota_on_mount(s, i);
-	    if (ret < 0)
-		reiserfs_warning(s, "reiserfs: cannot turn on journalled quota: error %d", ret);
-	}
-    }
+	/* Needed for iput() to work correctly and not trash data */
+	if (s->s_flags & MS_ACTIVE) {
+		ms_active_set = 0;
+	} else {
+		ms_active_set = 1;
+		s->s_flags |= MS_ACTIVE;
+	}
+	/* Turn on quotas so that they are updated correctly */
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if (REISERFS_SB(s)->s_qf_names[i]) {
+			int ret = reiserfs_quota_on_mount(s, i);
+			if (ret < 0)
+				reiserfs_warning(s,
+						 "reiserfs: cannot turn on journalled quota: error %d",
+						 ret);
+		}
+	}
 #endif
- 
-    done = 0;
-    REISERFS_SB(s)->s_is_unlinked_ok = 1;
-    while (!retval) {
-        retval = search_item (s, &max_cpu_key, &path);
-        if (retval != ITEM_NOT_FOUND) {
-            reiserfs_warning (s, "vs-2140: finish_unfinished: search_by_key returned %d",
-                              retval);
-            break;
-        }
-        
-        bh = get_last_bh (&path);
-        item_pos = get_item_pos (&path);
-        if (item_pos != B_NR_ITEMS (bh)) {
-            reiserfs_warning (s, "vs-2060: finish_unfinished: wrong position found");
-            break;
-        }
-        item_pos --;
-        ih = B_N_PITEM_HEAD (bh, item_pos);
- 
-        if (le32_to_cpu (ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID)
-            /* there are no "save" links anymore */
-            break;
- 
-        save_link_key = ih->ih_key;
-        if (is_indirect_le_ih (ih))
-            truncate = 1;
-        else
-            truncate = 0;
- 
-        /* reiserfs_iget needs k_dirid and k_objectid only */
-        item = B_I_PITEM (bh, ih);
-        obj_key.on_disk_key.k_dir_id = le32_to_cpu (*(__le32 *)item);
-        obj_key.on_disk_key.k_objectid = le32_to_cpu (ih->ih_key.k_objectid);
-	obj_key.on_disk_key.k_offset = 0;
-	obj_key.on_disk_key.k_type = 0;
-	
-        pathrelse (&path);
- 
-        inode = reiserfs_iget (s, &obj_key);
-        if (!inode) {
-            /* the unlink almost completed, it just did not manage to remove
-	       "save" link and release objectid */
-            reiserfs_warning (s, "vs-2180: finish_unfinished: iget failed for %K",
-                              &obj_key);
-            retval = remove_save_link_only (s, &save_link_key, 1);
-            continue;
-        }
-
-	if (!truncate && inode->i_nlink) {
-	    /* file is not unlinked */
-            reiserfs_warning (s, "vs-2185: finish_unfinished: file %K is not unlinked",
-                              &obj_key);
-            retval = remove_save_link_only (s, &save_link_key, 0);
-            continue;
-	}
-	DQUOT_INIT(inode);
-
-	if (truncate && S_ISDIR (inode->i_mode) ) {
-	    /* We got a truncate request for a dir which is impossible.
-	       The only imaginable way is to execute unfinished truncate request
-	       then boot into old kernel, remove the file and create dir with
-	       the same key. */
-	    reiserfs_warning(s, "green-2101: impossible truncate on a directory %k. Please report", INODE_PKEY (inode));
-	    retval = remove_save_link_only (s, &save_link_key, 0);
-	    truncate = 0;
-	    iput (inode); 
-	    continue;
-	}
- 
-        if (truncate) {
-            REISERFS_I(inode) -> i_flags |= i_link_saved_truncate_mask;
-            /* not completed truncate found. New size was committed together
-	       with "save" link */
-            reiserfs_info (s, "Truncating %k to %Ld ..",
-                              INODE_PKEY (inode), inode->i_size);
-            reiserfs_truncate_file (inode, 0/*don't update modification time*/);
-            retval = remove_save_link (inode, truncate);
-        } else {
-            REISERFS_I(inode) -> i_flags |= i_link_saved_unlink_mask;
-            /* not completed unlink (rmdir) found */
-            reiserfs_info (s, "Removing %k..", INODE_PKEY (inode));
-            /* removal gets completed in iput */
-            retval = 0;
-        }
- 
-        iput (inode);
-        printk ("done\n");
-        done ++;
-    }
-    REISERFS_SB(s)->s_is_unlinked_ok = 0;
-     
+
+	done = 0;
+	REISERFS_SB(s)->s_is_unlinked_ok = 1;
+	while (!retval) {
+		retval = search_item(s, &max_cpu_key, &path);
+		if (retval != ITEM_NOT_FOUND) {
+			reiserfs_warning(s,
+					 "vs-2140: finish_unfinished: search_by_key returned %d",
+					 retval);
+			break;
+		}
+
+		bh = get_last_bh(&path);
+		item_pos = get_item_pos(&path);
+		if (item_pos != B_NR_ITEMS(bh)) {
+			reiserfs_warning(s,
+					 "vs-2060: finish_unfinished: wrong position found");
+			break;
+		}
+		item_pos--;
+		ih = B_N_PITEM_HEAD(bh, item_pos);
+
+		if (le32_to_cpu(ih->ih_key.k_dir_id) != MAX_KEY_OBJECTID)
+			/* there are no "save" links anymore */
+			break;
+
+		save_link_key = ih->ih_key;
+		if (is_indirect_le_ih(ih))
+			truncate = 1;
+		else
+			truncate = 0;
+
+		/* reiserfs_iget needs k_dirid and k_objectid only */
+		item = B_I_PITEM(bh, ih);
+		obj_key.on_disk_key.k_dir_id = le32_to_cpu(*(__le32 *) item);
+		obj_key.on_disk_key.k_objectid =
+		    le32_to_cpu(ih->ih_key.k_objectid);
+		obj_key.on_disk_key.k_offset = 0;
+		obj_key.on_disk_key.k_type = 0;
+
+		pathrelse(&path);
+
+		inode = reiserfs_iget(s, &obj_key);
+		if (!inode) {
+			/* the unlink almost completed, it just did not manage to remove
+			   "save" link and release objectid */
+			reiserfs_warning(s,
+					 "vs-2180: finish_unfinished: iget failed for %K",
+					 &obj_key);
+			retval = remove_save_link_only(s, &save_link_key, 1);
+			continue;
+		}
+
+		if (!truncate && inode->i_nlink) {
+			/* file is not unlinked */
+			reiserfs_warning(s,
+					 "vs-2185: finish_unfinished: file %K is not unlinked",
+					 &obj_key);
+			retval = remove_save_link_only(s, &save_link_key, 0);
+			continue;
+		}
+		DQUOT_INIT(inode);
+
+		if (truncate && S_ISDIR(inode->i_mode)) {
+			/* We got a truncate request for a dir which is impossible.
+			   The only imaginable way is to execute unfinished truncate request
+			   then boot into old kernel, remove the file and create dir with
+			   the same key. */
+			reiserfs_warning(s,
+					 "green-2101: impossible truncate on a directory %k. Please report",
+					 INODE_PKEY(inode));
+			retval = remove_save_link_only(s, &save_link_key, 0);
+			truncate = 0;
+			iput(inode);
+			continue;
+		}
+
+		if (truncate) {
+			REISERFS_I(inode)->i_flags |=
+			    i_link_saved_truncate_mask;
+			/* not completed truncate found. New size was committed together
+			   with "save" link */
+			reiserfs_info(s, "Truncating %k to %Ld ..",
+				      INODE_PKEY(inode), inode->i_size);
+			reiserfs_truncate_file(inode,
+					       0
+					       /*don't update modification time */
+					       );
+			retval = remove_save_link(inode, truncate);
+		} else {
+			REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask;
+			/* not completed unlink (rmdir) found */
+			reiserfs_info(s, "Removing %k..", INODE_PKEY(inode));
+			/* removal gets completed in iput */
+			retval = 0;
+		}
+
+		iput(inode);
+		printk("done\n");
+		done++;
+	}
+	REISERFS_SB(s)->s_is_unlinked_ok = 0;
+
 #ifdef CONFIG_QUOTA
-    /* Turn quotas off */
-    for (i = 0; i < MAXQUOTAS; i++) {
-            if (sb_dqopt(s)->files[i])
-                    vfs_quota_off_mount(s, i);
-    }
-    if (ms_active_set)
-	    /* Restore the flag back */
-	    s->s_flags &= ~MS_ACTIVE;
+	/* Turn quotas off */
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if (sb_dqopt(s)->files[i])
+			vfs_quota_off_mount(s, i);
+	}
+	if (ms_active_set)
+		/* Restore the flag back */
+		s->s_flags &= ~MS_ACTIVE;
 #endif
-    pathrelse (&path);
-    if (done)
-        reiserfs_info (s, "There were %d uncompleted unlinks/truncates. "
-                          "Completed\n", done);
-    return retval;
+	pathrelse(&path);
+	if (done)
+		reiserfs_info(s, "There were %d uncompleted unlinks/truncates. "
+			      "Completed\n", done);
+	return retval;
 }
- 
+
 /* to protect file being unlinked from getting lost we "safe" link files
    being unlinked. This link will be deleted in the same transaction with last
    item of file. mounting the filesytem we scan all these links and remove
    files which almost got lost */
-void add_save_link (struct reiserfs_transaction_handle * th,
-		    struct inode * inode, int truncate)
+void add_save_link(struct reiserfs_transaction_handle *th,
+		   struct inode *inode, int truncate)
 {
-    INITIALIZE_PATH (path);
-    int retval;
-    struct cpu_key key;
-    struct item_head ih;
-    __le32 link;
-
-    BUG_ON (!th->t_trans_id);
-
-    /* file can only get one "save link" of each kind */
-    RFALSE( truncate && 
-	    ( REISERFS_I(inode) -> i_flags & i_link_saved_truncate_mask ),
-	    "saved link already exists for truncated inode %lx",
-	    ( long ) inode -> i_ino );
-    RFALSE( !truncate && 
-	    ( REISERFS_I(inode) -> i_flags & i_link_saved_unlink_mask ),
-	    "saved link already exists for unlinked inode %lx",
-	    ( long ) inode -> i_ino );
-
-    /* setup key of "save" link */
-    key.version = KEY_FORMAT_3_5;
-    key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID;
-    key.on_disk_key.k_objectid = inode->i_ino;
-    if (!truncate) {
-	/* unlink, rmdir, rename */
-	set_cpu_key_k_offset (&key, 1 + inode->i_sb->s_blocksize);
-	set_cpu_key_k_type (&key, TYPE_DIRECT);
-
-	/* item head of "safe" link */
-	make_le_item_head (&ih, &key, key.version, 1 + inode->i_sb->s_blocksize, TYPE_DIRECT,
-			   4/*length*/, 0xffff/*free space*/);
-    } else {
-	/* truncate */
-	if (S_ISDIR (inode->i_mode))
-	    reiserfs_warning(inode->i_sb, "green-2102: Adding a truncate savelink for a directory %k! Please report", INODE_PKEY(inode));
-	set_cpu_key_k_offset (&key, 1);
-	set_cpu_key_k_type (&key, TYPE_INDIRECT);
-
-	/* item head of "safe" link */
-	make_le_item_head (&ih, &key, key.version, 1, TYPE_INDIRECT,
-			   4/*length*/, 0/*free space*/);
-    }
-    key.key_length = 3;
-
-    /* look for its place in the tree */
-    retval = search_item (inode->i_sb, &key, &path);
-    if (retval != ITEM_NOT_FOUND) {
-	if ( retval != -ENOSPC )
-	    reiserfs_warning (inode->i_sb, "vs-2100: add_save_link:"
-			  "search_by_key (%K) returned %d", &key, retval);
-	pathrelse (&path);
-	return;
-    }
-
-    /* body of "save" link */
-    link = INODE_PKEY (inode)->k_dir_id;
-
-    /* put "save" link inot tree, don't charge quota to anyone */
-    retval = reiserfs_insert_item (th, &path, &key, &ih, NULL, (char *)&link);
-    if (retval) {
-	if (retval != -ENOSPC)
-	    reiserfs_warning (inode->i_sb, "vs-2120: add_save_link: insert_item returned %d",
-			  retval);
-    } else {
-	if( truncate )
-	    REISERFS_I(inode) -> i_flags |= i_link_saved_truncate_mask;
-	else
-	    REISERFS_I(inode) -> i_flags |= i_link_saved_unlink_mask;
-    }
-}
+	INITIALIZE_PATH(path);
+	int retval;
+	struct cpu_key key;
+	struct item_head ih;
+	__le32 link;
+
+	BUG_ON(!th->t_trans_id);
+
+	/* file can only get one "save link" of each kind */
+	RFALSE(truncate &&
+	       (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask),
+	       "saved link already exists for truncated inode %lx",
+	       (long)inode->i_ino);
+	RFALSE(!truncate &&
+	       (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask),
+	       "saved link already exists for unlinked inode %lx",
+	       (long)inode->i_ino);
+
+	/* setup key of "save" link */
+	key.version = KEY_FORMAT_3_5;
+	key.on_disk_key.k_dir_id = MAX_KEY_OBJECTID;
+	key.on_disk_key.k_objectid = inode->i_ino;
+	if (!truncate) {
+		/* unlink, rmdir, rename */
+		set_cpu_key_k_offset(&key, 1 + inode->i_sb->s_blocksize);
+		set_cpu_key_k_type(&key, TYPE_DIRECT);
+
+		/* item head of "safe" link */
+		make_le_item_head(&ih, &key, key.version,
+				  1 + inode->i_sb->s_blocksize, TYPE_DIRECT,
+				  4 /*length */ , 0xffff /*free space */ );
+	} else {
+		/* truncate */
+		if (S_ISDIR(inode->i_mode))
+			reiserfs_warning(inode->i_sb,
+					 "green-2102: Adding a truncate savelink for a directory %k! Please report",
+					 INODE_PKEY(inode));
+		set_cpu_key_k_offset(&key, 1);
+		set_cpu_key_k_type(&key, TYPE_INDIRECT);
+
+		/* item head of "safe" link */
+		make_le_item_head(&ih, &key, key.version, 1, TYPE_INDIRECT,
+				  4 /*length */ , 0 /*free space */ );
+	}
+	key.key_length = 3;
+
+	/* look for its place in the tree */
+	retval = search_item(inode->i_sb, &key, &path);
+	if (retval != ITEM_NOT_FOUND) {
+		if (retval != -ENOSPC)
+			reiserfs_warning(inode->i_sb, "vs-2100: add_save_link:"
+					 "search_by_key (%K) returned %d", &key,
+					 retval);
+		pathrelse(&path);
+		return;
+	}
 
+	/* body of "save" link */
+	link = INODE_PKEY(inode)->k_dir_id;
+
+	/* put "save" link inot tree, don't charge quota to anyone */
+	retval =
+	    reiserfs_insert_item(th, &path, &key, &ih, NULL, (char *)&link);
+	if (retval) {
+		if (retval != -ENOSPC)
+			reiserfs_warning(inode->i_sb,
+					 "vs-2120: add_save_link: insert_item returned %d",
+					 retval);
+	} else {
+		if (truncate)
+			REISERFS_I(inode)->i_flags |=
+			    i_link_saved_truncate_mask;
+		else
+			REISERFS_I(inode)->i_flags |= i_link_saved_unlink_mask;
+	}
+}
 
 /* this opens transaction unlike add_save_link */
-int remove_save_link (struct inode * inode, int truncate)
+int remove_save_link(struct inode *inode, int truncate)
 {
-    struct reiserfs_transaction_handle th;
-    struct reiserfs_key key;
-    int err;
- 
-    /* we are going to do one balancing only */
-    err = journal_begin (&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
-    if (err)
-        return err;
- 
-    /* setup key of "save" link */
-    key.k_dir_id = cpu_to_le32 (MAX_KEY_OBJECTID);
-    key.k_objectid = INODE_PKEY (inode)->k_objectid;
-    if (!truncate) {
-        /* unlink, rmdir, rename */
-        set_le_key_k_offset (KEY_FORMAT_3_5, &key,
-			     1 + inode->i_sb->s_blocksize);
-        set_le_key_k_type (KEY_FORMAT_3_5, &key, TYPE_DIRECT);
-    } else {
-        /* truncate */
-        set_le_key_k_offset (KEY_FORMAT_3_5, &key, 1);
-        set_le_key_k_type (KEY_FORMAT_3_5, &key, TYPE_INDIRECT);
-    }
- 
-    if( ( truncate && 
-          ( REISERFS_I(inode) -> i_flags & i_link_saved_truncate_mask ) ) ||
-        ( !truncate && 
-          ( REISERFS_I(inode) -> i_flags & i_link_saved_unlink_mask ) ) )
-	/* don't take quota bytes from anywhere */
-	reiserfs_delete_solid_item (&th, NULL, &key);
-    if (!truncate) {
-	reiserfs_release_objectid (&th, inode->i_ino);
-	REISERFS_I(inode) -> i_flags &= ~i_link_saved_unlink_mask;
-    } else
-	REISERFS_I(inode) -> i_flags &= ~i_link_saved_truncate_mask;
- 
-    return journal_end (&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
-}
+	struct reiserfs_transaction_handle th;
+	struct reiserfs_key key;
+	int err;
+
+	/* we are going to do one balancing only */
+	err = journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
+	if (err)
+		return err;
+
+	/* setup key of "save" link */
+	key.k_dir_id = cpu_to_le32(MAX_KEY_OBJECTID);
+	key.k_objectid = INODE_PKEY(inode)->k_objectid;
+	if (!truncate) {
+		/* unlink, rmdir, rename */
+		set_le_key_k_offset(KEY_FORMAT_3_5, &key,
+				    1 + inode->i_sb->s_blocksize);
+		set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_DIRECT);
+	} else {
+		/* truncate */
+		set_le_key_k_offset(KEY_FORMAT_3_5, &key, 1);
+		set_le_key_k_type(KEY_FORMAT_3_5, &key, TYPE_INDIRECT);
+	}
 
+	if ((truncate &&
+	     (REISERFS_I(inode)->i_flags & i_link_saved_truncate_mask)) ||
+	    (!truncate &&
+	     (REISERFS_I(inode)->i_flags & i_link_saved_unlink_mask)))
+		/* don't take quota bytes from anywhere */
+		reiserfs_delete_solid_item(&th, NULL, &key);
+	if (!truncate) {
+		reiserfs_release_objectid(&th, inode->i_ino);
+		REISERFS_I(inode)->i_flags &= ~i_link_saved_unlink_mask;
+	} else
+		REISERFS_I(inode)->i_flags &= ~i_link_saved_truncate_mask;
+
+	return journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT);
+}
 
-static void reiserfs_put_super (struct super_block * s)
+static void reiserfs_put_super(struct super_block *s)
 {
-  int i;
-  struct reiserfs_transaction_handle th ;
-  th.t_trans_id = 0;
-
-  if (REISERFS_SB(s)->xattr_root) {
-    d_invalidate (REISERFS_SB(s)->xattr_root);
-    dput (REISERFS_SB(s)->xattr_root);
-  }
-  
-  if (REISERFS_SB(s)->priv_root) {
-    d_invalidate (REISERFS_SB(s)->priv_root);
-    dput (REISERFS_SB(s)->priv_root);
-  }
-
-  /* change file system state to current state if it was mounted with read-write permissions */
-  if (!(s->s_flags & MS_RDONLY)) {
-    if (!journal_begin(&th, s, 10)) {
-        reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
-        set_sb_umount_state( SB_DISK_SUPER_BLOCK(s), REISERFS_SB(s)->s_mount_state );
-        journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
-    }
-  }
-
-  /* note, journal_release checks for readonly mount, and can decide not
-  ** to do a journal_end
-  */
-  journal_release(&th, s) ;
-
-  for (i = 0; i < SB_BMAP_NR (s); i ++)
-    brelse (SB_AP_BITMAP (s)[i].bh);
-
-  vfree (SB_AP_BITMAP (s));
-
-  brelse (SB_BUFFER_WITH_SB (s));
-
-  print_statistics (s);
-
-  if (REISERFS_SB(s)->s_kmallocs != 0) {
-    reiserfs_warning (s, "vs-2004: reiserfs_put_super: allocated memory left %d",
-		      REISERFS_SB(s)->s_kmallocs);
-  }
-
-  if (REISERFS_SB(s)->reserved_blocks != 0) {
-    reiserfs_warning (s, "green-2005: reiserfs_put_super: reserved blocks left %d",
-		      REISERFS_SB(s)->reserved_blocks);
-  }
-
-  reiserfs_proc_info_done( s );
-
-  kfree(s->s_fs_info);
-  s->s_fs_info = NULL;
-
-  return;
+	int i;
+	struct reiserfs_transaction_handle th;
+	th.t_trans_id = 0;
+
+	if (REISERFS_SB(s)->xattr_root) {
+		d_invalidate(REISERFS_SB(s)->xattr_root);
+		dput(REISERFS_SB(s)->xattr_root);
+	}
+
+	if (REISERFS_SB(s)->priv_root) {
+		d_invalidate(REISERFS_SB(s)->priv_root);
+		dput(REISERFS_SB(s)->priv_root);
+	}
+
+	/* change file system state to current state if it was mounted with read-write permissions */
+	if (!(s->s_flags & MS_RDONLY)) {
+		if (!journal_begin(&th, s, 10)) {
+			reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s),
+						     1);
+			set_sb_umount_state(SB_DISK_SUPER_BLOCK(s),
+					    REISERFS_SB(s)->s_mount_state);
+			journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
+		}
+	}
+
+	/* note, journal_release checks for readonly mount, and can decide not
+	 ** to do a journal_end
+	 */
+	journal_release(&th, s);
+
+	for (i = 0; i < SB_BMAP_NR(s); i++)
+		brelse(SB_AP_BITMAP(s)[i].bh);
+
+	vfree(SB_AP_BITMAP(s));
+
+	brelse(SB_BUFFER_WITH_SB(s));
+
+	print_statistics(s);
+
+	if (REISERFS_SB(s)->s_kmallocs != 0) {
+		reiserfs_warning(s,
+				 "vs-2004: reiserfs_put_super: allocated memory left %d",
+				 REISERFS_SB(s)->s_kmallocs);
+	}
+
+	if (REISERFS_SB(s)->reserved_blocks != 0) {
+		reiserfs_warning(s,
+				 "green-2005: reiserfs_put_super: reserved blocks left %d",
+				 REISERFS_SB(s)->reserved_blocks);
+	}
+
+	reiserfs_proc_info_done(s);
+
+	kfree(s->s_fs_info);
+	s->s_fs_info = NULL;
+
+	return;
 }
 
-static kmem_cache_t * reiserfs_inode_cachep;
+static kmem_cache_t *reiserfs_inode_cachep;
 
 static struct inode *reiserfs_alloc_inode(struct super_block *sb)
 {
 	struct reiserfs_inode_info *ei;
-	ei = (struct reiserfs_inode_info *)kmem_cache_alloc(reiserfs_inode_cachep, SLAB_KERNEL);
+	ei = (struct reiserfs_inode_info *)
+	    kmem_cache_alloc(reiserfs_inode_cachep, SLAB_KERNEL);
 	if (!ei)
 		return NULL;
 	return &ei->vfs_inode;
@@ -488,25 +509,26 @@ static void reiserfs_destroy_inode(struct inode *inode)
 	kmem_cache_free(reiserfs_inode_cachep, REISERFS_I(inode));
 }
 
-static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
+static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags)
 {
-	struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *) foo;
+	struct reiserfs_inode_info *ei = (struct reiserfs_inode_info *)foo;
 
-	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
 	    SLAB_CTOR_CONSTRUCTOR) {
-		INIT_LIST_HEAD(&ei->i_prealloc_list) ;
+		INIT_LIST_HEAD(&ei->i_prealloc_list);
 		inode_init_once(&ei->vfs_inode);
 		ei->i_acl_access = NULL;
 		ei->i_acl_default = NULL;
 	}
 }
- 
+
 static int init_inodecache(void)
 {
 	reiserfs_inode_cachep = kmem_cache_create("reiser_inode_cache",
-					     sizeof(struct reiserfs_inode_info),
-					     0, SLAB_RECLAIM_ACCOUNT,
-					     init_once, NULL);
+						  sizeof(struct
+							 reiserfs_inode_info),
+						  0, SLAB_RECLAIM_ACCOUNT,
+						  init_once, NULL);
 	if (reiserfs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
@@ -515,72 +537,76 @@ static int init_inodecache(void)
 static void destroy_inodecache(void)
 {
 	if (kmem_cache_destroy(reiserfs_inode_cachep))
-		reiserfs_warning (NULL, "reiserfs_inode_cache: not all structures were freed");
+		reiserfs_warning(NULL,
+				 "reiserfs_inode_cache: not all structures were freed");
 }
 
 /* we don't mark inodes dirty, we just log them */
-static void reiserfs_dirty_inode (struct inode * inode) {
-    struct reiserfs_transaction_handle th ;
-
-    int err = 0;
-    if (inode->i_sb->s_flags & MS_RDONLY) {
-        reiserfs_warning(inode->i_sb, "clm-6006: writing inode %lu on readonly FS",
-	                  inode->i_ino) ;
-        return ;
-    }
-    reiserfs_write_lock(inode->i_sb);
-
-    /* this is really only used for atime updates, so they don't have
-    ** to be included in O_SYNC or fsync
-    */
-    err = journal_begin(&th, inode->i_sb, 1) ;
-    if (err) {
-        reiserfs_write_unlock (inode->i_sb);
-        return;
-    }
-    reiserfs_update_sd (&th, inode);
-    journal_end(&th, inode->i_sb, 1) ;
-    reiserfs_write_unlock(inode->i_sb);
+static void reiserfs_dirty_inode(struct inode *inode)
+{
+	struct reiserfs_transaction_handle th;
+
+	int err = 0;
+	if (inode->i_sb->s_flags & MS_RDONLY) {
+		reiserfs_warning(inode->i_sb,
+				 "clm-6006: writing inode %lu on readonly FS",
+				 inode->i_ino);
+		return;
+	}
+	reiserfs_write_lock(inode->i_sb);
+
+	/* this is really only used for atime updates, so they don't have
+	 ** to be included in O_SYNC or fsync
+	 */
+	err = journal_begin(&th, inode->i_sb, 1);
+	if (err) {
+		reiserfs_write_unlock(inode->i_sb);
+		return;
+	}
+	reiserfs_update_sd(&th, inode);
+	journal_end(&th, inode->i_sb, 1);
+	reiserfs_write_unlock(inode->i_sb);
 }
 
-static void reiserfs_clear_inode (struct inode *inode)
+static void reiserfs_clear_inode(struct inode *inode)
 {
-    struct posix_acl *acl;
+	struct posix_acl *acl;
 
-    acl = REISERFS_I(inode)->i_acl_access;
-    if (acl && !IS_ERR (acl))
-        posix_acl_release (acl);
-    REISERFS_I(inode)->i_acl_access = NULL;
+	acl = REISERFS_I(inode)->i_acl_access;
+	if (acl && !IS_ERR(acl))
+		posix_acl_release(acl);
+	REISERFS_I(inode)->i_acl_access = NULL;
 
-    acl = REISERFS_I(inode)->i_acl_default;
-    if (acl && !IS_ERR (acl))
-        posix_acl_release (acl);
-    REISERFS_I(inode)->i_acl_default = NULL;
+	acl = REISERFS_I(inode)->i_acl_default;
+	if (acl && !IS_ERR(acl))
+		posix_acl_release(acl);
+	REISERFS_I(inode)->i_acl_default = NULL;
 }
 
 #ifdef CONFIG_QUOTA
-static ssize_t reiserfs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
-static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t, loff_t);
+static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
+				    size_t, loff_t);
+static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t,
+				   loff_t);
 #endif
 
-static struct super_operations reiserfs_sops =
-{
-  .alloc_inode = reiserfs_alloc_inode,
-  .destroy_inode = reiserfs_destroy_inode,
-  .write_inode = reiserfs_write_inode,
-  .dirty_inode = reiserfs_dirty_inode,
-  .delete_inode = reiserfs_delete_inode,
-  .clear_inode  = reiserfs_clear_inode,
-  .put_super = reiserfs_put_super,
-  .write_super = reiserfs_write_super,
-  .sync_fs = reiserfs_sync_fs,
-  .write_super_lockfs = reiserfs_write_super_lockfs,
-  .unlockfs = reiserfs_unlockfs,
-  .statfs = reiserfs_statfs,
-  .remount_fs = reiserfs_remount,
+static struct super_operations reiserfs_sops = {
+	.alloc_inode = reiserfs_alloc_inode,
+	.destroy_inode = reiserfs_destroy_inode,
+	.write_inode = reiserfs_write_inode,
+	.dirty_inode = reiserfs_dirty_inode,
+	.delete_inode = reiserfs_delete_inode,
+	.clear_inode = reiserfs_clear_inode,
+	.put_super = reiserfs_put_super,
+	.write_super = reiserfs_write_super,
+	.sync_fs = reiserfs_sync_fs,
+	.write_super_lockfs = reiserfs_write_super_lockfs,
+	.unlockfs = reiserfs_unlockfs,
+	.statfs = reiserfs_statfs,
+	.remount_fs = reiserfs_remount,
 #ifdef CONFIG_QUOTA
-  .quota_read = reiserfs_quota_read,
-  .quota_write = reiserfs_quota_write,
+	.quota_read = reiserfs_quota_read,
+	.quota_write = reiserfs_quota_write,
 #endif
 };
 
@@ -596,50 +622,48 @@ static int reiserfs_mark_dquot_dirty(struct dquot *);
 static int reiserfs_write_info(struct super_block *, int);
 static int reiserfs_quota_on(struct super_block *, int, int, char *);
 
-static struct dquot_operations reiserfs_quota_operations =
-{
-  .initialize = reiserfs_dquot_initialize,
-  .drop = reiserfs_dquot_drop,
-  .alloc_space = dquot_alloc_space,
-  .alloc_inode = dquot_alloc_inode,
-  .free_space = dquot_free_space,
-  .free_inode = dquot_free_inode,
-  .transfer = dquot_transfer,
-  .write_dquot = reiserfs_write_dquot,
-  .acquire_dquot = reiserfs_acquire_dquot,
-  .release_dquot = reiserfs_release_dquot,
-  .mark_dirty = reiserfs_mark_dquot_dirty,
-  .write_info = reiserfs_write_info,
+static struct dquot_operations reiserfs_quota_operations = {
+	.initialize = reiserfs_dquot_initialize,
+	.drop = reiserfs_dquot_drop,
+	.alloc_space = dquot_alloc_space,
+	.alloc_inode = dquot_alloc_inode,
+	.free_space = dquot_free_space,
+	.free_inode = dquot_free_inode,
+	.transfer = dquot_transfer,
+	.write_dquot = reiserfs_write_dquot,
+	.acquire_dquot = reiserfs_acquire_dquot,
+	.release_dquot = reiserfs_release_dquot,
+	.mark_dirty = reiserfs_mark_dquot_dirty,
+	.write_info = reiserfs_write_info,
 };
 
-static struct quotactl_ops reiserfs_qctl_operations =
-{
-  .quota_on = reiserfs_quota_on,
-  .quota_off = vfs_quota_off,
-  .quota_sync = vfs_quota_sync,
-  .get_info = vfs_get_dqinfo,
-  .set_info = vfs_set_dqinfo,
-  .get_dqblk = vfs_get_dqblk,
-  .set_dqblk = vfs_set_dqblk,
+static struct quotactl_ops reiserfs_qctl_operations = {
+	.quota_on = reiserfs_quota_on,
+	.quota_off = vfs_quota_off,
+	.quota_sync = vfs_quota_sync,
+	.get_info = vfs_get_dqinfo,
+	.set_info = vfs_set_dqinfo,
+	.get_dqblk = vfs_get_dqblk,
+	.set_dqblk = vfs_set_dqblk,
 };
 #endif
 
 static struct export_operations reiserfs_export_ops = {
-  .encode_fh = reiserfs_encode_fh,
-  .decode_fh = reiserfs_decode_fh,
-  .get_parent = reiserfs_get_parent,
-  .get_dentry = reiserfs_get_dentry,
-} ;
+	.encode_fh = reiserfs_encode_fh,
+	.decode_fh = reiserfs_decode_fh,
+	.get_parent = reiserfs_get_parent,
+	.get_dentry = reiserfs_get_dentry,
+};
 
 /* this struct is used in reiserfs_getopt () for containing the value for those
    mount options that have values rather than being toggles. */
 typedef struct {
-    char * value;
-    int setmask; /* bitmask which is to set on mount_options bitmask when this
-                    value is found, 0 is no bits are to be changed. */
-    int clrmask; /* bitmask which is to clear on mount_options bitmask when  this
-		    value is found, 0 is no bits are to be changed. This is
-		    applied BEFORE setmask */
+	char *value;
+	int setmask;		/* bitmask which is to set on mount_options bitmask when this
+				   value is found, 0 is no bits are to be changed. */
+	int clrmask;		/* bitmask which is to clear on mount_options bitmask when  this
+				   value is found, 0 is no bits are to be changed. This is
+				   applied BEFORE setmask */
 } arg_desc_t;
 
 /* Set this bit in arg_required to allow empty arguments */
@@ -648,67 +672,70 @@ typedef struct {
 /* this struct is used in reiserfs_getopt() for describing the set of reiserfs
    mount options */
 typedef struct {
-    char * option_name;
-    int arg_required; /* 0 if argument is not required, not 0 otherwise */
-    const arg_desc_t * values; /* list of values accepted by an option */
-    int setmask; /* bitmask which is to set on mount_options bitmask when this
-                    value is found, 0 is no bits are to be changed. */
-    int clrmask; /* bitmask which is to clear on mount_options bitmask when  this
-		    value is found, 0 is no bits are to be changed. This is
-		    applied BEFORE setmask */
+	char *option_name;
+	int arg_required;	/* 0 if argument is not required, not 0 otherwise */
+	const arg_desc_t *values;	/* list of values accepted by an option */
+	int setmask;		/* bitmask which is to set on mount_options bitmask when this
+				   value is found, 0 is no bits are to be changed. */
+	int clrmask;		/* bitmask which is to clear on mount_options bitmask when  this
+				   value is found, 0 is no bits are to be changed. This is
+				   applied BEFORE setmask */
 } opt_desc_t;
 
 /* possible values for -o data= */
 static const arg_desc_t logging_mode[] = {
-    {"ordered", 1<<REISERFS_DATA_ORDERED, (1<<REISERFS_DATA_LOG|1<<REISERFS_DATA_WRITEBACK)},
-    {"journal", 1<<REISERFS_DATA_LOG, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_WRITEBACK)},
-    {"writeback", 1<<REISERFS_DATA_WRITEBACK, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_LOG)},
-    {NULL, 0}
+	{"ordered", 1 << REISERFS_DATA_ORDERED,
+	 (1 << REISERFS_DATA_LOG | 1 << REISERFS_DATA_WRITEBACK)},
+	{"journal", 1 << REISERFS_DATA_LOG,
+	 (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_WRITEBACK)},
+	{"writeback", 1 << REISERFS_DATA_WRITEBACK,
+	 (1 << REISERFS_DATA_ORDERED | 1 << REISERFS_DATA_LOG)},
+	{NULL, 0}
 };
 
 /* possible values for -o barrier= */
 static const arg_desc_t barrier_mode[] = {
-    {"none", 1<<REISERFS_BARRIER_NONE, 1<<REISERFS_BARRIER_FLUSH},
-    {"flush", 1<<REISERFS_BARRIER_FLUSH, 1<<REISERFS_BARRIER_NONE},
-    {NULL, 0}
+	{"none", 1 << REISERFS_BARRIER_NONE, 1 << REISERFS_BARRIER_FLUSH},
+	{"flush", 1 << REISERFS_BARRIER_FLUSH, 1 << REISERFS_BARRIER_NONE},
+	{NULL, 0}
 };
 
 /* possible values for "-o block-allocator=" and bits which are to be set in
    s_mount_opt of reiserfs specific part of in-core super block */
 static const arg_desc_t balloc[] = {
-    {"noborder", 1<<REISERFS_NO_BORDER, 0},
-    {"border", 0, 1<<REISERFS_NO_BORDER},
-    {"no_unhashed_relocation", 1<<REISERFS_NO_UNHASHED_RELOCATION, 0},
-    {"hashed_relocation", 1<<REISERFS_HASHED_RELOCATION, 0},
-    {"test4", 1<<REISERFS_TEST4, 0},
-    {"notest4", 0, 1<<REISERFS_TEST4},
-    {NULL, 0, 0}
+	{"noborder", 1 << REISERFS_NO_BORDER, 0},
+	{"border", 0, 1 << REISERFS_NO_BORDER},
+	{"no_unhashed_relocation", 1 << REISERFS_NO_UNHASHED_RELOCATION, 0},
+	{"hashed_relocation", 1 << REISERFS_HASHED_RELOCATION, 0},
+	{"test4", 1 << REISERFS_TEST4, 0},
+	{"notest4", 0, 1 << REISERFS_TEST4},
+	{NULL, 0, 0}
 };
 
 static const arg_desc_t tails[] = {
-    {"on", 1<<REISERFS_LARGETAIL, 1<<REISERFS_SMALLTAIL},
-    {"off", 0, (1<<REISERFS_LARGETAIL)|(1<<REISERFS_SMALLTAIL)},
-    {"small", 1<<REISERFS_SMALLTAIL, 1<<REISERFS_LARGETAIL},
-    {NULL, 0, 0}
+	{"on", 1 << REISERFS_LARGETAIL, 1 << REISERFS_SMALLTAIL},
+	{"off", 0, (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)},
+	{"small", 1 << REISERFS_SMALLTAIL, 1 << REISERFS_LARGETAIL},
+	{NULL, 0, 0}
 };
 
 static const arg_desc_t error_actions[] = {
-    {"panic", 1 << REISERFS_ERROR_PANIC,
-              (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)},
-    {"ro-remount", 1 << REISERFS_ERROR_RO,
-              (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)},
+	{"panic", 1 << REISERFS_ERROR_PANIC,
+	 (1 << REISERFS_ERROR_RO | 1 << REISERFS_ERROR_CONTINUE)},
+	{"ro-remount", 1 << REISERFS_ERROR_RO,
+	 (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_CONTINUE)},
 #ifdef REISERFS_JOURNAL_ERROR_ALLOWS_NO_LOG
-    {"continue", 1 << REISERFS_ERROR_CONTINUE,
-              (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)},
+	{"continue", 1 << REISERFS_ERROR_CONTINUE,
+	 (1 << REISERFS_ERROR_PANIC | 1 << REISERFS_ERROR_RO)},
 #endif
-    {NULL, 0, 0},
+	{NULL, 0, 0},
 };
 
-int reiserfs_default_io_size = 128 * 1024; /* Default recommended I/O size is 128k.
-					      There might be broken applications that are
-					      confused by this. Use nolargeio mount option
-					      to get usual i/o size = PAGE_SIZE.
-					    */
+int reiserfs_default_io_size = 128 * 1024;	/* Default recommended I/O size is 128k.
+						   There might be broken applications that are
+						   confused by this. Use nolargeio mount option
+						   to get usual i/o size = PAGE_SIZE.
+						 */
 
 /* proceed only one option from a list *cur - string containing of mount options
    opts - array of options which are accepted
@@ -716,486 +743,530 @@ int reiserfs_default_io_size = 128 * 1024; /* Default recommended I/O size is 12
    in the input - pointer to the argument is stored here
    bit_flags - if option requires to set a certain bit - it is set here
    return -1 if unknown option is found, opt->arg_required otherwise */
-static int reiserfs_getopt ( struct super_block * s, char ** cur, opt_desc_t * opts, char ** opt_arg,
-			    unsigned long * bit_flags)
+static int reiserfs_getopt(struct super_block *s, char **cur, opt_desc_t * opts,
+			   char **opt_arg, unsigned long *bit_flags)
 {
-    char * p;
-    /* foo=bar, 
-       ^   ^  ^
-       |   |  +-- option_end
-       |   +-- arg_start
-       +-- option_start
-    */
-    const opt_desc_t * opt;
-    const arg_desc_t * arg;
-    
-    
-    p = *cur;
-    
-    /* assume argument cannot contain commas */
-    *cur = strchr (p, ',');
-    if (*cur) {
-	*(*cur) = '\0';
-	(*cur) ++;
-    }
-
-    if ( !strncmp (p, "alloc=", 6) ) {
-	/* Ugly special case, probably we should redo options parser so that
-	   it can understand several arguments for some options, also so that
-	   it can fill several bitfields with option values. */
-	if ( reiserfs_parse_alloc_options( s, p + 6) ) {
-	    return -1;
-	} else {
-	    return 0;
-	}
-    }
-
- 
-    /* for every option in the list */
-    for (opt = opts; opt->option_name; opt ++) {
-	if (!strncmp (p, opt->option_name, strlen (opt->option_name))) {
-	    if (bit_flags) {
-                if (opt->clrmask == (1 << REISERFS_UNSUPPORTED_OPT))
-                    reiserfs_warning (s, "%s not supported.", p);
-                else
-                    *bit_flags &= ~opt->clrmask;
-                if (opt->setmask == (1 << REISERFS_UNSUPPORTED_OPT))
-                    reiserfs_warning (s, "%s not supported.", p);
-                else
-                    *bit_flags |= opt->setmask;
-	    }
-	    break;
-	}
-    }
-    if (!opt->option_name) {
-	reiserfs_warning (s, "unknown mount option \"%s\"", p);
-	return -1;
-    }
-    
-    p += strlen (opt->option_name);
-    switch (*p) {
-    case '=':
-	if (!opt->arg_required) {
-	    reiserfs_warning (s, "the option \"%s\" does not require an argument",
-		    opt->option_name);
-	    return -1;
-	}
-	break;
-	
-    case 0:
-	if (opt->arg_required) {
-	    reiserfs_warning (s, "the option \"%s\" requires an argument", opt->option_name);
-	    return -1;
-	}
-	break;
-    default:
-	reiserfs_warning (s, "head of option \"%s\" is only correct", opt->option_name);
-	return -1;
-    }
-
-    /* move to the argument, or to next option if argument is not required */
-    p ++;
-    
-    if ( opt->arg_required && !(opt->arg_required & (1<<REISERFS_OPT_ALLOWEMPTY)) && !strlen (p) ) {
-	/* this catches "option=," if not allowed */
-	reiserfs_warning (s, "empty argument for \"%s\"", opt->option_name);
+	char *p;
+	/* foo=bar, 
+	   ^   ^  ^
+	   |   |  +-- option_end
+	   |   +-- arg_start
+	   +-- option_start
+	 */
+	const opt_desc_t *opt;
+	const arg_desc_t *arg;
+
+	p = *cur;
+
+	/* assume argument cannot contain commas */
+	*cur = strchr(p, ',');
+	if (*cur) {
+		*(*cur) = '\0';
+		(*cur)++;
+	}
+
+	if (!strncmp(p, "alloc=", 6)) {
+		/* Ugly special case, probably we should redo options parser so that
+		   it can understand several arguments for some options, also so that
+		   it can fill several bitfields with option values. */
+		if (reiserfs_parse_alloc_options(s, p + 6)) {
+			return -1;
+		} else {
+			return 0;
+		}
+	}
+
+	/* for every option in the list */
+	for (opt = opts; opt->option_name; opt++) {
+		if (!strncmp(p, opt->option_name, strlen(opt->option_name))) {
+			if (bit_flags) {
+				if (opt->clrmask ==
+				    (1 << REISERFS_UNSUPPORTED_OPT))
+					reiserfs_warning(s, "%s not supported.",
+							 p);
+				else
+					*bit_flags &= ~opt->clrmask;
+				if (opt->setmask ==
+				    (1 << REISERFS_UNSUPPORTED_OPT))
+					reiserfs_warning(s, "%s not supported.",
+							 p);
+				else
+					*bit_flags |= opt->setmask;
+			}
+			break;
+		}
+	}
+	if (!opt->option_name) {
+		reiserfs_warning(s, "unknown mount option \"%s\"", p);
+		return -1;
+	}
+
+	p += strlen(opt->option_name);
+	switch (*p) {
+	case '=':
+		if (!opt->arg_required) {
+			reiserfs_warning(s,
+					 "the option \"%s\" does not require an argument",
+					 opt->option_name);
+			return -1;
+		}
+		break;
+
+	case 0:
+		if (opt->arg_required) {
+			reiserfs_warning(s,
+					 "the option \"%s\" requires an argument",
+					 opt->option_name);
+			return -1;
+		}
+		break;
+	default:
+		reiserfs_warning(s, "head of option \"%s\" is only correct",
+				 opt->option_name);
+		return -1;
+	}
+
+	/* move to the argument, or to next option if argument is not required */
+	p++;
+
+	if (opt->arg_required
+	    && !(opt->arg_required & (1 << REISERFS_OPT_ALLOWEMPTY))
+	    && !strlen(p)) {
+		/* this catches "option=," if not allowed */
+		reiserfs_warning(s, "empty argument for \"%s\"",
+				 opt->option_name);
+		return -1;
+	}
+
+	if (!opt->values) {
+		/* *=NULLopt_arg contains pointer to argument */
+		*opt_arg = p;
+		return opt->arg_required & ~(1 << REISERFS_OPT_ALLOWEMPTY);
+	}
+
+	/* values possible for this option are listed in opt->values */
+	for (arg = opt->values; arg->value; arg++) {
+		if (!strcmp(p, arg->value)) {
+			if (bit_flags) {
+				*bit_flags &= ~arg->clrmask;
+				*bit_flags |= arg->setmask;
+			}
+			return opt->arg_required;
+		}
+	}
+
+	reiserfs_warning(s, "bad value \"%s\" for option \"%s\"", p,
+			 opt->option_name);
 	return -1;
-    }
-    
-    if (!opt->values) {
-	/* *=NULLopt_arg contains pointer to argument */
-	*opt_arg = p;
-	return opt->arg_required & ~(1<<REISERFS_OPT_ALLOWEMPTY);
-    }
-    
-    /* values possible for this option are listed in opt->values */
-    for (arg = opt->values; arg->value; arg ++) {
-	if (!strcmp (p, arg->value)) {
-	    if (bit_flags) {
-		*bit_flags &= ~arg->clrmask;
-		*bit_flags |= arg->setmask;
-	    }
-	    return opt->arg_required;
-	}
-    }
-    
-    reiserfs_warning (s, "bad value \"%s\" for option \"%s\"", p, opt->option_name);
-    return -1;
 }
 
 /* returns 0 if something is wrong in option string, 1 - otherwise */
-static int reiserfs_parse_options (struct super_block * s, char * options, /* string given via mount's -o */
-				   unsigned long * mount_options,
-				   /* after the parsing phase, contains the
-				      collection of bitflags defining what
-				      mount options were selected. */
-				   unsigned long * blocks, /* strtol-ed from NNN of resize=NNN */
-				   char ** jdev_name,
-				   unsigned int * commit_max_age)
+static int reiserfs_parse_options(struct super_block *s, char *options,	/* string given via mount's -o */
+				  unsigned long *mount_options,
+				  /* after the parsing phase, contains the
+				     collection of bitflags defining what
+				     mount options were selected. */
+				  unsigned long *blocks,	/* strtol-ed from NNN of resize=NNN */
+				  char **jdev_name,
+				  unsigned int *commit_max_age)
 {
-    int c;
-    char * arg = NULL;
-    char * pos;
-    opt_desc_t opts[] = {
-	/* Compatibility stuff, so that -o notail for old setups still work */
-	{"tails",	.arg_required = 't', .values = tails},
-	{"notail",	.clrmask = (1<<REISERFS_LARGETAIL)|(1<<REISERFS_SMALLTAIL)},
-	{"conv",	.setmask = 1<<REISERFS_CONVERT},
-	{"attrs",	.setmask = 1<<REISERFS_ATTRS},
-	{"noattrs",	.clrmask = 1<<REISERFS_ATTRS},
+	int c;
+	char *arg = NULL;
+	char *pos;
+	opt_desc_t opts[] = {
+		/* Compatibility stuff, so that -o notail for old setups still work */
+		{"tails",.arg_required = 't',.values = tails},
+		{"notail",.clrmask =
+		 (1 << REISERFS_LARGETAIL) | (1 << REISERFS_SMALLTAIL)},
+		{"conv",.setmask = 1 << REISERFS_CONVERT},
+		{"attrs",.setmask = 1 << REISERFS_ATTRS},
+		{"noattrs",.clrmask = 1 << REISERFS_ATTRS},
 #ifdef CONFIG_REISERFS_FS_XATTR
-	{"user_xattr",	.setmask = 1<<REISERFS_XATTRS_USER},
-	{"nouser_xattr",.clrmask = 1<<REISERFS_XATTRS_USER},
+		{"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER},
+		{"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER},
 #else
-	{"user_xattr",	.setmask = 1<<REISERFS_UNSUPPORTED_OPT},
-	{"nouser_xattr",.clrmask = 1<<REISERFS_UNSUPPORTED_OPT},
+		{"user_xattr",.setmask = 1 << REISERFS_UNSUPPORTED_OPT},
+		{"nouser_xattr",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT},
 #endif
 #ifdef CONFIG_REISERFS_FS_POSIX_ACL
-	{"acl",		.setmask = 1<<REISERFS_POSIXACL},
-	{"noacl",	.clrmask = 1<<REISERFS_POSIXACL},
+		{"acl",.setmask = 1 << REISERFS_POSIXACL},
+		{"noacl",.clrmask = 1 << REISERFS_POSIXACL},
 #else
-	{"acl",		.setmask = 1<<REISERFS_UNSUPPORTED_OPT},
-	{"noacl",	.clrmask = 1<<REISERFS_UNSUPPORTED_OPT},
+		{"acl",.setmask = 1 << REISERFS_UNSUPPORTED_OPT},
+		{"noacl",.clrmask = 1 << REISERFS_UNSUPPORTED_OPT},
 #endif
-	{"nolog",},	 /* This is unsupported */
-	{"replayonly",	.setmask = 1<<REPLAYONLY},
-	{"block-allocator", .arg_required = 'a', .values = balloc},
-	{"data",	.arg_required = 'd', .values = logging_mode},
-	{"barrier",	.arg_required = 'b', .values = barrier_mode},
-	{"resize",	.arg_required = 'r', .values = NULL},
-	{"jdev",	.arg_required = 'j', .values = NULL},
-	{"nolargeio",	.arg_required = 'w', .values = NULL},
-	{"commit",	.arg_required = 'c', .values = NULL},
-	{"usrquota",	.setmask = 1<<REISERFS_QUOTA},
-	{"grpquota",	.setmask = 1<<REISERFS_QUOTA},
-	{"noquota",	.clrmask = 1<<REISERFS_QUOTA},
-	{"errors", 	.arg_required = 'e', .values = error_actions},
-	{"usrjquota",	.arg_required = 'u'|(1<<REISERFS_OPT_ALLOWEMPTY), .values = NULL},
-	{"grpjquota",	.arg_required = 'g'|(1<<REISERFS_OPT_ALLOWEMPTY), .values = NULL},
-	{"jqfmt",	.arg_required = 'f', .values = NULL},
-	{NULL,}
-    };
-	
-    *blocks = 0;
-    if (!options || !*options)
-	/* use default configuration: create tails, journaling on, no
-	   conversion to newest format */
-	return 1;
-    
-    for (pos = options; pos; ) {
-	c = reiserfs_getopt (s, &pos, opts, &arg, mount_options);
-	if (c == -1)
-	    /* wrong option is given */
-	    return 0;
-	
-	if (c == 'r') {
-	    char * p;
-	    
-	    p = NULL;
-	    /* "resize=NNN" or "resize=auto" */
-
-	    if (!strcmp(arg, "auto")) {
-		    /* From JFS code, to auto-get the size.*/
-		    *blocks = s->s_bdev->bd_inode->i_size >> s->s_blocksize_bits;
-	    } else {
-		    *blocks = simple_strtoul (arg, &p, 0);
-		    if (*p != '\0') {
-			/* NNN does not look like a number */
-			reiserfs_warning (s, "reiserfs_parse_options: bad value %s", arg);
+		{"nolog",},	/* This is unsupported */
+		{"replayonly",.setmask = 1 << REPLAYONLY},
+		{"block-allocator",.arg_required = 'a',.values = balloc},
+		{"data",.arg_required = 'd',.values = logging_mode},
+		{"barrier",.arg_required = 'b',.values = barrier_mode},
+		{"resize",.arg_required = 'r',.values = NULL},
+		{"jdev",.arg_required = 'j',.values = NULL},
+		{"nolargeio",.arg_required = 'w',.values = NULL},
+		{"commit",.arg_required = 'c',.values = NULL},
+		{"usrquota",.setmask = 1 << REISERFS_QUOTA},
+		{"grpquota",.setmask = 1 << REISERFS_QUOTA},
+		{"noquota",.clrmask = 1 << REISERFS_QUOTA},
+		{"errors",.arg_required = 'e',.values = error_actions},
+		{"usrjquota",.arg_required =
+		 'u' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL},
+		{"grpjquota",.arg_required =
+		 'g' | (1 << REISERFS_OPT_ALLOWEMPTY),.values = NULL},
+		{"jqfmt",.arg_required = 'f',.values = NULL},
+		{NULL,}
+	};
+
+	*blocks = 0;
+	if (!options || !*options)
+		/* use default configuration: create tails, journaling on, no
+		   conversion to newest format */
+		return 1;
+
+	for (pos = options; pos;) {
+		c = reiserfs_getopt(s, &pos, opts, &arg, mount_options);
+		if (c == -1)
+			/* wrong option is given */
 			return 0;
-		    }
-	    }
-	}
 
-	if ( c == 'c' ) {
-		char *p = NULL;
-		unsigned long val = simple_strtoul (arg, &p, 0);
-		/* commit=NNN (time in seconds) */
-		if ( *p != '\0' || val >= (unsigned int)-1) {
-			reiserfs_warning (s, "reiserfs_parse_options: bad value %s", arg);
-			return 0;
+		if (c == 'r') {
+			char *p;
+
+			p = NULL;
+			/* "resize=NNN" or "resize=auto" */
+
+			if (!strcmp(arg, "auto")) {
+				/* From JFS code, to auto-get the size. */
+				*blocks =
+				    s->s_bdev->bd_inode->i_size >> s->
+				    s_blocksize_bits;
+			} else {
+				*blocks = simple_strtoul(arg, &p, 0);
+				if (*p != '\0') {
+					/* NNN does not look like a number */
+					reiserfs_warning(s,
+							 "reiserfs_parse_options: bad value %s",
+							 arg);
+					return 0;
+				}
+			}
 		}
-		*commit_max_age = (unsigned int)val;
-	}
 
-	if ( c == 'w' ) {
-		char *p=NULL;
-		int val = simple_strtoul (arg, &p, 0);
-
-		if ( *p != '\0') {
-		    reiserfs_warning (s, "reiserfs_parse_options: non-numeric value %s for nolargeio option", arg);
-		    return 0;
+		if (c == 'c') {
+			char *p = NULL;
+			unsigned long val = simple_strtoul(arg, &p, 0);
+			/* commit=NNN (time in seconds) */
+			if (*p != '\0' || val >= (unsigned int)-1) {
+				reiserfs_warning(s,
+						 "reiserfs_parse_options: bad value %s",
+						 arg);
+				return 0;
+			}
+			*commit_max_age = (unsigned int)val;
 		}
-		if ( val ) 
-		    reiserfs_default_io_size = PAGE_SIZE;
-		else
-		    reiserfs_default_io_size = 128 * 1024;
-	}
 
-	if (c == 'j') {
-	    if (arg && *arg && jdev_name) {
-		if ( *jdev_name ) { //Hm, already assigned?
-		    reiserfs_warning (s, "reiserfs_parse_options: journal device was already  specified to be %s", *jdev_name);
-		    return 0;
+		if (c == 'w') {
+			char *p = NULL;
+			int val = simple_strtoul(arg, &p, 0);
+
+			if (*p != '\0') {
+				reiserfs_warning(s,
+						 "reiserfs_parse_options: non-numeric value %s for nolargeio option",
+						 arg);
+				return 0;
+			}
+			if (val)
+				reiserfs_default_io_size = PAGE_SIZE;
+			else
+				reiserfs_default_io_size = 128 * 1024;
 		}
-		*jdev_name = arg;
-	    }
-	}
 
-#ifdef CONFIG_QUOTA
-	if (c == 'u' || c == 'g') {
-	    int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
-
-	    if (sb_any_quota_enabled(s)) {
-		reiserfs_warning(s, "reiserfs_parse_options: cannot change journalled quota options when quota turned on.");
-		return 0;
-	    }
-	    if (*arg) {	/* Some filename specified? */
-	        if (REISERFS_SB(s)->s_qf_names[qtype] && strcmp(REISERFS_SB(s)->s_qf_names[qtype], arg)) {
-		    reiserfs_warning(s, "reiserfs_parse_options: %s quota file already specified.", QTYPE2NAME(qtype));
-		    return 0;
+		if (c == 'j') {
+			if (arg && *arg && jdev_name) {
+				if (*jdev_name) {	//Hm, already assigned?
+					reiserfs_warning(s,
+							 "reiserfs_parse_options: journal device was already  specified to be %s",
+							 *jdev_name);
+					return 0;
+				}
+				*jdev_name = arg;
+			}
 		}
-		if (strchr(arg, '/')) {
-		    reiserfs_warning(s, "reiserfs_parse_options: quotafile must be on filesystem root.");
-		    return 0;
+#ifdef CONFIG_QUOTA
+		if (c == 'u' || c == 'g') {
+			int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
+
+			if (sb_any_quota_enabled(s)) {
+				reiserfs_warning(s,
+						 "reiserfs_parse_options: cannot change journalled quota options when quota turned on.");
+				return 0;
+			}
+			if (*arg) {	/* Some filename specified? */
+				if (REISERFS_SB(s)->s_qf_names[qtype]
+				    && strcmp(REISERFS_SB(s)->s_qf_names[qtype],
+					      arg)) {
+					reiserfs_warning(s,
+							 "reiserfs_parse_options: %s quota file already specified.",
+							 QTYPE2NAME(qtype));
+					return 0;
+				}
+				if (strchr(arg, '/')) {
+					reiserfs_warning(s,
+							 "reiserfs_parse_options: quotafile must be on filesystem root.");
+					return 0;
+				}
+				REISERFS_SB(s)->s_qf_names[qtype] =
+				    kmalloc(strlen(arg) + 1, GFP_KERNEL);
+				if (!REISERFS_SB(s)->s_qf_names[qtype]) {
+					reiserfs_warning(s,
+							 "reiserfs_parse_options: not enough memory for storing quotafile name.");
+					return 0;
+				}
+				strcpy(REISERFS_SB(s)->s_qf_names[qtype], arg);
+				*mount_options |= 1 << REISERFS_QUOTA;
+			} else {
+				if (REISERFS_SB(s)->s_qf_names[qtype]) {
+					kfree(REISERFS_SB(s)->
+					      s_qf_names[qtype]);
+					REISERFS_SB(s)->s_qf_names[qtype] =
+					    NULL;
+				}
+			}
 		}
-	    	REISERFS_SB(s)->s_qf_names[qtype] = kmalloc(strlen(arg)+1, GFP_KERNEL);
-		if (!REISERFS_SB(s)->s_qf_names[qtype]) {
-		    reiserfs_warning(s, "reiserfs_parse_options: not enough memory for storing quotafile name.");
-		    return 0;
+		if (c == 'f') {
+			if (!strcmp(arg, "vfsold"))
+				REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_OLD;
+			else if (!strcmp(arg, "vfsv0"))
+				REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_V0;
+			else {
+				reiserfs_warning(s,
+						 "reiserfs_parse_options: unknown quota format specified.");
+				return 0;
+			}
 		}
-		strcpy(REISERFS_SB(s)->s_qf_names[qtype], arg);
-		*mount_options |= 1<<REISERFS_QUOTA;
-	    }
-	    else {
-		if (REISERFS_SB(s)->s_qf_names[qtype]) {
-		    kfree(REISERFS_SB(s)->s_qf_names[qtype]);
-		    REISERFS_SB(s)->s_qf_names[qtype] = NULL;
+#else
+		if (c == 'u' || c == 'g' || c == 'f') {
+			reiserfs_warning(s,
+					 "reiserfs_parse_options: journalled quota options not supported.");
+			return 0;
 		}
-	    }
-	}
-	if (c == 'f') {
-	    if (!strcmp(arg, "vfsold"))
-		REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_OLD;
-	    else if (!strcmp(arg, "vfsv0"))
-		REISERFS_SB(s)->s_jquota_fmt = QFMT_VFS_V0;
-	    else {
-		reiserfs_warning(s, "reiserfs_parse_options: unknown quota format specified.");
+#endif
+	}
+
+#ifdef CONFIG_QUOTA
+	if (!REISERFS_SB(s)->s_jquota_fmt
+	    && (REISERFS_SB(s)->s_qf_names[USRQUOTA]
+		|| REISERFS_SB(s)->s_qf_names[GRPQUOTA])) {
+		reiserfs_warning(s,
+				 "reiserfs_parse_options: journalled quota format not specified.");
 		return 0;
-	    }
 	}
-#else
-	if (c == 'u' || c == 'g' || c == 'f') {
-	    reiserfs_warning(s, "reiserfs_parse_options: journalled quota options not supported.");
-	    return 0;
+	/* This checking is not precise wrt the quota type but for our purposes it is sufficient */
+	if (!(*mount_options & (1 << REISERFS_QUOTA))
+	    && sb_any_quota_enabled(s)) {
+		reiserfs_warning(s,
+				 "reiserfs_parse_options: quota options must be present when quota is turned on.");
+		return 0;
 	}
 #endif
-    }
-    
-#ifdef CONFIG_QUOTA
-    if (!REISERFS_SB(s)->s_jquota_fmt && (REISERFS_SB(s)->s_qf_names[USRQUOTA] || REISERFS_SB(s)->s_qf_names[GRPQUOTA])) {
-	reiserfs_warning(s, "reiserfs_parse_options: journalled quota format not specified.");
-	return 0;
-    }
-    /* This checking is not precise wrt the quota type but for our purposes it is sufficient */
-    if (!(*mount_options & (1<<REISERFS_QUOTA)) && sb_any_quota_enabled(s)) {
-	reiserfs_warning(s, "reiserfs_parse_options: quota options must be present when quota is turned on.");
-	return 0;
-    }
-#endif
 
-    return 1;
+	return 1;
 }
 
-static void switch_data_mode(struct super_block *s, unsigned long mode) {
-    REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) |
-                                       (1 << REISERFS_DATA_ORDERED) |
-				       (1 << REISERFS_DATA_WRITEBACK));
-    REISERFS_SB(s)->s_mount_opt |= (1 << mode);
+static void switch_data_mode(struct super_block *s, unsigned long mode)
+{
+	REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) |
+					 (1 << REISERFS_DATA_ORDERED) |
+					 (1 << REISERFS_DATA_WRITEBACK));
+	REISERFS_SB(s)->s_mount_opt |= (1 << mode);
 }
 
 static void handle_data_mode(struct super_block *s, unsigned long mount_options)
 {
-    if (mount_options & (1 << REISERFS_DATA_LOG)) {
-        if (!reiserfs_data_log(s)) {
-	    switch_data_mode(s, REISERFS_DATA_LOG);
-	    reiserfs_info (s, "switching to journaled data mode\n");
-	}
-    } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) {
-        if (!reiserfs_data_ordered(s)) {
-	    switch_data_mode(s, REISERFS_DATA_ORDERED);
-	    reiserfs_info (s, "switching to ordered data mode\n");
-	}
-    } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) {
-        if (!reiserfs_data_writeback(s)) {
-	    switch_data_mode(s, REISERFS_DATA_WRITEBACK);
-	    reiserfs_info (s, "switching to writeback data mode\n");
-	}
-    }
+	if (mount_options & (1 << REISERFS_DATA_LOG)) {
+		if (!reiserfs_data_log(s)) {
+			switch_data_mode(s, REISERFS_DATA_LOG);
+			reiserfs_info(s, "switching to journaled data mode\n");
+		}
+	} else if (mount_options & (1 << REISERFS_DATA_ORDERED)) {
+		if (!reiserfs_data_ordered(s)) {
+			switch_data_mode(s, REISERFS_DATA_ORDERED);
+			reiserfs_info(s, "switching to ordered data mode\n");
+		}
+	} else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) {
+		if (!reiserfs_data_writeback(s)) {
+			switch_data_mode(s, REISERFS_DATA_WRITEBACK);
+			reiserfs_info(s, "switching to writeback data mode\n");
+		}
+	}
 }
 
-static void handle_barrier_mode(struct super_block *s, unsigned long bits) {
-    int flush = (1 << REISERFS_BARRIER_FLUSH);
-    int none = (1 << REISERFS_BARRIER_NONE);
-    int all_barrier = flush | none;
-
-    if (bits & all_barrier) {
-        REISERFS_SB(s)->s_mount_opt &= ~all_barrier;
-	if (bits & flush) {
-	    REISERFS_SB(s)->s_mount_opt |= flush;
-	    printk("reiserfs: enabling write barrier flush mode\n");
-	} else if (bits & none) {
-	    REISERFS_SB(s)->s_mount_opt |= none;
-	    printk("reiserfs: write barriers turned off\n");
-	}
-   }
+static void handle_barrier_mode(struct super_block *s, unsigned long bits)
+{
+	int flush = (1 << REISERFS_BARRIER_FLUSH);
+	int none = (1 << REISERFS_BARRIER_NONE);
+	int all_barrier = flush | none;
+
+	if (bits & all_barrier) {
+		REISERFS_SB(s)->s_mount_opt &= ~all_barrier;
+		if (bits & flush) {
+			REISERFS_SB(s)->s_mount_opt |= flush;
+			printk("reiserfs: enabling write barrier flush mode\n");
+		} else if (bits & none) {
+			REISERFS_SB(s)->s_mount_opt |= none;
+			printk("reiserfs: write barriers turned off\n");
+		}
+	}
 }
 
-static void handle_attrs( struct super_block *s )
+static void handle_attrs(struct super_block *s)
 {
-	struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s);
+	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
 
-	if( reiserfs_attrs( s ) ) {
-		if( old_format_only(s) ) {
-			reiserfs_warning(s, "reiserfs: cannot support attributes on 3.5.x disk format" );
-			REISERFS_SB(s) -> s_mount_opt &= ~ ( 1 << REISERFS_ATTRS );
+	if (reiserfs_attrs(s)) {
+		if (old_format_only(s)) {
+			reiserfs_warning(s,
+					 "reiserfs: cannot support attributes on 3.5.x disk format");
+			REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
 			return;
 		}
-		if( !( le32_to_cpu( rs -> s_flags ) & reiserfs_attrs_cleared ) ) {
-				reiserfs_warning(s, "reiserfs: cannot support attributes until flag is set in super-block" );
-				REISERFS_SB(s) -> s_mount_opt &= ~ ( 1 << REISERFS_ATTRS );
+		if (!(le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared)) {
+			reiserfs_warning(s,
+					 "reiserfs: cannot support attributes until flag is set in super-block");
+			REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS);
 		}
-	} else if (le32_to_cpu( rs -> s_flags ) & reiserfs_attrs_cleared) {
+	} else if (le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared) {
 		REISERFS_SB(s)->s_mount_opt |= REISERFS_ATTRS;
 	}
 }
 
-static int reiserfs_remount (struct super_block * s, int * mount_flags, char * arg)
+static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 {
-  struct reiserfs_super_block * rs;
-  struct reiserfs_transaction_handle th ;
-  unsigned long blocks;
-  unsigned long mount_options = REISERFS_SB(s)->s_mount_opt;
-  unsigned long safe_mask = 0;
-  unsigned int commit_max_age = (unsigned int)-1;
-  struct reiserfs_journal *journal = SB_JOURNAL(s);
-  int err;
+	struct reiserfs_super_block *rs;
+	struct reiserfs_transaction_handle th;
+	unsigned long blocks;
+	unsigned long mount_options = REISERFS_SB(s)->s_mount_opt;
+	unsigned long safe_mask = 0;
+	unsigned int commit_max_age = (unsigned int)-1;
+	struct reiserfs_journal *journal = SB_JOURNAL(s);
+	int err;
 #ifdef CONFIG_QUOTA
-  int i;
+	int i;
 #endif
 
-  rs = SB_DISK_SUPER_BLOCK (s);
+	rs = SB_DISK_SUPER_BLOCK(s);
 
-  if (!reiserfs_parse_options(s, arg, &mount_options, &blocks, NULL, &commit_max_age)) {
+	if (!reiserfs_parse_options
+	    (s, arg, &mount_options, &blocks, NULL, &commit_max_age)) {
 #ifdef CONFIG_QUOTA
-    for (i = 0; i < MAXQUOTAS; i++)
-	if (REISERFS_SB(s)->s_qf_names[i]) {
-	    kfree(REISERFS_SB(s)->s_qf_names[i]);
-	    REISERFS_SB(s)->s_qf_names[i] = NULL;
-	}
+		for (i = 0; i < MAXQUOTAS; i++)
+			if (REISERFS_SB(s)->s_qf_names[i]) {
+				kfree(REISERFS_SB(s)->s_qf_names[i]);
+				REISERFS_SB(s)->s_qf_names[i] = NULL;
+			}
 #endif
-    return -EINVAL;
-  }
-  
-  handle_attrs(s);
-
-  /* Add options that are safe here */
-  safe_mask |= 1 << REISERFS_SMALLTAIL;
-  safe_mask |= 1 << REISERFS_LARGETAIL;
-  safe_mask |= 1 << REISERFS_NO_BORDER;
-  safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION;
-  safe_mask |= 1 << REISERFS_HASHED_RELOCATION;
-  safe_mask |= 1 << REISERFS_TEST4;
-  safe_mask |= 1 << REISERFS_ATTRS;
-  safe_mask |= 1 << REISERFS_XATTRS_USER;
-  safe_mask |= 1 << REISERFS_POSIXACL;
-  safe_mask |= 1 << REISERFS_BARRIER_FLUSH;
-  safe_mask |= 1 << REISERFS_BARRIER_NONE;
-  safe_mask |= 1 << REISERFS_ERROR_RO;
-  safe_mask |= 1 << REISERFS_ERROR_CONTINUE;
-  safe_mask |= 1 << REISERFS_ERROR_PANIC;
-  safe_mask |= 1 << REISERFS_QUOTA;
-
-  /* Update the bitmask, taking care to keep
-   * the bits we're not allowed to change here */
-  REISERFS_SB(s)->s_mount_opt = (REISERFS_SB(s)->s_mount_opt & ~safe_mask) |  (mount_options & safe_mask);
-
-  if(commit_max_age != 0 && commit_max_age != (unsigned int)-1) {
-    journal->j_max_commit_age = commit_max_age;
-    journal->j_max_trans_age = commit_max_age;
-  }
-  else if(commit_max_age == 0)
-  {
-    /* 0 means restore defaults. */
-    journal->j_max_commit_age = journal->j_default_max_commit_age;
-    journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
-  }
-
-  if(blocks) {
-    int rc = reiserfs_resize(s, blocks);
-    if (rc != 0)
-      return rc;
-  }
-
-  if (*mount_flags & MS_RDONLY) {
-    reiserfs_xattr_init (s, *mount_flags);
-    /* remount read-only */
-    if (s->s_flags & MS_RDONLY)
-      /* it is read-only already */
-      return 0;
-    /* try to remount file system with read-only permissions */
-    if (sb_umount_state(rs) == REISERFS_VALID_FS || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
-      return 0;
-    }
-
-    err = journal_begin(&th, s, 10) ;
-    if (err)
-        return err;
-
-    /* Mounting a rw partition read-only. */
-    reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
-    set_sb_umount_state( rs, REISERFS_SB(s)->s_mount_state );
-    journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
-  } else {
-    /* remount read-write */
-    if (!(s->s_flags & MS_RDONLY)) {
-	reiserfs_xattr_init (s, *mount_flags);
-	return 0; /* We are read-write already */
-    }
-
-    if (reiserfs_is_journal_aborted (journal))
-	return journal->j_errno;
-
-    handle_data_mode(s, mount_options);
-    handle_barrier_mode(s, mount_options);
-    REISERFS_SB(s)->s_mount_state = sb_umount_state(rs) ;
-    s->s_flags &= ~MS_RDONLY ; /* now it is safe to call journal_begin */
-    err = journal_begin(&th, s, 10) ;
-    if (err)
-	return err;
-    
-    /* Mount a partition which is read-only, read-write */
-    reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
-    REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
-    s->s_flags &= ~MS_RDONLY;
-    set_sb_umount_state( rs, REISERFS_ERROR_FS );
-    /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */
-    journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
-    REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS ;
-  }
-  /* this will force a full flush of all journal lists */
-  SB_JOURNAL(s)->j_must_wait = 1 ;
-  err = journal_end(&th, s, 10) ;
-  if (err)
-    return err;
-  s->s_dirt = 0;
-
-  if (!( *mount_flags & MS_RDONLY ) ) {
-    finish_unfinished( s );
-    reiserfs_xattr_init (s, *mount_flags);
-  }
-
-  return 0;
+		return -EINVAL;
+	}
+
+	handle_attrs(s);
+
+	/* Add options that are safe here */
+	safe_mask |= 1 << REISERFS_SMALLTAIL;
+	safe_mask |= 1 << REISERFS_LARGETAIL;
+	safe_mask |= 1 << REISERFS_NO_BORDER;
+	safe_mask |= 1 << REISERFS_NO_UNHASHED_RELOCATION;
+	safe_mask |= 1 << REISERFS_HASHED_RELOCATION;
+	safe_mask |= 1 << REISERFS_TEST4;
+	safe_mask |= 1 << REISERFS_ATTRS;
+	safe_mask |= 1 << REISERFS_XATTRS_USER;
+	safe_mask |= 1 << REISERFS_POSIXACL;
+	safe_mask |= 1 << REISERFS_BARRIER_FLUSH;
+	safe_mask |= 1 << REISERFS_BARRIER_NONE;
+	safe_mask |= 1 << REISERFS_ERROR_RO;
+	safe_mask |= 1 << REISERFS_ERROR_CONTINUE;
+	safe_mask |= 1 << REISERFS_ERROR_PANIC;
+	safe_mask |= 1 << REISERFS_QUOTA;
+
+	/* Update the bitmask, taking care to keep
+	 * the bits we're not allowed to change here */
+	REISERFS_SB(s)->s_mount_opt =
+	    (REISERFS_SB(s)->
+	     s_mount_opt & ~safe_mask) | (mount_options & safe_mask);
+
+	if (commit_max_age != 0 && commit_max_age != (unsigned int)-1) {
+		journal->j_max_commit_age = commit_max_age;
+		journal->j_max_trans_age = commit_max_age;
+	} else if (commit_max_age == 0) {
+		/* 0 means restore defaults. */
+		journal->j_max_commit_age = journal->j_default_max_commit_age;
+		journal->j_max_trans_age = JOURNAL_MAX_TRANS_AGE;
+	}
+
+	if (blocks) {
+		int rc = reiserfs_resize(s, blocks);
+		if (rc != 0)
+			return rc;
+	}
+
+	if (*mount_flags & MS_RDONLY) {
+		reiserfs_xattr_init(s, *mount_flags);
+		/* remount read-only */
+		if (s->s_flags & MS_RDONLY)
+			/* it is read-only already */
+			return 0;
+		/* try to remount file system with read-only permissions */
+		if (sb_umount_state(rs) == REISERFS_VALID_FS
+		    || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
+			return 0;
+		}
+
+		err = journal_begin(&th, s, 10);
+		if (err)
+			return err;
+
+		/* Mounting a rw partition read-only. */
+		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
+		set_sb_umount_state(rs, REISERFS_SB(s)->s_mount_state);
+		journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
+	} else {
+		/* remount read-write */
+		if (!(s->s_flags & MS_RDONLY)) {
+			reiserfs_xattr_init(s, *mount_flags);
+			return 0;	/* We are read-write already */
+		}
+
+		if (reiserfs_is_journal_aborted(journal))
+			return journal->j_errno;
+
+		handle_data_mode(s, mount_options);
+		handle_barrier_mode(s, mount_options);
+		REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
+		s->s_flags &= ~MS_RDONLY;	/* now it is safe to call journal_begin */
+		err = journal_begin(&th, s, 10);
+		if (err)
+			return err;
+
+		/* Mount a partition which is read-only, read-write */
+		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
+		REISERFS_SB(s)->s_mount_state = sb_umount_state(rs);
+		s->s_flags &= ~MS_RDONLY;
+		set_sb_umount_state(rs, REISERFS_ERROR_FS);
+		/* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */
+		journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
+		REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS;
+	}
+	/* this will force a full flush of all journal lists */
+	SB_JOURNAL(s)->j_must_wait = 1;
+	err = journal_end(&th, s, 10);
+	if (err)
+		return err;
+	s->s_dirt = 0;
+
+	if (!(*mount_flags & MS_RDONLY)) {
+		finish_unfinished(s);
+		reiserfs_xattr_init(s, *mount_flags);
+	}
+
+	return 0;
 }
 
 /* load_bitmap_info_data - Sets up the reiserfs_bitmap_info structure from disk.
@@ -1214,761 +1285,829 @@ static int reiserfs_remount (struct super_block * s, int * mount_flags, char * a
  * free blocks at all.
  */
 
-static void load_bitmap_info_data (struct super_block *sb,
-                                   struct reiserfs_bitmap_info *bi)
+static void load_bitmap_info_data(struct super_block *sb,
+				  struct reiserfs_bitmap_info *bi)
 {
-    unsigned long *cur = (unsigned long *)bi->bh->b_data;
-
-    while ((char *)cur < (bi->bh->b_data + sb->s_blocksize)) {
-
-	/* No need to scan if all 0's or all 1's.
-	 * Since we're only counting 0's, we can simply ignore all 1's */
-	if (*cur == 0) {
-	    if (bi->first_zero_hint == 0) {
-		bi->first_zero_hint = ((char *)cur - bi->bh->b_data) << 3;
-	    }
-	    bi->free_count += sizeof(unsigned long)*8;
-	} else if (*cur != ~0L) {
-	    int b;
-	    for (b = 0; b < sizeof(unsigned long)*8; b++) {
-		if (!reiserfs_test_le_bit (b, cur)) {
-		    bi->free_count ++;
-		    if (bi->first_zero_hint == 0)
-			bi->first_zero_hint =
-					(((char *)cur - bi->bh->b_data) << 3) + b;
-		    }
+	unsigned long *cur = (unsigned long *)bi->bh->b_data;
+
+	while ((char *)cur < (bi->bh->b_data + sb->s_blocksize)) {
+
+		/* No need to scan if all 0's or all 1's.
+		 * Since we're only counting 0's, we can simply ignore all 1's */
+		if (*cur == 0) {
+			if (bi->first_zero_hint == 0) {
+				bi->first_zero_hint =
+				    ((char *)cur - bi->bh->b_data) << 3;
+			}
+			bi->free_count += sizeof(unsigned long) * 8;
+		} else if (*cur != ~0L) {
+			int b;
+			for (b = 0; b < sizeof(unsigned long) * 8; b++) {
+				if (!reiserfs_test_le_bit(b, cur)) {
+					bi->free_count++;
+					if (bi->first_zero_hint == 0)
+						bi->first_zero_hint =
+						    (((char *)cur -
+						      bi->bh->b_data) << 3) + b;
+				}
+			}
 		}
-	    }
-	cur ++;
-    }
+		cur++;
+	}
 
 #ifdef CONFIG_REISERFS_CHECK
 // This outputs a lot of unneded info on big FSes
 //    reiserfs_warning ("bitmap loaded from block %d: %d free blocks",
-//		      bi->bh->b_blocknr, bi->free_count);
+//                    bi->bh->b_blocknr, bi->free_count);
 #endif
 }
-  
-static int read_bitmaps (struct super_block * s)
+
+static int read_bitmaps(struct super_block *s)
 {
-    int i, bmap_nr;
+	int i, bmap_nr;
+
+	SB_AP_BITMAP(s) =
+	    vmalloc(sizeof(struct reiserfs_bitmap_info) * SB_BMAP_NR(s));
+	if (SB_AP_BITMAP(s) == 0)
+		return 1;
+	memset(SB_AP_BITMAP(s), 0,
+	       sizeof(struct reiserfs_bitmap_info) * SB_BMAP_NR(s));
+	for (i = 0, bmap_nr =
+	     REISERFS_DISK_OFFSET_IN_BYTES / s->s_blocksize + 1;
+	     i < SB_BMAP_NR(s); i++, bmap_nr = s->s_blocksize * 8 * i) {
+		SB_AP_BITMAP(s)[i].bh = sb_getblk(s, bmap_nr);
+		if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh))
+			ll_rw_block(READ, 1, &SB_AP_BITMAP(s)[i].bh);
+	}
+	for (i = 0; i < SB_BMAP_NR(s); i++) {
+		wait_on_buffer(SB_AP_BITMAP(s)[i].bh);
+		if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) {
+			reiserfs_warning(s, "sh-2029: reiserfs read_bitmaps: "
+					 "bitmap block (#%lu) reading failed",
+					 SB_AP_BITMAP(s)[i].bh->b_blocknr);
+			for (i = 0; i < SB_BMAP_NR(s); i++)
+				brelse(SB_AP_BITMAP(s)[i].bh);
+			vfree(SB_AP_BITMAP(s));
+			SB_AP_BITMAP(s) = NULL;
+			return 1;
+		}
+		load_bitmap_info_data(s, SB_AP_BITMAP(s) + i);
+	}
+	return 0;
+}
 
-    SB_AP_BITMAP (s) = vmalloc (sizeof (struct reiserfs_bitmap_info) * SB_BMAP_NR(s));
-    if (SB_AP_BITMAP (s) == 0)
-	return 1;
-    memset (SB_AP_BITMAP (s), 0, sizeof (struct reiserfs_bitmap_info) * SB_BMAP_NR(s));
-    for (i = 0, bmap_nr = REISERFS_DISK_OFFSET_IN_BYTES / s->s_blocksize + 1;
-	 i < SB_BMAP_NR(s); i++, bmap_nr = s->s_blocksize * 8 * i) {
-	SB_AP_BITMAP (s)[i].bh = sb_getblk(s, bmap_nr);
-	if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh))
-	    ll_rw_block(READ, 1, &SB_AP_BITMAP(s)[i].bh);
-    }
-    for (i = 0; i < SB_BMAP_NR(s); i++) {
-	wait_on_buffer(SB_AP_BITMAP (s)[i].bh);
-	if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) {
-	    reiserfs_warning(s,"sh-2029: reiserfs read_bitmaps: "
-			 "bitmap block (#%lu) reading failed",
-			 SB_AP_BITMAP(s)[i].bh->b_blocknr);
-	    for (i = 0; i < SB_BMAP_NR(s); i++)
-		brelse(SB_AP_BITMAP(s)[i].bh);
-	    vfree(SB_AP_BITMAP(s));
-	    SB_AP_BITMAP(s) = NULL;
-	    return 1;
+static int read_old_bitmaps(struct super_block *s)
+{
+	int i;
+	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
+	int bmp1 = (REISERFS_OLD_DISK_OFFSET_IN_BYTES / s->s_blocksize) + 1;	/* first of bitmap blocks */
+
+	/* read true bitmap */
+	SB_AP_BITMAP(s) =
+	    vmalloc(sizeof(struct reiserfs_buffer_info *) * sb_bmap_nr(rs));
+	if (SB_AP_BITMAP(s) == 0)
+		return 1;
+
+	memset(SB_AP_BITMAP(s), 0,
+	       sizeof(struct reiserfs_buffer_info *) * sb_bmap_nr(rs));
+
+	for (i = 0; i < sb_bmap_nr(rs); i++) {
+		SB_AP_BITMAP(s)[i].bh = sb_bread(s, bmp1 + i);
+		if (!SB_AP_BITMAP(s)[i].bh)
+			return 1;
+		load_bitmap_info_data(s, SB_AP_BITMAP(s) + i);
 	}
-	load_bitmap_info_data (s, SB_AP_BITMAP (s) + i);
-    }
-    return 0;
+
+	return 0;
 }
 
-static int read_old_bitmaps (struct super_block * s)
+static int read_super_block(struct super_block *s, int offset)
 {
-  int i ;
-  struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK(s);
-  int bmp1 = (REISERFS_OLD_DISK_OFFSET_IN_BYTES / s->s_blocksize) + 1;  /* first of bitmap blocks */
+	struct buffer_head *bh;
+	struct reiserfs_super_block *rs;
+	int fs_blocksize;
 
-  /* read true bitmap */
-  SB_AP_BITMAP (s) = vmalloc (sizeof (struct reiserfs_buffer_info *) * sb_bmap_nr(rs));
-  if (SB_AP_BITMAP (s) == 0)
-    return 1;
+	bh = sb_bread(s, offset / s->s_blocksize);
+	if (!bh) {
+		reiserfs_warning(s, "sh-2006: read_super_block: "
+				 "bread failed (dev %s, block %lu, size %lu)",
+				 reiserfs_bdevname(s), offset / s->s_blocksize,
+				 s->s_blocksize);
+		return 1;
+	}
 
-  memset (SB_AP_BITMAP (s), 0, sizeof (struct reiserfs_buffer_info *) * sb_bmap_nr(rs));
+	rs = (struct reiserfs_super_block *)bh->b_data;
+	if (!is_any_reiserfs_magic_string(rs)) {
+		brelse(bh);
+		return 1;
+	}
+	//
+	// ok, reiserfs signature (old or new) found in at the given offset
+	//    
+	fs_blocksize = sb_blocksize(rs);
+	brelse(bh);
+	sb_set_blocksize(s, fs_blocksize);
 
-  for (i = 0; i < sb_bmap_nr(rs); i ++) {
-    SB_AP_BITMAP (s)[i].bh = sb_bread (s, bmp1 + i);
-    if (!SB_AP_BITMAP (s)[i].bh)
-      return 1;
-    load_bitmap_info_data (s, SB_AP_BITMAP (s) + i);
-  }
+	bh = sb_bread(s, offset / s->s_blocksize);
+	if (!bh) {
+		reiserfs_warning(s, "sh-2007: read_super_block: "
+				 "bread failed (dev %s, block %lu, size %lu)\n",
+				 reiserfs_bdevname(s), offset / s->s_blocksize,
+				 s->s_blocksize);
+		return 1;
+	}
 
-  return 0;
-}
+	rs = (struct reiserfs_super_block *)bh->b_data;
+	if (sb_blocksize(rs) != s->s_blocksize) {
+		reiserfs_warning(s, "sh-2011: read_super_block: "
+				 "can't find a reiserfs filesystem on (dev %s, block %Lu, size %lu)\n",
+				 reiserfs_bdevname(s),
+				 (unsigned long long)bh->b_blocknr,
+				 s->s_blocksize);
+		brelse(bh);
+		return 1;
+	}
 
-static int read_super_block (struct super_block * s, int offset)
-{
-    struct buffer_head * bh;
-    struct reiserfs_super_block * rs;
-    int fs_blocksize;
- 
-
-    bh = sb_bread (s, offset / s->s_blocksize);
-    if (!bh) {
-      reiserfs_warning (s, "sh-2006: read_super_block: "
-              "bread failed (dev %s, block %lu, size %lu)",
-              reiserfs_bdevname (s), offset / s->s_blocksize, s->s_blocksize);
-      return 1;
-    }
- 
-    rs = (struct reiserfs_super_block *)bh->b_data;
-    if (!is_any_reiserfs_magic_string (rs)) {
-      brelse (bh);
-      return 1;
-    }
- 
-    //
-    // ok, reiserfs signature (old or new) found in at the given offset
-    //    
-    fs_blocksize = sb_blocksize(rs);
-    brelse (bh);
-    sb_set_blocksize (s, fs_blocksize);
-    
-    bh = sb_bread (s, offset / s->s_blocksize);
-    if (!bh) {
-	reiserfs_warning (s, "sh-2007: read_super_block: "
-                "bread failed (dev %s, block %lu, size %lu)\n",
-                reiserfs_bdevname (s), offset / s->s_blocksize, s->s_blocksize);
-	return 1;
-    }
-    
-    rs = (struct reiserfs_super_block *)bh->b_data;
-    if (sb_blocksize(rs) != s->s_blocksize) {
-	reiserfs_warning (s, "sh-2011: read_super_block: "
-		"can't find a reiserfs filesystem on (dev %s, block %Lu, size %lu)\n",
-		reiserfs_bdevname (s), (unsigned long long)bh->b_blocknr, s->s_blocksize);
-	brelse (bh);
-	return 1;
-    }
-
-    if ( rs->s_v1.s_root_block == cpu_to_le32(-1) ) {
-       brelse(bh) ;
-       reiserfs_warning (s, "Unfinished reiserfsck --rebuild-tree run detected. Please run\n"
-              "reiserfsck --rebuild-tree and wait for a completion. If that fails\n"
-              "get newer reiserfsprogs package");
-       return 1;
-    }
-
-    SB_BUFFER_WITH_SB (s) = bh;
-    SB_DISK_SUPER_BLOCK (s) = rs;
-
-    if (is_reiserfs_jr (rs)) {
-	/* magic is of non-standard journal filesystem, look at s_version to
-	   find which format is in use */
-	if (sb_version(rs) == REISERFS_VERSION_2)
-	  reiserfs_warning (s, "read_super_block: found reiserfs format \"3.6\""
-		  " with non-standard journal");
-	else if (sb_version(rs) == REISERFS_VERSION_1)
-	  reiserfs_warning (s, "read_super_block: found reiserfs format \"3.5\""
-		  " with non-standard journal");
-	else {
-	  reiserfs_warning (s, "sh-2012: read_super_block: found unknown "
-			    "format \"%u\" of reiserfs with non-standard magic",
-			    sb_version(rs));
-	return 1;
+	if (rs->s_v1.s_root_block == cpu_to_le32(-1)) {
+		brelse(bh);
+		reiserfs_warning(s,
+				 "Unfinished reiserfsck --rebuild-tree run detected. Please run\n"
+				 "reiserfsck --rebuild-tree and wait for a completion. If that fails\n"
+				 "get newer reiserfsprogs package");
+		return 1;
 	}
-    }
-    else
-      /* s_version of standard format may contain incorrect information,
-	 so we just look at the magic string */
-      reiserfs_info (s, "found reiserfs format \"%s\" with standard journal\n",
-	      is_reiserfs_3_5 (rs) ? "3.5" : "3.6");
 
-    s->s_op = &reiserfs_sops;
-    s->s_export_op = &reiserfs_export_ops;
+	SB_BUFFER_WITH_SB(s) = bh;
+	SB_DISK_SUPER_BLOCK(s) = rs;
+
+	if (is_reiserfs_jr(rs)) {
+		/* magic is of non-standard journal filesystem, look at s_version to
+		   find which format is in use */
+		if (sb_version(rs) == REISERFS_VERSION_2)
+			reiserfs_warning(s,
+					 "read_super_block: found reiserfs format \"3.6\""
+					 " with non-standard journal");
+		else if (sb_version(rs) == REISERFS_VERSION_1)
+			reiserfs_warning(s,
+					 "read_super_block: found reiserfs format \"3.5\""
+					 " with non-standard journal");
+		else {
+			reiserfs_warning(s,
+					 "sh-2012: read_super_block: found unknown "
+					 "format \"%u\" of reiserfs with non-standard magic",
+					 sb_version(rs));
+			return 1;
+		}
+	} else
+		/* s_version of standard format may contain incorrect information,
+		   so we just look at the magic string */
+		reiserfs_info(s,
+			      "found reiserfs format \"%s\" with standard journal\n",
+			      is_reiserfs_3_5(rs) ? "3.5" : "3.6");
+
+	s->s_op = &reiserfs_sops;
+	s->s_export_op = &reiserfs_export_ops;
 #ifdef CONFIG_QUOTA
-    s->s_qcop = &reiserfs_qctl_operations;
-    s->dq_op = &reiserfs_quota_operations;
+	s->s_qcop = &reiserfs_qctl_operations;
+	s->dq_op = &reiserfs_quota_operations;
 #endif
 
-    /* new format is limited by the 32 bit wide i_blocks field, want to
-    ** be one full block below that.
-    */
-    s->s_maxbytes = (512LL << 32) - s->s_blocksize ;
-    return 0;
+	/* new format is limited by the 32 bit wide i_blocks field, want to
+	 ** be one full block below that.
+	 */
+	s->s_maxbytes = (512LL << 32) - s->s_blocksize;
+	return 0;
 }
 
-
-
 /* after journal replay, reread all bitmap and super blocks */
-static int reread_meta_blocks(struct super_block *s) {
-  int i ;
-  ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))) ;
-  wait_on_buffer(SB_BUFFER_WITH_SB(s)) ;
-  if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
-    reiserfs_warning (s, "reread_meta_blocks, error reading the super") ;
-    return 1 ;
-  }
-
-  for (i = 0; i < SB_BMAP_NR(s) ; i++) {
-    ll_rw_block(READ, 1, &(SB_AP_BITMAP(s)[i].bh)) ;
-    wait_on_buffer(SB_AP_BITMAP(s)[i].bh) ;
-    if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) {
-      reiserfs_warning (s, "reread_meta_blocks, error reading bitmap block number %d at %llu",
-        i, (unsigned long long)SB_AP_BITMAP(s)[i].bh->b_blocknr) ;
-      return 1 ;
-    }
-  }
-  return 0 ;
+static int reread_meta_blocks(struct super_block *s)
+{
+	int i;
+	ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
+	wait_on_buffer(SB_BUFFER_WITH_SB(s));
+	if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
+		reiserfs_warning(s,
+				 "reread_meta_blocks, error reading the super");
+		return 1;
+	}
 
-}
+	for (i = 0; i < SB_BMAP_NR(s); i++) {
+		ll_rw_block(READ, 1, &(SB_AP_BITMAP(s)[i].bh));
+		wait_on_buffer(SB_AP_BITMAP(s)[i].bh);
+		if (!buffer_uptodate(SB_AP_BITMAP(s)[i].bh)) {
+			reiserfs_warning(s,
+					 "reread_meta_blocks, error reading bitmap block number %d at %llu",
+					 i,
+					 (unsigned long long)SB_AP_BITMAP(s)[i].
+					 bh->b_blocknr);
+			return 1;
+		}
+	}
+	return 0;
 
+}
 
 /////////////////////////////////////////////////////
 // hash detection stuff
 
-
 // if root directory is empty - we set default - Yura's - hash and
 // warn about it
 // FIXME: we look for only one name in a directory. If tea and yura
 // bith have the same value - we ask user to send report to the
 // mailing list
-static __u32 find_hash_out (struct super_block * s)
+static __u32 find_hash_out(struct super_block *s)
 {
-    int retval;
-    struct inode * inode;
-    struct cpu_key key;
-    INITIALIZE_PATH (path);
-    struct reiserfs_dir_entry de;
-    __u32 hash = DEFAULT_HASH;
-
-    inode = s->s_root->d_inode;
-
-    do { // Some serious "goto"-hater was there ;)
-	u32 teahash, r5hash, yurahash;
-
-	make_cpu_key (&key, inode, ~0, TYPE_DIRENTRY, 3);
-	retval = search_by_entry_key (s, &key, &path, &de);
-	if (retval == IO_ERROR) {
-	    pathrelse (&path);
-	    return UNSET_HASH ;
-	}
-	if (retval == NAME_NOT_FOUND)
-	    de.de_entry_num --;
-	set_de_name_and_namelen (&de);
-	if (deh_offset( &(de.de_deh[de.de_entry_num]) ) == DOT_DOT_OFFSET) {
-	    /* allow override in this case */
-	    if (reiserfs_rupasov_hash(s)) {
-		hash = YURA_HASH ;
-	    }
-	    reiserfs_warning(s,"FS seems to be empty, autodetect "
-	                     "is using the default hash");
-	    break;
-	}
-	r5hash=GET_HASH_VALUE (r5_hash (de.de_name, de.de_namelen));
-	teahash=GET_HASH_VALUE (keyed_hash (de.de_name, de.de_namelen));
-	yurahash=GET_HASH_VALUE (yura_hash (de.de_name, de.de_namelen));
-	if ( ( (teahash == r5hash) && (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num]))) == r5hash) ) ||
-	     ( (teahash == yurahash) && (yurahash == GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])))) ) ||
-	     ( (r5hash == yurahash) && (yurahash == GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])))) ) ) {
-	    reiserfs_warning(s,"Unable to automatically detect hash function. "
-			     "Please mount with -o hash={tea,rupasov,r5}",
-			     reiserfs_bdevname (s));
-	    hash = UNSET_HASH;
-	    break;
-	}
-	if (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])) ) == yurahash)
-	    hash = YURA_HASH;
-	else if (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])) ) == teahash)
-	    hash = TEA_HASH;
-	else if (GET_HASH_VALUE( deh_offset(&(de.de_deh[de.de_entry_num])) ) == r5hash)
-	    hash = R5_HASH;
-	else {
-	    reiserfs_warning (s,"Unrecognised hash function");
-	    hash = UNSET_HASH;
-	}
-    } while (0);
-
-    pathrelse (&path);
-    return hash;
+	int retval;
+	struct inode *inode;
+	struct cpu_key key;
+	INITIALIZE_PATH(path);
+	struct reiserfs_dir_entry de;
+	__u32 hash = DEFAULT_HASH;
+
+	inode = s->s_root->d_inode;
+
+	do {			// Some serious "goto"-hater was there ;)
+		u32 teahash, r5hash, yurahash;
+
+		make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3);
+		retval = search_by_entry_key(s, &key, &path, &de);
+		if (retval == IO_ERROR) {
+			pathrelse(&path);
+			return UNSET_HASH;
+		}
+		if (retval == NAME_NOT_FOUND)
+			de.de_entry_num--;
+		set_de_name_and_namelen(&de);
+		if (deh_offset(&(de.de_deh[de.de_entry_num])) == DOT_DOT_OFFSET) {
+			/* allow override in this case */
+			if (reiserfs_rupasov_hash(s)) {
+				hash = YURA_HASH;
+			}
+			reiserfs_warning(s, "FS seems to be empty, autodetect "
+					 "is using the default hash");
+			break;
+		}
+		r5hash = GET_HASH_VALUE(r5_hash(de.de_name, de.de_namelen));
+		teahash = GET_HASH_VALUE(keyed_hash(de.de_name, de.de_namelen));
+		yurahash = GET_HASH_VALUE(yura_hash(de.de_name, de.de_namelen));
+		if (((teahash == r5hash)
+		     &&
+		     (GET_HASH_VALUE(deh_offset(&(de.de_deh[de.de_entry_num])))
+		      == r5hash)) || ((teahash == yurahash)
+				      && (yurahash ==
+					  GET_HASH_VALUE(deh_offset
+							 (&
+							  (de.
+							   de_deh[de.
+								  de_entry_num])))))
+		    || ((r5hash == yurahash)
+			&& (yurahash ==
+			    GET_HASH_VALUE(deh_offset
+					   (&(de.de_deh[de.de_entry_num])))))) {
+			reiserfs_warning(s,
+					 "Unable to automatically detect hash function. "
+					 "Please mount with -o hash={tea,rupasov,r5}",
+					 reiserfs_bdevname(s));
+			hash = UNSET_HASH;
+			break;
+		}
+		if (GET_HASH_VALUE(deh_offset(&(de.de_deh[de.de_entry_num]))) ==
+		    yurahash)
+			hash = YURA_HASH;
+		else if (GET_HASH_VALUE
+			 (deh_offset(&(de.de_deh[de.de_entry_num]))) == teahash)
+			hash = TEA_HASH;
+		else if (GET_HASH_VALUE
+			 (deh_offset(&(de.de_deh[de.de_entry_num]))) == r5hash)
+			hash = R5_HASH;
+		else {
+			reiserfs_warning(s, "Unrecognised hash function");
+			hash = UNSET_HASH;
+		}
+	} while (0);
+
+	pathrelse(&path);
+	return hash;
 }
 
 // finds out which hash names are sorted with
-static int what_hash (struct super_block * s)
+static int what_hash(struct super_block *s)
 {
-    __u32 code;
-
-    code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s));
-
-    /* reiserfs_hash_detect() == true if any of the hash mount options
-    ** were used.  We must check them to make sure the user isn't
-    ** using a bad hash value
-    */
-    if (code == UNSET_HASH || reiserfs_hash_detect(s))
-	code = find_hash_out (s);
-
-    if (code != UNSET_HASH && reiserfs_hash_detect(s)) {
-	/* detection has found the hash, and we must check against the 
-	** mount options 
-	*/
-	if (reiserfs_rupasov_hash(s) && code != YURA_HASH) {
-	    reiserfs_warning (s, "Error, %s hash detected, "
-		   "unable to force rupasov hash", reiserfs_hashname(code)) ;
-	    code = UNSET_HASH ;
-	} else if (reiserfs_tea_hash(s) && code != TEA_HASH) {
-	    reiserfs_warning (s, "Error, %s hash detected, "
-		   "unable to force tea hash", reiserfs_hashname(code)) ;
-	    code = UNSET_HASH ;
-	} else if (reiserfs_r5_hash(s) && code != R5_HASH) {
-	    reiserfs_warning (s, "Error, %s hash detected, "
-		   "unable to force r5 hash", reiserfs_hashname(code)) ;
-	    code = UNSET_HASH ;
-	} 
-    } else { 
-        /* find_hash_out was not called or could not determine the hash */
-	if (reiserfs_rupasov_hash(s)) {
-	    code = YURA_HASH ;
-	} else if (reiserfs_tea_hash(s)) {
-	    code = TEA_HASH ;
-	} else if (reiserfs_r5_hash(s)) {
-	    code = R5_HASH ;
-	} 
-    }
-
-    /* if we are mounted RW, and we have a new valid hash code, update 
-    ** the super
-    */
-    if (code != UNSET_HASH && 
-	!(s->s_flags & MS_RDONLY) && 
-        code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) {
-        set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code);
-    }
-    return code;
+	__u32 code;
+
+	code = sb_hash_function_code(SB_DISK_SUPER_BLOCK(s));
+
+	/* reiserfs_hash_detect() == true if any of the hash mount options
+	 ** were used.  We must check them to make sure the user isn't
+	 ** using a bad hash value
+	 */
+	if (code == UNSET_HASH || reiserfs_hash_detect(s))
+		code = find_hash_out(s);
+
+	if (code != UNSET_HASH && reiserfs_hash_detect(s)) {
+		/* detection has found the hash, and we must check against the 
+		 ** mount options 
+		 */
+		if (reiserfs_rupasov_hash(s) && code != YURA_HASH) {
+			reiserfs_warning(s, "Error, %s hash detected, "
+					 "unable to force rupasov hash",
+					 reiserfs_hashname(code));
+			code = UNSET_HASH;
+		} else if (reiserfs_tea_hash(s) && code != TEA_HASH) {
+			reiserfs_warning(s, "Error, %s hash detected, "
+					 "unable to force tea hash",
+					 reiserfs_hashname(code));
+			code = UNSET_HASH;
+		} else if (reiserfs_r5_hash(s) && code != R5_HASH) {
+			reiserfs_warning(s, "Error, %s hash detected, "
+					 "unable to force r5 hash",
+					 reiserfs_hashname(code));
+			code = UNSET_HASH;
+		}
+	} else {
+		/* find_hash_out was not called or could not determine the hash */
+		if (reiserfs_rupasov_hash(s)) {
+			code = YURA_HASH;
+		} else if (reiserfs_tea_hash(s)) {
+			code = TEA_HASH;
+		} else if (reiserfs_r5_hash(s)) {
+			code = R5_HASH;
+		}
+	}
+
+	/* if we are mounted RW, and we have a new valid hash code, update 
+	 ** the super
+	 */
+	if (code != UNSET_HASH &&
+	    !(s->s_flags & MS_RDONLY) &&
+	    code != sb_hash_function_code(SB_DISK_SUPER_BLOCK(s))) {
+		set_sb_hash_function_code(SB_DISK_SUPER_BLOCK(s), code);
+	}
+	return code;
 }
 
 // return pointer to appropriate function
-static hashf_t hash_function (struct super_block * s)
+static hashf_t hash_function(struct super_block *s)
 {
-    switch (what_hash (s)) {
-    case TEA_HASH:
-	reiserfs_info (s, "Using tea hash to sort names\n");
-	return keyed_hash;
-    case YURA_HASH:
-	reiserfs_info (s, "Using rupasov hash to sort names\n");
-	return yura_hash;
-    case R5_HASH:
-	reiserfs_info (s, "Using r5 hash to sort names\n");
-	return r5_hash;
-    }
-    return NULL;
+	switch (what_hash(s)) {
+	case TEA_HASH:
+		reiserfs_info(s, "Using tea hash to sort names\n");
+		return keyed_hash;
+	case YURA_HASH:
+		reiserfs_info(s, "Using rupasov hash to sort names\n");
+		return yura_hash;
+	case R5_HASH:
+		reiserfs_info(s, "Using r5 hash to sort names\n");
+		return r5_hash;
+	}
+	return NULL;
 }
 
 // this is used to set up correct value for old partitions
-static int function2code (hashf_t func)
+static int function2code(hashf_t func)
 {
-    if (func == keyed_hash)
-	return TEA_HASH;
-    if (func == yura_hash)
-	return YURA_HASH;
-    if (func == r5_hash)
-	return R5_HASH;
+	if (func == keyed_hash)
+		return TEA_HASH;
+	if (func == yura_hash)
+		return YURA_HASH;
+	if (func == r5_hash)
+		return R5_HASH;
 
-    BUG() ; // should never happen
+	BUG();			// should never happen
 
-    return 0;
+	return 0;
 }
 
 #define SWARN(silent, s, ...)			\
 	if (!(silent))				\
 		reiserfs_warning (s, __VA_ARGS__)
 
-static int reiserfs_fill_super (struct super_block * s, void * data, int silent)
+static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
 {
-    struct inode *root_inode;
-    int j;
-    struct reiserfs_transaction_handle th ;
-    int old_format = 0;
-    unsigned long blocks;
-    unsigned int commit_max_age = 0;
-    int jinit_done = 0 ;
-    struct reiserfs_iget_args args ;
-    struct reiserfs_super_block * rs;
-    char *jdev_name;
-    struct reiserfs_sb_info *sbi;
-    int errval = -EINVAL;
-
-    sbi = kmalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
-    if (!sbi) {
-	errval = -ENOMEM;
-	goto error;
-    }
-    s->s_fs_info = sbi;
-    memset (sbi, 0, sizeof (struct reiserfs_sb_info));
-    /* Set default values for options: non-aggressive tails, RO on errors */
-    REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
-    REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ERROR_RO);
-    /* no preallocation minimum, be smart in
-       reiserfs_file_write instead */
-    REISERFS_SB(s)->s_alloc_options.preallocmin = 0;
-    /* Preallocate by 16 blocks (17-1) at once */
-    REISERFS_SB(s)->s_alloc_options.preallocsize = 17;
-    /* Initialize the rwsem for xattr dir */
-    init_rwsem(&REISERFS_SB(s)->xattr_dir_sem);
-
-    /* setup default block allocator options */
-    reiserfs_init_alloc_options(s);
-
-    jdev_name = NULL;
-    if (reiserfs_parse_options (s, (char *) data, &(sbi->s_mount_opt), &blocks, &jdev_name, &commit_max_age) == 0) {
-	goto error;
-    }
-
-    if (blocks) {
-	SWARN (silent, s, "jmacd-7: reiserfs_fill_super: resize option "
-	       "for remount only");
-	goto error;
-    }	
-
-    /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */
-    if (!read_super_block (s, REISERFS_OLD_DISK_OFFSET_IN_BYTES))
-      old_format = 1;
-    /* try new format (64-th 1k block), which can contain reiserfs super block */
-    else if (read_super_block (s, REISERFS_DISK_OFFSET_IN_BYTES)) {
-      SWARN(silent, s, "sh-2021: reiserfs_fill_super: can not find reiserfs on %s", reiserfs_bdevname (s));
-      goto error;
-    }
-
-    rs = SB_DISK_SUPER_BLOCK (s);
-    /* Let's do basic sanity check to verify that underlying device is not
-       smaller than the filesystem. If the check fails then abort and scream,
-       because bad stuff will happen otherwise. */
-    if ( s->s_bdev && s->s_bdev->bd_inode && i_size_read(s->s_bdev->bd_inode) < sb_block_count(rs)*sb_blocksize(rs)) {
-	SWARN (silent, s, "Filesystem on %s cannot be mounted because it is bigger than the device", reiserfs_bdevname(s));
-	SWARN(silent, s, "You may need to run fsck or increase size of your LVM partition");
-	SWARN(silent, s, "Or may be you forgot to reboot after fdisk when it told you to");
-	goto error;
-    }
-
-    sbi->s_mount_state = SB_REISERFS_STATE(s);
-    sbi->s_mount_state = REISERFS_VALID_FS ;
-
-    if (old_format ? read_old_bitmaps(s) : read_bitmaps(s)) {
-	SWARN(silent, s, "jmacd-8: reiserfs_fill_super: unable to read bitmap");
-	goto error;
-    }
+	struct inode *root_inode;
+	int j;
+	struct reiserfs_transaction_handle th;
+	int old_format = 0;
+	unsigned long blocks;
+	unsigned int commit_max_age = 0;
+	int jinit_done = 0;
+	struct reiserfs_iget_args args;
+	struct reiserfs_super_block *rs;
+	char *jdev_name;
+	struct reiserfs_sb_info *sbi;
+	int errval = -EINVAL;
+
+	sbi = kmalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
+	if (!sbi) {
+		errval = -ENOMEM;
+		goto error;
+	}
+	s->s_fs_info = sbi;
+	memset(sbi, 0, sizeof(struct reiserfs_sb_info));
+	/* Set default values for options: non-aggressive tails, RO on errors */
+	REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
+	REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ERROR_RO);
+	/* no preallocation minimum, be smart in
+	   reiserfs_file_write instead */
+	REISERFS_SB(s)->s_alloc_options.preallocmin = 0;
+	/* Preallocate by 16 blocks (17-1) at once */
+	REISERFS_SB(s)->s_alloc_options.preallocsize = 17;
+	/* Initialize the rwsem for xattr dir */
+	init_rwsem(&REISERFS_SB(s)->xattr_dir_sem);
+
+	/* setup default block allocator options */
+	reiserfs_init_alloc_options(s);
+
+	jdev_name = NULL;
+	if (reiserfs_parse_options
+	    (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
+	     &commit_max_age) == 0) {
+		goto error;
+	}
+
+	if (blocks) {
+		SWARN(silent, s, "jmacd-7: reiserfs_fill_super: resize option "
+		      "for remount only");
+		goto error;
+	}
+
+	/* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */
+	if (!read_super_block(s, REISERFS_OLD_DISK_OFFSET_IN_BYTES))
+		old_format = 1;
+	/* try new format (64-th 1k block), which can contain reiserfs super block */
+	else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) {
+		SWARN(silent, s,
+		      "sh-2021: reiserfs_fill_super: can not find reiserfs on %s",
+		      reiserfs_bdevname(s));
+		goto error;
+	}
+
+	rs = SB_DISK_SUPER_BLOCK(s);
+	/* Let's do basic sanity check to verify that underlying device is not
+	   smaller than the filesystem. If the check fails then abort and scream,
+	   because bad stuff will happen otherwise. */
+	if (s->s_bdev && s->s_bdev->bd_inode
+	    && i_size_read(s->s_bdev->bd_inode) <
+	    sb_block_count(rs) * sb_blocksize(rs)) {
+		SWARN(silent, s,
+		      "Filesystem on %s cannot be mounted because it is bigger than the device",
+		      reiserfs_bdevname(s));
+		SWARN(silent, s,
+		      "You may need to run fsck or increase size of your LVM partition");
+		SWARN(silent, s,
+		      "Or may be you forgot to reboot after fdisk when it told you to");
+		goto error;
+	}
+
+	sbi->s_mount_state = SB_REISERFS_STATE(s);
+	sbi->s_mount_state = REISERFS_VALID_FS;
+
+	if (old_format ? read_old_bitmaps(s) : read_bitmaps(s)) {
+		SWARN(silent, s,
+		      "jmacd-8: reiserfs_fill_super: unable to read bitmap");
+		goto error;
+	}
 #ifdef CONFIG_REISERFS_CHECK
-    SWARN (silent, s, "CONFIG_REISERFS_CHECK is set ON");
-    SWARN (silent, s, "- it is slow mode for debugging.");
+	SWARN(silent, s, "CONFIG_REISERFS_CHECK is set ON");
+	SWARN(silent, s, "- it is slow mode for debugging.");
 #endif
 
-    /* make data=ordered the default */
-    if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
-        !reiserfs_data_writeback(s))
-    {
-         REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
-    }
-
-    if (reiserfs_data_log(s)) {
-        reiserfs_info (s, "using journaled data mode\n");
-    } else if (reiserfs_data_ordered(s)) {
-        reiserfs_info (s, "using ordered data mode\n");
-    } else {
-        reiserfs_info (s, "using writeback data mode\n");
-    }
-    if (reiserfs_barrier_flush(s)) {
-    	printk("reiserfs: using flush barriers\n");
-    }
-
-    // set_device_ro(s->s_dev, 1) ;
-    if( journal_init(s, jdev_name, old_format, commit_max_age) ) {
-	SWARN(silent, s, "sh-2022: reiserfs_fill_super: unable to initialize journal space") ;
-	goto error ;
-    } else {
-	jinit_done = 1 ; /* once this is set, journal_release must be called
-			 ** if we error out of the mount
-			 */
-    }
-    if (reread_meta_blocks(s)) {
-	SWARN(silent, s, "jmacd-9: reiserfs_fill_super: unable to reread meta blocks after journal init") ;
-	goto error ;
-    }
-
-    if (replay_only (s))
-	goto error;
-
-    if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) {
-        SWARN(silent, s, "clm-7000: Detected readonly device, marking FS readonly") ;
-	s->s_flags |= MS_RDONLY ;
-    }
-    args.objectid = REISERFS_ROOT_OBJECTID ;
-    args.dirid = REISERFS_ROOT_PARENT_OBJECTID ;
-    root_inode = iget5_locked (s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor, reiserfs_init_locked_inode, (void *)(&args));
-    if (!root_inode) {
-	SWARN(silent, s, "jmacd-10: reiserfs_fill_super: get root inode failed");
-	goto error;
-    }
-
-    if (root_inode->i_state & I_NEW) {
-	reiserfs_read_locked_inode(root_inode, &args);
-	unlock_new_inode(root_inode);
-    }
-
-    s->s_root = d_alloc_root(root_inode);  
-    if (!s->s_root) {
-	iput(root_inode);
-	goto error;
-    }
-
-    // define and initialize hash function
-    sbi->s_hash_function = hash_function (s);
-    if (sbi->s_hash_function == NULL) {
-      dput(s->s_root) ;
-      s->s_root = NULL ;
-      goto error ;
-    }
-
-    if (is_reiserfs_3_5 (rs) || (is_reiserfs_jr (rs) && SB_VERSION (s) == REISERFS_VERSION_1))
-	set_bit(REISERFS_3_5, &(sbi->s_properties));
-    else
-	set_bit(REISERFS_3_6, &(sbi->s_properties));
-    
-    if (!(s->s_flags & MS_RDONLY)) {
-
-	errval = journal_begin(&th, s, 1) ;
-        if (errval) {
-	    dput (s->s_root);
-	    s->s_root = NULL;
-	    goto error;
-        }
-	reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
-
-        set_sb_umount_state( rs, REISERFS_ERROR_FS );
-	set_sb_fs_state (rs, 0);
-	
-	if (old_format_only(s)) {
-	  /* filesystem of format 3.5 either with standard or non-standard
-	     journal */
-	  if (convert_reiserfs (s)) {
-	    /* and -o conv is given */
-	    if(!silent)
-	      reiserfs_info (s,"converting 3.5 filesystem to the 3.6 format") ;
-
-	    if (is_reiserfs_3_5 (rs))
-	      /* put magic string of 3.6 format. 2.2 will not be able to
-		 mount this filesystem anymore */
-	      memcpy (rs->s_v1.s_magic, reiserfs_3_6_magic_string,
-		      sizeof (reiserfs_3_6_magic_string));
-
-	    set_sb_version(rs,REISERFS_VERSION_2);
-	    reiserfs_convert_objectid_map_v1(s) ;
-	    set_bit(REISERFS_3_6, &(sbi->s_properties));
-	    clear_bit(REISERFS_3_5, &(sbi->s_properties));
-	  } else if (!silent){
-	    reiserfs_info (s, "using 3.5.x disk format\n") ;
-	  }
-	}
-
-	journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
-	errval = journal_end(&th, s, 1) ;
-	if (errval) {
-	    dput (s->s_root);
-	    s->s_root = NULL;
-	    goto error;
-	}
-
-	if ((errval = reiserfs_xattr_init (s, s->s_flags))) {
-	    dput (s->s_root);
-	    s->s_root = NULL;
-	    goto error;
-	}
-
-	/* look for files which were to be removed in previous session */
-	finish_unfinished (s);
-    } else {
-	if ( old_format_only(s) && !silent) {
-	    reiserfs_info (s, "using 3.5.x disk format\n") ;
-	}
-
-	if ((errval = reiserfs_xattr_init (s, s->s_flags))) {
-	    dput (s->s_root);
-	    s->s_root = NULL;
-	    goto error;
-	}
-    }
-    // mark hash in super block: it could be unset. overwrite should be ok
-    set_sb_hash_function_code( rs, function2code(sbi->s_hash_function ) );
-
-    handle_attrs( s );
-
-    reiserfs_proc_info_init( s );
-
-    init_waitqueue_head (&(sbi->s_wait));
-    spin_lock_init(&sbi->bitmap_lock);
-
-    return (0);
-
- error:
-    if (jinit_done) { /* kill the commit thread, free journal ram */
-	journal_release_error(NULL, s) ;
-    }
-    if (SB_DISK_SUPER_BLOCK (s)) {
-	for (j = 0; j < SB_BMAP_NR (s); j ++) {
-	    if (SB_AP_BITMAP (s))
-		brelse (SB_AP_BITMAP (s)[j].bh);
-	}
-	if (SB_AP_BITMAP (s))
-	    vfree (SB_AP_BITMAP (s));
-    }
-    if (SB_BUFFER_WITH_SB (s))
-	brelse(SB_BUFFER_WITH_SB (s));
+	/* make data=ordered the default */
+	if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
+	    !reiserfs_data_writeback(s)) {
+		REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
+	}
+
+	if (reiserfs_data_log(s)) {
+		reiserfs_info(s, "using journaled data mode\n");
+	} else if (reiserfs_data_ordered(s)) {
+		reiserfs_info(s, "using ordered data mode\n");
+	} else {
+		reiserfs_info(s, "using writeback data mode\n");
+	}
+	if (reiserfs_barrier_flush(s)) {
+		printk("reiserfs: using flush barriers\n");
+	}
+	// set_device_ro(s->s_dev, 1) ;
+	if (journal_init(s, jdev_name, old_format, commit_max_age)) {
+		SWARN(silent, s,
+		      "sh-2022: reiserfs_fill_super: unable to initialize journal space");
+		goto error;
+	} else {
+		jinit_done = 1;	/* once this is set, journal_release must be called
+				 ** if we error out of the mount
+				 */
+	}
+	if (reread_meta_blocks(s)) {
+		SWARN(silent, s,
+		      "jmacd-9: reiserfs_fill_super: unable to reread meta blocks after journal init");
+		goto error;
+	}
+
+	if (replay_only(s))
+		goto error;
+
+	if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) {
+		SWARN(silent, s,
+		      "clm-7000: Detected readonly device, marking FS readonly");
+		s->s_flags |= MS_RDONLY;
+	}
+	args.objectid = REISERFS_ROOT_OBJECTID;
+	args.dirid = REISERFS_ROOT_PARENT_OBJECTID;
+	root_inode =
+	    iget5_locked(s, REISERFS_ROOT_OBJECTID, reiserfs_find_actor,
+			 reiserfs_init_locked_inode, (void *)(&args));
+	if (!root_inode) {
+		SWARN(silent, s,
+		      "jmacd-10: reiserfs_fill_super: get root inode failed");
+		goto error;
+	}
+
+	if (root_inode->i_state & I_NEW) {
+		reiserfs_read_locked_inode(root_inode, &args);
+		unlock_new_inode(root_inode);
+	}
+
+	s->s_root = d_alloc_root(root_inode);
+	if (!s->s_root) {
+		iput(root_inode);
+		goto error;
+	}
+	// define and initialize hash function
+	sbi->s_hash_function = hash_function(s);
+	if (sbi->s_hash_function == NULL) {
+		dput(s->s_root);
+		s->s_root = NULL;
+		goto error;
+	}
+
+	if (is_reiserfs_3_5(rs)
+	    || (is_reiserfs_jr(rs) && SB_VERSION(s) == REISERFS_VERSION_1))
+		set_bit(REISERFS_3_5, &(sbi->s_properties));
+	else
+		set_bit(REISERFS_3_6, &(sbi->s_properties));
+
+	if (!(s->s_flags & MS_RDONLY)) {
+
+		errval = journal_begin(&th, s, 1);
+		if (errval) {
+			dput(s->s_root);
+			s->s_root = NULL;
+			goto error;
+		}
+		reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
+
+		set_sb_umount_state(rs, REISERFS_ERROR_FS);
+		set_sb_fs_state(rs, 0);
+
+		if (old_format_only(s)) {
+			/* filesystem of format 3.5 either with standard or non-standard
+			   journal */
+			if (convert_reiserfs(s)) {
+				/* and -o conv is given */
+				if (!silent)
+					reiserfs_info(s,
+						      "converting 3.5 filesystem to the 3.6 format");
+
+				if (is_reiserfs_3_5(rs))
+					/* put magic string of 3.6 format. 2.2 will not be able to
+					   mount this filesystem anymore */
+					memcpy(rs->s_v1.s_magic,
+					       reiserfs_3_6_magic_string,
+					       sizeof
+					       (reiserfs_3_6_magic_string));
+
+				set_sb_version(rs, REISERFS_VERSION_2);
+				reiserfs_convert_objectid_map_v1(s);
+				set_bit(REISERFS_3_6, &(sbi->s_properties));
+				clear_bit(REISERFS_3_5, &(sbi->s_properties));
+			} else if (!silent) {
+				reiserfs_info(s, "using 3.5.x disk format\n");
+			}
+		}
+
+		journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
+		errval = journal_end(&th, s, 1);
+		if (errval) {
+			dput(s->s_root);
+			s->s_root = NULL;
+			goto error;
+		}
+
+		if ((errval = reiserfs_xattr_init(s, s->s_flags))) {
+			dput(s->s_root);
+			s->s_root = NULL;
+			goto error;
+		}
+
+		/* look for files which were to be removed in previous session */
+		finish_unfinished(s);
+	} else {
+		if (old_format_only(s) && !silent) {
+			reiserfs_info(s, "using 3.5.x disk format\n");
+		}
+
+		if ((errval = reiserfs_xattr_init(s, s->s_flags))) {
+			dput(s->s_root);
+			s->s_root = NULL;
+			goto error;
+		}
+	}
+	// mark hash in super block: it could be unset. overwrite should be ok
+	set_sb_hash_function_code(rs, function2code(sbi->s_hash_function));
+
+	handle_attrs(s);
+
+	reiserfs_proc_info_init(s);
+
+	init_waitqueue_head(&(sbi->s_wait));
+	spin_lock_init(&sbi->bitmap_lock);
+
+	return (0);
+
+      error:
+	if (jinit_done) {	/* kill the commit thread, free journal ram */
+		journal_release_error(NULL, s);
+	}
+	if (SB_DISK_SUPER_BLOCK(s)) {
+		for (j = 0; j < SB_BMAP_NR(s); j++) {
+			if (SB_AP_BITMAP(s))
+				brelse(SB_AP_BITMAP(s)[j].bh);
+		}
+		if (SB_AP_BITMAP(s))
+			vfree(SB_AP_BITMAP(s));
+	}
+	if (SB_BUFFER_WITH_SB(s))
+		brelse(SB_BUFFER_WITH_SB(s));
 #ifdef CONFIG_QUOTA
-    for (j = 0; j < MAXQUOTAS; j++) {
-	if (sbi->s_qf_names[j])
-	    kfree(sbi->s_qf_names[j]);
-    }
+	for (j = 0; j < MAXQUOTAS; j++) {
+		if (sbi->s_qf_names[j])
+			kfree(sbi->s_qf_names[j]);
+	}
 #endif
-    if (sbi != NULL) {
-	kfree(sbi);
-    }
+	if (sbi != NULL) {
+		kfree(sbi);
+	}
 
-    s->s_fs_info = NULL;
-    return errval;
+	s->s_fs_info = NULL;
+	return errval;
 }
 
-
-static int reiserfs_statfs (struct super_block * s, struct kstatfs * buf)
+static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf)
 {
-  struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s);
-  
-  buf->f_namelen = (REISERFS_MAX_NAME (s->s_blocksize));
-  buf->f_bfree   = sb_free_blocks(rs);
-  buf->f_bavail  = buf->f_bfree;
-  buf->f_blocks  = sb_block_count(rs) - sb_bmap_nr(rs) - 1;
-  buf->f_bsize   = s->s_blocksize;
-  /* changed to accommodate gcc folks.*/
-  buf->f_type    =  REISERFS_SUPER_MAGIC;
-  return 0;
+	struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s);
+
+	buf->f_namelen = (REISERFS_MAX_NAME(s->s_blocksize));
+	buf->f_bfree = sb_free_blocks(rs);
+	buf->f_bavail = buf->f_bfree;
+	buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1;
+	buf->f_bsize = s->s_blocksize;
+	/* changed to accommodate gcc folks. */
+	buf->f_type = REISERFS_SUPER_MAGIC;
+	return 0;
 }
 
 #ifdef CONFIG_QUOTA
 static int reiserfs_dquot_initialize(struct inode *inode, int type)
 {
-    struct reiserfs_transaction_handle th;
-    int ret, err;
-
-    /* We may create quota structure so we need to reserve enough blocks */
-    reiserfs_write_lock(inode->i_sb);
-    ret = journal_begin(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb));
-    if (ret)
-	goto out;
-    ret = dquot_initialize(inode, type);
-    err = journal_end(&th, inode->i_sb, 2*REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb));
-    if (!ret && err)
-	ret = err;
-out:
-    reiserfs_write_unlock(inode->i_sb);
-    return ret;
+	struct reiserfs_transaction_handle th;
+	int ret, err;
+
+	/* We may create quota structure so we need to reserve enough blocks */
+	reiserfs_write_lock(inode->i_sb);
+	ret =
+	    journal_begin(&th, inode->i_sb,
+			  2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb));
+	if (ret)
+		goto out;
+	ret = dquot_initialize(inode, type);
+	err =
+	    journal_end(&th, inode->i_sb,
+			2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb));
+	if (!ret && err)
+		ret = err;
+      out:
+	reiserfs_write_unlock(inode->i_sb);
+	return ret;
 }
 
 static int reiserfs_dquot_drop(struct inode *inode)
 {
-    struct reiserfs_transaction_handle th;
-    int ret, err;
-
-    /* We may delete quota structure so we need to reserve enough blocks */
-    reiserfs_write_lock(inode->i_sb);
-    ret = journal_begin(&th, inode->i_sb, 2*REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb));
-    if (ret)
- 	goto out;
-    ret = dquot_drop(inode);
-    err = journal_end(&th, inode->i_sb, 2*REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb));
-    if (!ret && err)
-	ret = err;
-out:
-    reiserfs_write_unlock(inode->i_sb);
-    return ret;
+	struct reiserfs_transaction_handle th;
+	int ret, err;
+
+	/* We may delete quota structure so we need to reserve enough blocks */
+	reiserfs_write_lock(inode->i_sb);
+	ret =
+	    journal_begin(&th, inode->i_sb,
+			  2 * REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb));
+	if (ret)
+		goto out;
+	ret = dquot_drop(inode);
+	err =
+	    journal_end(&th, inode->i_sb,
+			2 * REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb));
+	if (!ret && err)
+		ret = err;
+      out:
+	reiserfs_write_unlock(inode->i_sb);
+	return ret;
 }
 
 static int reiserfs_write_dquot(struct dquot *dquot)
 {
-    struct reiserfs_transaction_handle th;
-    int ret, err;
-
-    reiserfs_write_lock(dquot->dq_sb);
-    ret = journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
-    if (ret)
-	goto out;
-    ret = dquot_commit(dquot);
-    err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
-    if (!ret && err)
-	ret = err;
-out:
-    reiserfs_write_unlock(dquot->dq_sb);
-    return ret;
+	struct reiserfs_transaction_handle th;
+	int ret, err;
+
+	reiserfs_write_lock(dquot->dq_sb);
+	ret =
+	    journal_begin(&th, dquot->dq_sb,
+			  REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+	if (ret)
+		goto out;
+	ret = dquot_commit(dquot);
+	err =
+	    journal_end(&th, dquot->dq_sb,
+			REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+	if (!ret && err)
+		ret = err;
+      out:
+	reiserfs_write_unlock(dquot->dq_sb);
+	return ret;
 }
 
 static int reiserfs_acquire_dquot(struct dquot *dquot)
 {
-    struct reiserfs_transaction_handle th;
-    int ret, err;
-
-    reiserfs_write_lock(dquot->dq_sb);
-    ret = journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
-    if (ret)
-	goto out;
-    ret = dquot_acquire(dquot);
-    err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
-    if (!ret && err)
-	ret = err;
-out:
-    reiserfs_write_unlock(dquot->dq_sb);
-    return ret;
+	struct reiserfs_transaction_handle th;
+	int ret, err;
+
+	reiserfs_write_lock(dquot->dq_sb);
+	ret =
+	    journal_begin(&th, dquot->dq_sb,
+			  REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
+	if (ret)
+		goto out;
+	ret = dquot_acquire(dquot);
+	err =
+	    journal_end(&th, dquot->dq_sb,
+			REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
+	if (!ret && err)
+		ret = err;
+      out:
+	reiserfs_write_unlock(dquot->dq_sb);
+	return ret;
 }
 
 static int reiserfs_release_dquot(struct dquot *dquot)
 {
-    struct reiserfs_transaction_handle th;
-    int ret, err;
-
-    reiserfs_write_lock(dquot->dq_sb);
-    ret = journal_begin(&th, dquot->dq_sb, REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
-    if (ret)
- 	goto out;
-    ret = dquot_release(dquot);
-    err = journal_end(&th, dquot->dq_sb, REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
-    if (!ret && err)
-	ret = err;
-out:
-    reiserfs_write_unlock(dquot->dq_sb);
-    return ret;
+	struct reiserfs_transaction_handle th;
+	int ret, err;
+
+	reiserfs_write_lock(dquot->dq_sb);
+	ret =
+	    journal_begin(&th, dquot->dq_sb,
+			  REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+	if (ret)
+		goto out;
+	ret = dquot_release(dquot);
+	err =
+	    journal_end(&th, dquot->dq_sb,
+			REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+	if (!ret && err)
+		ret = err;
+      out:
+	reiserfs_write_unlock(dquot->dq_sb);
+	return ret;
 }
 
 static int reiserfs_mark_dquot_dirty(struct dquot *dquot)
 {
-    /* Are we journalling quotas? */
-    if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
-        REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
-	dquot_mark_dquot_dirty(dquot);
-	return reiserfs_write_dquot(dquot);
-    }
-    else
-	return dquot_mark_dquot_dirty(dquot);
+	/* Are we journalling quotas? */
+	if (REISERFS_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
+	    REISERFS_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
+		dquot_mark_dquot_dirty(dquot);
+		return reiserfs_write_dquot(dquot);
+	} else
+		return dquot_mark_dquot_dirty(dquot);
 }
 
 static int reiserfs_write_info(struct super_block *sb, int type)
 {
-    struct reiserfs_transaction_handle th;
-    int ret, err;
-
-    /* Data block + inode block */
-    reiserfs_write_lock(sb);
-    ret = journal_begin(&th, sb, 2);
-    if (ret)
-	goto out;
-    ret = dquot_commit_info(sb, type);
-    err = journal_end(&th, sb, 2);
-    if (!ret && err)
-	ret = err;
-out:
-    reiserfs_write_unlock(sb);
-    return ret;
+	struct reiserfs_transaction_handle th;
+	int ret, err;
+
+	/* Data block + inode block */
+	reiserfs_write_lock(sb);
+	ret = journal_begin(&th, sb, 2);
+	if (ret)
+		goto out;
+	ret = dquot_commit_info(sb, type);
+	err = journal_end(&th, sb, 2);
+	if (!ret && err)
+		ret = err;
+      out:
+	reiserfs_write_unlock(sb);
+	return ret;
 }
 
 /*
@@ -1977,45 +2116,48 @@ out:
 static int reiserfs_quota_on_mount(struct super_block *sb, int type)
 {
 	return vfs_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
-			REISERFS_SB(sb)->s_jquota_fmt, type);
+				  REISERFS_SB(sb)->s_jquota_fmt, type);
 }
 
 /*
  * Standard function to be called on quota_on
  */
-static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, char *path)
+static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
+			     char *path)
 {
-    int err;
-    struct nameidata nd;
-
-    if (!(REISERFS_SB(sb)->s_mount_opt & (1<<REISERFS_QUOTA)))
-	return -EINVAL;
-    err = path_lookup(path, LOOKUP_FOLLOW, &nd);
-    if (err)
-        return err;
-    /* Quotafile not on the same filesystem? */
-    if (nd.mnt->mnt_sb != sb) {
-	path_release(&nd);
-        return -EXDEV;
-    }
-    /* We must not pack tails for quota files on reiserfs for quota IO to work */
-    if (!REISERFS_I(nd.dentry->d_inode)->i_flags & i_nopack_mask) {
-	reiserfs_warning(sb, "reiserfs: Quota file must have tail packing disabled.");
-	path_release(&nd);
-	return -EINVAL;
-    }
-    /* Not journalling quota? No more tests needed... */
-    if (!REISERFS_SB(sb)->s_qf_names[USRQUOTA] &&
-        !REISERFS_SB(sb)->s_qf_names[GRPQUOTA]) {
+	int err;
+	struct nameidata nd;
+
+	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
+		return -EINVAL;
+	err = path_lookup(path, LOOKUP_FOLLOW, &nd);
+	if (err)
+		return err;
+	/* Quotafile not on the same filesystem? */
+	if (nd.mnt->mnt_sb != sb) {
+		path_release(&nd);
+		return -EXDEV;
+	}
+	/* We must not pack tails for quota files on reiserfs for quota IO to work, so refuse files without the nopack flag */
+	if (!(REISERFS_I(nd.dentry->d_inode)->i_flags & i_nopack_mask)) {
+		reiserfs_warning(sb,
+				 "reiserfs: Quota file must have tail packing disabled.");
+		path_release(&nd);
+		return -EINVAL;
+	}
+	/* Not journalling quota? No more tests needed... */
+	if (!REISERFS_SB(sb)->s_qf_names[USRQUOTA] &&
+	    !REISERFS_SB(sb)->s_qf_names[GRPQUOTA]) {
+		path_release(&nd);
+		return vfs_quota_on(sb, type, format_id, path);
+	}
+	/* Quotafile not of fs root? */
+	if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode)
+		reiserfs_warning(sb,
+				 "reiserfs: Quota file not on filesystem root. "
+				 "Journalled quota will not work.");
 	path_release(&nd);
-        return vfs_quota_on(sb, type, format_id, path);
-    }
-    /* Quotafile not of fs root? */
-    if (nd.dentry->d_parent->d_inode != sb->s_root->d_inode)
-	reiserfs_warning(sb, "reiserfs: Quota file not on filesystem root. "
-                             "Journalled quota will not work.");
-    path_release(&nd);
-    return vfs_quota_on(sb, type, format_id, path);
+	return vfs_quota_on(sb, type, format_id, path);
 }
 
 /* Read data from quotafile - avoid pagecache and such because we cannot afford
@@ -2025,42 +2167,44 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, ch
 static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
 				   size_t len, loff_t off)
 {
-    struct inode *inode = sb_dqopt(sb)->files[type];
-    unsigned long blk = off >> sb->s_blocksize_bits;
-    int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
-    size_t toread;
-    struct buffer_head tmp_bh, *bh;
-    loff_t i_size = i_size_read(inode);
-
-    if (off > i_size)
-	return 0;
-    if (off+len > i_size)
-	len = i_size-off;
-    toread = len;
-    while (toread > 0) {
-	tocopy = sb->s_blocksize - offset < toread ? sb->s_blocksize - offset : toread;
-	tmp_bh.b_state = 0;
-	/* Quota files are without tails so we can safely use this function */
-	reiserfs_write_lock(sb);
-	err = reiserfs_get_block(inode, blk, &tmp_bh, 0);
-	reiserfs_write_unlock(sb);
-	if (err)
-	    return err;
-	if (!buffer_mapped(&tmp_bh))    /* A hole? */
-	    memset(data, 0, tocopy);
-	else {
-	    bh = sb_bread(sb, tmp_bh.b_blocknr);
-	    if (!bh)
-		return -EIO;
-	    memcpy(data, bh->b_data+offset, tocopy);
-	    brelse(bh);
-	}
-	offset = 0;
-	toread -= tocopy;
-	data += tocopy;
-	blk++;
-    }
-    return len;
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	unsigned long blk = off >> sb->s_blocksize_bits;
+	int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
+	size_t toread;
+	struct buffer_head tmp_bh, *bh;
+	loff_t i_size = i_size_read(inode);
+
+	if (off > i_size)
+		return 0;
+	if (off + len > i_size)
+		len = i_size - off;
+	toread = len;
+	while (toread > 0) {
+		tocopy =
+		    sb->s_blocksize - offset <
+		    toread ? sb->s_blocksize - offset : toread;
+		tmp_bh.b_state = 0;
+		/* Quota files are without tails so we can safely use this function */
+		reiserfs_write_lock(sb);
+		err = reiserfs_get_block(inode, blk, &tmp_bh, 0);
+		reiserfs_write_unlock(sb);
+		if (err)
+			return err;
+		if (!buffer_mapped(&tmp_bh))	/* A hole? */
+			memset(data, 0, tocopy);
+		else {
+			bh = sb_bread(sb, tmp_bh.b_blocknr);
+			if (!bh)
+				return -EIO;
+			memcpy(data, bh->b_data + offset, tocopy);
+			brelse(bh);
+		}
+		offset = 0;
+		toread -= tocopy;
+		data += tocopy;
+		blk++;
+	}
+	return len;
 }
 
 /* Write to quotafile (we know the transaction is already started and has
@@ -2068,117 +2212,116 @@ static ssize_t reiserfs_quota_read(struct super_block *sb, int type, char *data,
 static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
 				    const char *data, size_t len, loff_t off)
 {
-    struct inode *inode = sb_dqopt(sb)->files[type];
-    unsigned long blk = off >> sb->s_blocksize_bits;
-    int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
-    int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL;
-    size_t towrite = len;
-    struct buffer_head tmp_bh, *bh;
-
-    down(&inode->i_sem);
-    while (towrite > 0) {
-	tocopy = sb->s_blocksize - offset < towrite ?
-	         sb->s_blocksize - offset : towrite;
-	tmp_bh.b_state = 0;
-	err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE);
-	if (err)
-	    goto out;
-	if (offset || tocopy != sb->s_blocksize)
-	    bh = sb_bread(sb, tmp_bh.b_blocknr);
-	else
-	    bh = sb_getblk(sb, tmp_bh.b_blocknr);
-	if (!bh) {
-	    err = -EIO;
-	    goto out;
-	}
-	lock_buffer(bh);
-	memcpy(bh->b_data+offset, data, tocopy);
-	flush_dcache_page(bh->b_page);
-	set_buffer_uptodate(bh);
-	unlock_buffer(bh);
-	reiserfs_prepare_for_journal(sb, bh, 1);
-	journal_mark_dirty(current->journal_info, sb, bh);
-	if (!journal_quota)
-		reiserfs_add_ordered_list(inode, bh);
-	brelse(bh);
-	offset = 0;
-	towrite -= tocopy;
-	data += tocopy;
-	blk++;
-    }
-out:
-    if (len == towrite)
-	return err;
-    if (inode->i_size < off+len-towrite)
-	i_size_write(inode, off+len-towrite);
-    inode->i_version++;
-    inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-    mark_inode_dirty(inode);
-    up(&inode->i_sem);
-    return len - towrite;
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	unsigned long blk = off >> sb->s_blocksize_bits;
+	int err = 0, offset = off & (sb->s_blocksize - 1), tocopy;
+	int journal_quota = REISERFS_SB(sb)->s_qf_names[type] != NULL;
+	size_t towrite = len;
+	struct buffer_head tmp_bh, *bh;
+
+	down(&inode->i_sem);
+	while (towrite > 0) {
+		tocopy = sb->s_blocksize - offset < towrite ?
+		    sb->s_blocksize - offset : towrite;
+		tmp_bh.b_state = 0;
+		err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE);
+		if (err)
+			goto out;
+		if (offset || tocopy != sb->s_blocksize)
+			bh = sb_bread(sb, tmp_bh.b_blocknr);
+		else
+			bh = sb_getblk(sb, tmp_bh.b_blocknr);
+		if (!bh) {
+			err = -EIO;
+			goto out;
+		}
+		lock_buffer(bh);
+		memcpy(bh->b_data + offset, data, tocopy);
+		flush_dcache_page(bh->b_page);
+		set_buffer_uptodate(bh);
+		unlock_buffer(bh);
+		reiserfs_prepare_for_journal(sb, bh, 1);
+		journal_mark_dirty(current->journal_info, sb, bh);
+		if (!journal_quota)
+			reiserfs_add_ordered_list(inode, bh);
+		brelse(bh);
+		offset = 0;
+		towrite -= tocopy;
+		data += tocopy;
+		blk++;
+	}
+      out:
+	if (len == towrite)
+		return err;
+	if (inode->i_size < off + len - towrite)
+		i_size_write(inode, off + len - towrite);
+	inode->i_version++;
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(inode);
+	up(&inode->i_sem);
+	return len - towrite;
 }
 
 #endif
 
-static struct super_block*
-get_super_block (struct file_system_type *fs_type, int flags,
-		 const char *dev_name, void *data)
+static struct super_block *get_super_block(struct file_system_type *fs_type,
+					   int flags, const char *dev_name,
+					   void *data)
 {
 	return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super);
 }
 
-static int __init
-init_reiserfs_fs ( void )
+static int __init init_reiserfs_fs(void)
 {
 	int ret;
 
-	if ((ret = init_inodecache ())) {
+	if ((ret = init_inodecache())) {
 		return ret;
 	}
 
-        if ((ret = reiserfs_xattr_register_handlers ()))
-            goto failed_reiserfs_xattr_register_handlers;
+	if ((ret = reiserfs_xattr_register_handlers()))
+		goto failed_reiserfs_xattr_register_handlers;
 
-	reiserfs_proc_info_global_init ();
-	reiserfs_proc_register_global ("version", reiserfs_global_version_in_proc);
+	reiserfs_proc_info_global_init();
+	reiserfs_proc_register_global("version",
+				      reiserfs_global_version_in_proc);
 
-        ret = register_filesystem (& reiserfs_fs_type);
+	ret = register_filesystem(&reiserfs_fs_type);
 
 	if (ret == 0) {
 		return 0;
 	}
 
-        reiserfs_xattr_unregister_handlers ();
+	reiserfs_xattr_unregister_handlers();
 
-failed_reiserfs_xattr_register_handlers:
-	reiserfs_proc_unregister_global ("version");
-	reiserfs_proc_info_global_done ();
-	destroy_inodecache ();
+      failed_reiserfs_xattr_register_handlers:
+	reiserfs_proc_unregister_global("version");
+	reiserfs_proc_info_global_done();
+	destroy_inodecache();
 
 	return ret;
 }
 
-static void __exit
-exit_reiserfs_fs ( void )
+static void __exit exit_reiserfs_fs(void)
 {
-        reiserfs_xattr_unregister_handlers ();
-	reiserfs_proc_unregister_global ("version");
-	reiserfs_proc_info_global_done ();
-        unregister_filesystem (& reiserfs_fs_type);
-	destroy_inodecache ();
+	reiserfs_xattr_unregister_handlers();
+	reiserfs_proc_unregister_global("version");
+	reiserfs_proc_info_global_done();
+	unregister_filesystem(&reiserfs_fs_type);
+	destroy_inodecache();
 }
 
 struct file_system_type reiserfs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "reiserfs",
-	.get_sb		= get_super_block,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
+	.owner = THIS_MODULE,
+	.name = "reiserfs",
+	.get_sb = get_super_block,
+	.kill_sb = kill_block_super,
+	.fs_flags = FS_REQUIRES_DEV,
 };
 
-MODULE_DESCRIPTION ("ReiserFS journaled filesystem");
-MODULE_AUTHOR      ("Hans Reiser <reiser@namesys.com>");
-MODULE_LICENSE     ("GPL");
+MODULE_DESCRIPTION("ReiserFS journaled filesystem");
+MODULE_AUTHOR("Hans Reiser <reiser@namesys.com>");
+MODULE_LICENSE("GPL");
 
-module_init (init_reiserfs_fs);
-module_exit (exit_reiserfs_fs);
+module_init(init_reiserfs_fs);
+module_exit(exit_reiserfs_fs);
diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c
index 6191909d5165..c92e124f628e 100644
--- a/fs/reiserfs/tail_conversion.c
+++ b/fs/reiserfs/tail_conversion.c
@@ -11,156 +11,159 @@
 /* access to tail : when one is going to read tail it must make sure, that is not running.
  direct2indirect and indirect2direct can not run concurrently */
 
-
 /* Converts direct items to an unformatted node. Panics if file has no
    tail. -ENOSPC if no disk space for conversion */
 /* path points to first direct item of the file regarless of how many of
    them are there */
-int direct2indirect (struct reiserfs_transaction_handle *th, struct inode * inode, 
-		     struct path * path, struct buffer_head * unbh,
-		     loff_t tail_offset)
+int direct2indirect(struct reiserfs_transaction_handle *th, struct inode *inode,
+		    struct path *path, struct buffer_head *unbh,
+		    loff_t tail_offset)
 {
-    struct super_block * sb = inode->i_sb;
-    struct buffer_head *up_to_date_bh ;
-    struct item_head * p_le_ih = PATH_PITEM_HEAD (path);
-    unsigned long total_tail = 0 ;
-    struct cpu_key end_key;  /* Key to search for the last byte of the
-				converted item. */
-    struct item_head ind_ih; /* new indirect item to be inserted or
-                                key of unfm pointer to be pasted */
-    int	n_blk_size,
-      n_retval;	  /* returned value for reiserfs_insert_item and clones */
-    unp_t unfm_ptr;  /* Handle on an unformatted node
-				       that will be inserted in the
-				       tree. */
-
-    BUG_ON (!th->t_trans_id);
-
-    REISERFS_SB(sb)->s_direct2indirect ++;
-
-    n_blk_size = sb->s_blocksize;
-
-    /* and key to search for append or insert pointer to the new
-       unformatted node. */
-    copy_item_head (&ind_ih, p_le_ih);
-    set_le_ih_k_offset (&ind_ih, tail_offset);
-    set_le_ih_k_type (&ind_ih, TYPE_INDIRECT);
-
-    /* Set the key to search for the place for new unfm pointer */
-    make_cpu_key (&end_key, inode, tail_offset, TYPE_INDIRECT, 4);
-
-    // FIXME: we could avoid this 
-    if ( search_for_position_by_key (sb, &end_key, path) == POSITION_FOUND ) {
-	reiserfs_warning (sb, "PAP-14030: direct2indirect: "
-			"pasted or inserted byte exists in the tree %K. "
-			"Use fsck to repair.", &end_key);
-	pathrelse(path);
-	return -EIO;
-    }
-    
-    p_le_ih = PATH_PITEM_HEAD (path);
-
-    unfm_ptr = cpu_to_le32 (unbh->b_blocknr);
-    
-    if ( is_statdata_le_ih (p_le_ih) )  {
-	/* Insert new indirect item. */
-	set_ih_free_space (&ind_ih, 0); /* delete at nearest future */
-        put_ih_item_len( &ind_ih, UNFM_P_SIZE );
-	PATH_LAST_POSITION (path)++;
-	n_retval = reiserfs_insert_item (th, path, &end_key, &ind_ih, inode,
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *up_to_date_bh;
+	struct item_head *p_le_ih = PATH_PITEM_HEAD(path);
+	unsigned long total_tail = 0;
+	struct cpu_key end_key;	/* Key to search for the last byte of the
+				   converted item. */
+	struct item_head ind_ih;	/* new indirect item to be inserted or
+					   key of unfm pointer to be pasted */
+	int n_blk_size, n_retval;	/* returned value for reiserfs_insert_item and clones */
+	unp_t unfm_ptr;		/* Handle on an unformatted node
+				   that will be inserted in the
+				   tree. */
+
+	BUG_ON(!th->t_trans_id);
+
+	REISERFS_SB(sb)->s_direct2indirect++;
+
+	n_blk_size = sb->s_blocksize;
+
+	/* and key to search for append or insert pointer to the new
+	   unformatted node. */
+	copy_item_head(&ind_ih, p_le_ih);
+	set_le_ih_k_offset(&ind_ih, tail_offset);
+	set_le_ih_k_type(&ind_ih, TYPE_INDIRECT);
+
+	/* Set the key to search for the place for new unfm pointer */
+	make_cpu_key(&end_key, inode, tail_offset, TYPE_INDIRECT, 4);
+
+	// FIXME: we could avoid this 
+	if (search_for_position_by_key(sb, &end_key, path) == POSITION_FOUND) {
+		reiserfs_warning(sb, "PAP-14030: direct2indirect: "
+				 "pasted or inserted byte exists in the tree %K. "
+				 "Use fsck to repair.", &end_key);
+		pathrelse(path);
+		return -EIO;
+	}
+
+	p_le_ih = PATH_PITEM_HEAD(path);
+
+	unfm_ptr = cpu_to_le32(unbh->b_blocknr);
+
+	if (is_statdata_le_ih(p_le_ih)) {
+		/* Insert new indirect item. */
+		set_ih_free_space(&ind_ih, 0);	/* delete at nearest future */
+		put_ih_item_len(&ind_ih, UNFM_P_SIZE);
+		PATH_LAST_POSITION(path)++;
+		n_retval =
+		    reiserfs_insert_item(th, path, &end_key, &ind_ih, inode,
 					 (char *)&unfm_ptr);
-    } else {
-	/* Paste into last indirect item of an object. */
-	n_retval = reiserfs_paste_into_item(th, path, &end_key, inode,
-					    (char *)&unfm_ptr, UNFM_P_SIZE);
-    }
-    if ( n_retval ) {
-	return n_retval;
-    }
-
-    // note: from here there are two keys which have matching first
-    // three key components. They only differ by the fourth one.
-
-
-    /* Set the key to search for the direct items of the file */
-    make_cpu_key (&end_key, inode, max_reiserfs_offset (inode), TYPE_DIRECT, 4);
-
-    /* Move bytes from the direct items to the new unformatted node
-       and delete them. */
-    while (1)  {
-	int tail_size;
-
-	/* end_key.k_offset is set so, that we will always have found
-           last item of the file */
-	if ( search_for_position_by_key (sb, &end_key, path) == POSITION_FOUND )
-	    reiserfs_panic (sb, "PAP-14050: direct2indirect: "
-			    "direct item (%K) not found", &end_key);
-	p_le_ih = PATH_PITEM_HEAD (path);
-	RFALSE( !is_direct_le_ih (p_le_ih),
-	        "vs-14055: direct item expected(%K), found %h",
-                &end_key, p_le_ih);
-        tail_size = (le_ih_k_offset (p_le_ih) & (n_blk_size - 1))
-            + ih_item_len(p_le_ih) - 1;
-
-	/* we only send the unbh pointer if the buffer is not up to date.
-	** this avoids overwriting good data from writepage() with old data
-	** from the disk or buffer cache
-	** Special case: unbh->b_page will be NULL if we are coming through
-	** DIRECT_IO handler here.
-	*/
-	if (!unbh->b_page || buffer_uptodate(unbh) || PageUptodate(unbh->b_page)) {
-	    up_to_date_bh = NULL ;
 	} else {
-	    up_to_date_bh = unbh ;
+		/* Paste into last indirect item of an object. */
+		n_retval = reiserfs_paste_into_item(th, path, &end_key, inode,
+						    (char *)&unfm_ptr,
+						    UNFM_P_SIZE);
 	}
-	n_retval = reiserfs_delete_item (th, path, &end_key, inode, 
-	                                 up_to_date_bh) ;
-
-	total_tail += n_retval ;
-	if (tail_size == n_retval)
-	    // done: file does not have direct items anymore
-	    break;
-
-    }
-    /* if we've copied bytes from disk into the page, we need to zero
-    ** out the unused part of the block (it was not up to date before)
-    */
-    if (up_to_date_bh) {
-        unsigned pgoff = (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1);
-	char *kaddr=kmap_atomic(up_to_date_bh->b_page, KM_USER0);
-	memset(kaddr + pgoff, 0, n_blk_size - total_tail) ;
-	kunmap_atomic(kaddr, KM_USER0);
-    }
-
-    REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
-
-    return 0;
-}
+	if (n_retval) {
+		return n_retval;
+	}
+	// note: from here there are two keys which have matching first
+	// three key components. They only differ by the fourth one.
+
+	/* Set the key to search for the direct items of the file */
+	make_cpu_key(&end_key, inode, max_reiserfs_offset(inode), TYPE_DIRECT,
+		     4);
+
+	/* Move bytes from the direct items to the new unformatted node
+	   and delete them. */
+	while (1) {
+		int tail_size;
+
+		/* end_key.k_offset is set so, that we will always have found
+		   last item of the file */
+		if (search_for_position_by_key(sb, &end_key, path) ==
+		    POSITION_FOUND)
+			reiserfs_panic(sb,
+				       "PAP-14050: direct2indirect: "
+				       "direct item (%K) not found", &end_key);
+		p_le_ih = PATH_PITEM_HEAD(path);
+		RFALSE(!is_direct_le_ih(p_le_ih),
+		       "vs-14055: direct item expected(%K), found %h",
+		       &end_key, p_le_ih);
+		tail_size = (le_ih_k_offset(p_le_ih) & (n_blk_size - 1))
+		    + ih_item_len(p_le_ih) - 1;
+
+		/* we only send the unbh pointer if the buffer is not up to date.
+		 ** this avoids overwriting good data from writepage() with old data
+		 ** from the disk or buffer cache
+		 ** Special case: unbh->b_page will be NULL if we are coming through
+		 ** DIRECT_IO handler here.
+		 */
+		if (!unbh->b_page || buffer_uptodate(unbh)
+		    || PageUptodate(unbh->b_page)) {
+			up_to_date_bh = NULL;
+		} else {
+			up_to_date_bh = unbh;
+		}
+		n_retval = reiserfs_delete_item(th, path, &end_key, inode,
+						up_to_date_bh);
+
+		total_tail += n_retval;
+		if (tail_size == n_retval)
+			// done: file does not have direct items anymore
+			break;
 
+	}
+	/* if we've copied bytes from disk into the page, we need to zero
+	 ** out the unused part of the block (it was not up to date before)
+	 */
+	if (up_to_date_bh) {
+		unsigned pgoff =
+		    (tail_offset + total_tail - 1) & (PAGE_CACHE_SIZE - 1);
+		char *kaddr = kmap_atomic(up_to_date_bh->b_page, KM_USER0);
+		memset(kaddr + pgoff, 0, n_blk_size - total_tail);
+		kunmap_atomic(kaddr, KM_USER0);
+	}
+
+	REISERFS_I(inode)->i_first_direct_byte = U32_MAX;
+
+	return 0;
+}
 
 /* stolen from fs/buffer.c */
-void reiserfs_unmap_buffer(struct buffer_head *bh) {
-    lock_buffer(bh) ;
-    if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
-      BUG() ;
-    }
-    clear_buffer_dirty(bh) ;
-    /* Remove the buffer from whatever list it belongs to. We are mostly
-       interested in removing it from per-sb j_dirty_buffers list, to avoid
-        BUG() on attempt to write not mapped buffer */
-    if ( (!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) {
-	struct inode *inode = bh->b_page->mapping->host;
-	struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
-	spin_lock(&j->j_dirty_buffers_lock);
-	list_del_init(&bh->b_assoc_buffers);
-	reiserfs_free_jh(bh);
-	spin_unlock(&j->j_dirty_buffers_lock);
-    }
-    clear_buffer_mapped(bh) ;
-    clear_buffer_req(bh) ;
-    clear_buffer_new(bh);
-    bh->b_bdev = NULL;
-    unlock_buffer(bh) ;
+void reiserfs_unmap_buffer(struct buffer_head *bh)
+{
+	lock_buffer(bh);
+	if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
+		BUG();
+	}
+	clear_buffer_dirty(bh);
+	/* Remove the buffer from whatever list it belongs to. We are mostly
+	   interested in removing it from per-sb j_dirty_buffers list, to avoid
+	   BUG() on attempt to write not mapped buffer */
+	if ((!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) {
+		struct inode *inode = bh->b_page->mapping->host;
+		struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb);
+		spin_lock(&j->j_dirty_buffers_lock);
+		list_del_init(&bh->b_assoc_buffers);
+		reiserfs_free_jh(bh);
+		spin_unlock(&j->j_dirty_buffers_lock);
+	}
+	clear_buffer_mapped(bh);
+	clear_buffer_req(bh);
+	clear_buffer_new(bh);
+	bh->b_bdev = NULL;
+	unlock_buffer(bh);
 }
 
 /* this first locks inode (neither reads nor sync are permitted),
@@ -169,108 +172,108 @@ void reiserfs_unmap_buffer(struct buffer_head *bh) {
    what we expect from it (number of cut bytes). But when tail remains
    in the unformatted node, we set mode to SKIP_BALANCING and unlock
    inode */
-int indirect2direct (struct reiserfs_transaction_handle *th, 
-		     struct inode * p_s_inode,
-		     struct page *page, 
-		     struct path * p_s_path, /* path to the indirect item. */
-		     const struct cpu_key * p_s_item_key, /* Key to look for unformatted node pointer to be cut. */
-		     loff_t n_new_file_size, /* New file size. */
-		     char * p_c_mode)
+int indirect2direct(struct reiserfs_transaction_handle *th, struct inode *p_s_inode, struct page *page, struct path *p_s_path,	/* path to the indirect item. */
+		    const struct cpu_key *p_s_item_key,	/* Key to look for unformatted node pointer to be cut. */
+		    loff_t n_new_file_size,	/* New file size. */
+		    char *p_c_mode)
 {
-    struct super_block * p_s_sb = p_s_inode->i_sb;
-    struct item_head      s_ih;
-    unsigned long n_block_size = p_s_sb->s_blocksize;
-    char * tail;
-    int tail_len, round_tail_len;
-    loff_t pos, pos1; /* position of first byte of the tail */
-    struct cpu_key key;
+	struct super_block *p_s_sb = p_s_inode->i_sb;
+	struct item_head s_ih;
+	unsigned long n_block_size = p_s_sb->s_blocksize;
+	char *tail;
+	int tail_len, round_tail_len;
+	loff_t pos, pos1;	/* position of first byte of the tail */
+	struct cpu_key key;
 
-    BUG_ON (!th->t_trans_id);
+	BUG_ON(!th->t_trans_id);
 
-    REISERFS_SB(p_s_sb)->s_indirect2direct ++;
+	REISERFS_SB(p_s_sb)->s_indirect2direct++;
 
-    *p_c_mode = M_SKIP_BALANCING;
+	*p_c_mode = M_SKIP_BALANCING;
 
-    /* store item head path points to. */
-    copy_item_head (&s_ih, PATH_PITEM_HEAD(p_s_path));
-
-    tail_len = (n_new_file_size & (n_block_size - 1));
-    if (get_inode_sd_version (p_s_inode) == STAT_DATA_V2)
-	round_tail_len = ROUND_UP (tail_len);
-    else
-	round_tail_len = tail_len;
-
-    pos = le_ih_k_offset (&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE - 1) * p_s_sb->s_blocksize;
-    pos1 = pos;
-
-    // we are protected by i_sem. The tail can not disapper, not
-    // append can be done either
-    // we are in truncate or packing tail in file_release
-
-    tail = (char *)kmap(page) ; /* this can schedule */
-
-    if (path_changed (&s_ih, p_s_path)) {
-	/* re-search indirect item */
-	if ( search_for_position_by_key (p_s_sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND )
-	    reiserfs_panic(p_s_sb, "PAP-5520: indirect2direct: "
-			   "item to be converted %K does not exist", p_s_item_key);
+	/* store item head path points to. */
 	copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
+
+	tail_len = (n_new_file_size & (n_block_size - 1));
+	if (get_inode_sd_version(p_s_inode) == STAT_DATA_V2)
+		round_tail_len = ROUND_UP(tail_len);
+	else
+		round_tail_len = tail_len;
+
+	pos =
+	    le_ih_k_offset(&s_ih) - 1 + (ih_item_len(&s_ih) / UNFM_P_SIZE -
+					 1) * p_s_sb->s_blocksize;
+	pos1 = pos;
+
+	// we are protected by i_sem. The tail can not disappear, nor
+	// can an append be done
+	// we are in truncate or packing tail in file_release
+
+	tail = (char *)kmap(page);	/* this can schedule */
+
+	if (path_changed(&s_ih, p_s_path)) {
+		/* re-search indirect item */
+		if (search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path)
+		    == POSITION_NOT_FOUND)
+			reiserfs_panic(p_s_sb,
+				       "PAP-5520: indirect2direct: "
+				       "item to be converted %K does not exist",
+				       p_s_item_key);
+		copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path));
 #ifdef CONFIG_REISERFS_CHECK
-	pos = le_ih_k_offset (&s_ih) - 1 + 
-	    (ih_item_len(&s_ih) / UNFM_P_SIZE - 1) * p_s_sb->s_blocksize;
-	if (pos != pos1)
-	    reiserfs_panic (p_s_sb, "vs-5530: indirect2direct: "
-			    "tail position changed while we were reading it");
+		pos = le_ih_k_offset(&s_ih) - 1 +
+		    (ih_item_len(&s_ih) / UNFM_P_SIZE -
+		     1) * p_s_sb->s_blocksize;
+		if (pos != pos1)
+			reiserfs_panic(p_s_sb, "vs-5530: indirect2direct: "
+				       "tail position changed while we were reading it");
 #endif
-    }
-
-
-    /* Set direct item header to insert. */
-    make_le_item_head (&s_ih, NULL, get_inode_item_key_version (p_s_inode), pos1 + 1,
-		       TYPE_DIRECT, round_tail_len, 0xffff/*ih_free_space*/);
-
-    /* we want a pointer to the first byte of the tail in the page.
-    ** the page was locked and this part of the page was up to date when
-    ** indirect2direct was called, so we know the bytes are still valid
-    */
-    tail = tail + (pos & (PAGE_CACHE_SIZE - 1)) ;
-
-    PATH_LAST_POSITION(p_s_path)++;
-
-    key = *p_s_item_key;
-    set_cpu_key_k_type (&key, TYPE_DIRECT);
-    key.key_length = 4;
-    /* Insert tail as new direct item in the tree */
-    if ( reiserfs_insert_item(th, p_s_path, &key, &s_ih, p_s_inode,
-			      tail ? tail : NULL) < 0 ) {
-	/* No disk memory. So we can not convert last unformatted node
-	   to the direct item.  In this case we used to adjust
-	   indirect items's ih_free_space. Now ih_free_space is not
-	   used, it would be ideal to write zeros to corresponding
-	   unformatted node. For now i_size is considered as guard for
-	   going out of file size */
-	kunmap(page) ;
-	return n_block_size - round_tail_len;
-    }
-    kunmap(page) ;
-
-    /* make sure to get the i_blocks changes from reiserfs_insert_item */
-    reiserfs_update_sd(th, p_s_inode);
+	}
 
-    // note: we have now the same as in above direct2indirect
-    // conversion: there are two keys which have matching first three
-    // key components. They only differ by the fouhth one.
+	/* Set direct item header to insert. */
+	make_le_item_head(&s_ih, NULL, get_inode_item_key_version(p_s_inode),
+			  pos1 + 1, TYPE_DIRECT, round_tail_len,
+			  0xffff /*ih_free_space */ );
+
+	/* we want a pointer to the first byte of the tail in the page.
+	 ** the page was locked and this part of the page was up to date when
+	 ** indirect2direct was called, so we know the bytes are still valid
+	 */
+	tail = tail + (pos & (PAGE_CACHE_SIZE - 1));
+
+	PATH_LAST_POSITION(p_s_path)++;
+
+	key = *p_s_item_key;
+	set_cpu_key_k_type(&key, TYPE_DIRECT);
+	key.key_length = 4;
+	/* Insert tail as new direct item in the tree */
+	if (reiserfs_insert_item(th, p_s_path, &key, &s_ih, p_s_inode,
+				 tail ? tail : NULL) < 0) {
+		/* No disk memory. So we can not convert last unformatted node
+		   to the direct item.  In this case we used to adjust
+		   indirect item's ih_free_space. Now ih_free_space is not
+		   used, it would be ideal to write zeros to corresponding
+		   unformatted node. For now i_size is considered as guard for
+		   going out of file size */
+		kunmap(page);
+		return n_block_size - round_tail_len;
+	}
+	kunmap(page);
 
-    /* We have inserted new direct item and must remove last
-       unformatted node. */
-    *p_c_mode = M_CUT;
+	/* make sure to get the i_blocks changes from reiserfs_insert_item */
+	reiserfs_update_sd(th, p_s_inode);
 
-    /* we store position of first direct item in the in-core inode */
-    //mark_file_with_tail (p_s_inode, pos1 + 1);
-    REISERFS_I(p_s_inode)->i_first_direct_byte = pos1 + 1;
-
-    return n_block_size - round_tail_len;
-}
+	// note: we have now the same as in above direct2indirect
+	// conversion: there are two keys which have matching first three
+	// key components. They only differ by the fourth one.
 
+	/* We have inserted new direct item and must remove last
+	   unformatted node. */
+	*p_c_mode = M_CUT;
 
+	/* we store position of first direct item in the in-core inode */
+	//mark_file_with_tail (p_s_inode, pos1 + 1);
+	REISERFS_I(p_s_inode)->i_first_direct_byte = pos1 + 1;
 
+	return n_block_size - round_tail_len;
+}
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 45582fe8b466..e386d3db3051 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -51,67 +51,68 @@
 #define PRIVROOT_NAME ".reiserfs_priv"
 #define XAROOT_NAME   "xattrs"
 
-static struct reiserfs_xattr_handler *find_xattr_handler_prefix (const char *prefix);
+static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char
+								*prefix);
 
-static struct dentry *
-create_xa_root (struct super_block *sb)
+static struct dentry *create_xa_root(struct super_block *sb)
 {
-    struct dentry *privroot = dget (REISERFS_SB(sb)->priv_root);
-    struct dentry *xaroot;
-
-    /* This needs to be created at mount-time */
-    if (!privroot)
-        return ERR_PTR(-EOPNOTSUPP);
-
-    xaroot = lookup_one_len (XAROOT_NAME, privroot, strlen (XAROOT_NAME));
-    if (IS_ERR (xaroot)) {
-        goto out;
-    } else if (!xaroot->d_inode) {
-        int err;
-        down (&privroot->d_inode->i_sem);
-        err = privroot->d_inode->i_op->mkdir (privroot->d_inode, xaroot, 0700);
-        up (&privroot->d_inode->i_sem);
-
-        if (err) {
-            dput (xaroot);
-            dput (privroot);
-            return ERR_PTR (err);
-        }
-        REISERFS_SB(sb)->xattr_root = dget (xaroot);
-    }
-
-out:
-    dput (privroot);
-    return xaroot;
+	struct dentry *privroot = dget(REISERFS_SB(sb)->priv_root);
+	struct dentry *xaroot;
+
+	/* This needs to be created at mount-time */
+	if (!privroot)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	xaroot = lookup_one_len(XAROOT_NAME, privroot, strlen(XAROOT_NAME));
+	if (IS_ERR(xaroot)) {
+		goto out;
+	} else if (!xaroot->d_inode) {
+		int err;
+		down(&privroot->d_inode->i_sem);
+		err =
+		    privroot->d_inode->i_op->mkdir(privroot->d_inode, xaroot,
+						   0700);
+		up(&privroot->d_inode->i_sem);
+
+		if (err) {
+			dput(xaroot);
+			dput(privroot);
+			return ERR_PTR(err);
+		}
+		REISERFS_SB(sb)->xattr_root = dget(xaroot);
+	}
+
+      out:
+	dput(privroot);
+	return xaroot;
 }
 
 /* This will return a dentry, or error, refering to the xa root directory.
  * If the xa root doesn't exist yet, the dentry will be returned without
  * an associated inode. This dentry can be used with ->mkdir to create
  * the xa directory. */
-static struct dentry *
-__get_xa_root (struct super_block *s)
+static struct dentry *__get_xa_root(struct super_block *s)
 {
-    struct dentry *privroot = dget (REISERFS_SB(s)->priv_root);
-    struct dentry *xaroot = NULL;
-
-    if (IS_ERR (privroot) || !privroot)
-        return privroot;
-
-    xaroot = lookup_one_len (XAROOT_NAME, privroot, strlen (XAROOT_NAME));
-    if (IS_ERR (xaroot)) {
-        goto out;
-    } else if (!xaroot->d_inode) {
-        dput (xaroot);
-        xaroot = NULL;
-        goto out;
-    }
-
-    REISERFS_SB(s)->xattr_root = dget (xaroot);
-
-out:
-    dput (privroot);
-    return xaroot;
+	struct dentry *privroot = dget(REISERFS_SB(s)->priv_root);
+	struct dentry *xaroot = NULL;
+
+	if (IS_ERR(privroot) || !privroot)
+		return privroot;
+
+	xaroot = lookup_one_len(XAROOT_NAME, privroot, strlen(XAROOT_NAME));
+	if (IS_ERR(xaroot)) {
+		goto out;
+	} else if (!xaroot->d_inode) {
+		dput(xaroot);
+		xaroot = NULL;
+		goto out;
+	}
+
+	REISERFS_SB(s)->xattr_root = dget(xaroot);
+
+      out:
+	dput(privroot);
+	return xaroot;
 }
 
 /* Returns the dentry (or NULL) referring to the root of the extended
@@ -119,147 +120,145 @@ out:
  * Otherwise, we attempt to retreive it from disk. It may also return
  * a pointer-encoded error.
  */
-static inline struct dentry *
-get_xa_root (struct super_block *s)
+static inline struct dentry *get_xa_root(struct super_block *s)
 {
-    struct dentry *dentry = dget (REISERFS_SB(s)->xattr_root);
+	struct dentry *dentry = dget(REISERFS_SB(s)->xattr_root);
 
-    if (!dentry)
-        dentry = __get_xa_root (s);
+	if (!dentry)
+		dentry = __get_xa_root(s);
 
-    return dentry;
+	return dentry;
 }
 
 /* Opens the directory corresponding to the inode's extended attribute store.
  * If flags allow, the tree to the directory may be created. If creation is
  * prohibited, -ENODATA is returned. */
-static struct dentry *
-open_xa_dir (const struct inode *inode, int flags)
+static struct dentry *open_xa_dir(const struct inode *inode, int flags)
 {
-    struct dentry *xaroot, *xadir;
-    char namebuf[17];
-
-    xaroot = get_xa_root (inode->i_sb);
-    if (IS_ERR (xaroot)) {
-        return xaroot;
-    } else if (!xaroot) {
-        if (flags == 0 || flags & XATTR_CREATE) {
-            xaroot = create_xa_root (inode->i_sb);
-            if (IS_ERR (xaroot))
-                return xaroot;
-        }
-        if (!xaroot)
-            return ERR_PTR (-ENODATA);
-    }
-
-    /* ok, we have xaroot open */
-
-    snprintf (namebuf, sizeof (namebuf), "%X.%X",
-              le32_to_cpu (INODE_PKEY (inode)->k_objectid),
-              inode->i_generation);
-    xadir = lookup_one_len (namebuf, xaroot, strlen (namebuf));
-    if (IS_ERR (xadir)) {
-        dput (xaroot);
-        return xadir;
-    }
-
-    if (!xadir->d_inode) {
-        int err;
-        if (flags == 0 || flags & XATTR_CREATE) {
-            /* Although there is nothing else trying to create this directory,
-             * another directory with the same hash may be created, so we need
-             * to protect against that */
-            err = xaroot->d_inode->i_op->mkdir (xaroot->d_inode, xadir, 0700);
-            if (err) {
-                dput (xaroot);
-                dput (xadir);
-                return ERR_PTR (err);
-            }
-        }
-        if (!xadir->d_inode) {
-            dput (xaroot);
-            dput (xadir);
-            return ERR_PTR (-ENODATA);
-        }
-    }
-
-    dput (xaroot);
-    return xadir;
+	struct dentry *xaroot, *xadir;
+	char namebuf[17];
+
+	xaroot = get_xa_root(inode->i_sb);
+	if (IS_ERR(xaroot)) {
+		return xaroot;
+	} else if (!xaroot) {
+		if (flags == 0 || flags & XATTR_CREATE) {
+			xaroot = create_xa_root(inode->i_sb);
+			if (IS_ERR(xaroot))
+				return xaroot;
+		}
+		if (!xaroot)
+			return ERR_PTR(-ENODATA);
+	}
+
+	/* ok, we have xaroot open */
+
+	snprintf(namebuf, sizeof(namebuf), "%X.%X",
+		 le32_to_cpu(INODE_PKEY(inode)->k_objectid),
+		 inode->i_generation);
+	xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
+	if (IS_ERR(xadir)) {
+		dput(xaroot);
+		return xadir;
+	}
+
+	if (!xadir->d_inode) {
+		int err;
+		if (flags == 0 || flags & XATTR_CREATE) {
+			/* Although there is nothing else trying to create this directory,
+			 * another directory with the same hash may be created, so we need
+			 * to protect against that */
+			err =
+			    xaroot->d_inode->i_op->mkdir(xaroot->d_inode, xadir,
+							 0700);
+			if (err) {
+				dput(xaroot);
+				dput(xadir);
+				return ERR_PTR(err);
+			}
+		}
+		if (!xadir->d_inode) {
+			dput(xaroot);
+			dput(xadir);
+			return ERR_PTR(-ENODATA);
+		}
+	}
+
+	dput(xaroot);
+	return xadir;
 }
 
 /* Returns a dentry corresponding to a specific extended attribute file
  * for the inode. If flags allow, the file is created. Otherwise, a
  * valid or negative dentry, or an error is returned. */
-static struct dentry *
-get_xa_file_dentry (const struct inode *inode, const char *name, int flags)
+static struct dentry *get_xa_file_dentry(const struct inode *inode,
+					 const char *name, int flags)
 {
-    struct dentry *xadir, *xafile;
-    int err = 0;
-
-    xadir = open_xa_dir (inode, flags);
-    if (IS_ERR (xadir)) {
-        return ERR_PTR (PTR_ERR (xadir));
-    } else if (xadir && !xadir->d_inode) {
-        dput (xadir);
-        return ERR_PTR (-ENODATA);
-    }
-
-    xafile = lookup_one_len (name, xadir, strlen (name));
-    if (IS_ERR (xafile)) {
-        dput (xadir);
-        return ERR_PTR (PTR_ERR (xafile));
-    }
-
-    if (xafile->d_inode) { /* file exists */
-        if (flags & XATTR_CREATE) {
-            err = -EEXIST;
-            dput (xafile);
-            goto out;
-        }
-    } else if (flags & XATTR_REPLACE || flags & FL_READONLY) {
-        goto out;
-    } else {
-        /* inode->i_sem is down, so nothing else can try to create
-         * the same xattr */
-        err = xadir->d_inode->i_op->create (xadir->d_inode, xafile,
-                                            0700|S_IFREG, NULL);
-
-        if (err) {
-            dput (xafile);
-            goto out;
-        }
-    }
-
-out:
-    dput (xadir);
-    if (err)
-        xafile = ERR_PTR (err);
-    return xafile;
-}
+	struct dentry *xadir, *xafile;
+	int err = 0;
+
+	xadir = open_xa_dir(inode, flags);
+	if (IS_ERR(xadir)) {
+		return ERR_PTR(PTR_ERR(xadir));
+	} else if (xadir && !xadir->d_inode) {
+		dput(xadir);
+		return ERR_PTR(-ENODATA);
+	}
+
+	xafile = lookup_one_len(name, xadir, strlen(name));
+	if (IS_ERR(xafile)) {
+		dput(xadir);
+		return ERR_PTR(PTR_ERR(xafile));
+	}
+
+	if (xafile->d_inode) {	/* file exists */
+		if (flags & XATTR_CREATE) {
+			err = -EEXIST;
+			dput(xafile);
+			goto out;
+		}
+	} else if (flags & XATTR_REPLACE || flags & FL_READONLY) {
+		goto out;
+	} else {
+		/* inode->i_sem is down, so nothing else can try to create
+		 * the same xattr */
+		err = xadir->d_inode->i_op->create(xadir->d_inode, xafile,
+						   0700 | S_IFREG, NULL);
+
+		if (err) {
+			dput(xafile);
+			goto out;
+		}
+	}
 
+      out:
+	dput(xadir);
+	if (err)
+		xafile = ERR_PTR(err);
+	return xafile;
+}
 
 /* Opens a file pointer to the attribute associated with inode */
-static struct file *
-open_xa_file (const struct inode *inode, const char *name, int flags)
+static struct file *open_xa_file(const struct inode *inode, const char *name,
+				 int flags)
 {
-    struct dentry *xafile;
-    struct file *fp;
-
-    xafile = get_xa_file_dentry (inode, name, flags);
-    if (IS_ERR (xafile))
-        return ERR_PTR (PTR_ERR (xafile));
-    else if (!xafile->d_inode) {
-        dput (xafile);
-        return ERR_PTR (-ENODATA);
-    }
+	struct dentry *xafile;
+	struct file *fp;
+
+	xafile = get_xa_file_dentry(inode, name, flags);
+	if (IS_ERR(xafile))
+		return ERR_PTR(PTR_ERR(xafile));
+	else if (!xafile->d_inode) {
+		dput(xafile);
+		return ERR_PTR(-ENODATA);
+	}
 
-    fp = dentry_open (xafile, NULL, O_RDWR);
-    /* dentry_open dputs the dentry if it fails */
+	fp = dentry_open(xafile, NULL, O_RDWR);
+	/* dentry_open dputs the dentry if it fails */
 
-    return fp;
+	return fp;
 }
 
-
 /*
  * this is very similar to fs/reiserfs/dir.c:reiserfs_readdir, but
  * we need to drop the path before calling the filldir struct.  That
@@ -273,139 +272,146 @@ open_xa_file (const struct inode *inode, const char *name, int flags)
  * we're called with i_sem held, so there are no worries about the directory
  * changing underneath us.
  */
-static int __xattr_readdir(struct file * filp, void * dirent, filldir_t filldir)
+static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
-    struct inode *inode = filp->f_dentry->d_inode;
-    struct cpu_key pos_key;	/* key of current position in the directory (key of directory entry) */
-    INITIALIZE_PATH (path_to_entry);
-    struct buffer_head * bh;
-    int entry_num;
-    struct item_head * ih, tmp_ih;
-    int search_res;
-    char * local_buf;
-    loff_t next_pos;
-    char small_buf[32] ; /* avoid kmalloc if we can */
-    struct reiserfs_de_head *deh;
-    int d_reclen;
-    char * d_name;
-    off_t d_off;
-    ino_t d_ino;
-    struct reiserfs_dir_entry de;
-
-
-    /* form key for search the next directory entry using f_pos field of
-       file structure */
-    next_pos = max_reiserfs_offset(inode);
-
-    while (1) {
-research:
-	if (next_pos <= DOT_DOT_OFFSET)
-	    break;
-	make_cpu_key (&pos_key, inode, next_pos, TYPE_DIRENTRY, 3);
-
-	search_res = search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry, &de);
-	if (search_res == IO_ERROR) {
-	    // FIXME: we could just skip part of directory which could
-	    // not be read
-	    pathrelse(&path_to_entry);
-	    return -EIO;
-	}
-
-	if (search_res == NAME_NOT_FOUND)
-	    de.de_entry_num--;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct cpu_key pos_key;	/* key of current position in the directory (key of directory entry) */
+	INITIALIZE_PATH(path_to_entry);
+	struct buffer_head *bh;
+	int entry_num;
+	struct item_head *ih, tmp_ih;
+	int search_res;
+	char *local_buf;
+	loff_t next_pos;
+	char small_buf[32];	/* avoid kmalloc if we can */
+	struct reiserfs_de_head *deh;
+	int d_reclen;
+	char *d_name;
+	off_t d_off;
+	ino_t d_ino;
+	struct reiserfs_dir_entry de;
+
+	/* form key for search the next directory entry using f_pos field of
+	   file structure */
+	next_pos = max_reiserfs_offset(inode);
+
+	while (1) {
+	      research:
+		if (next_pos <= DOT_DOT_OFFSET)
+			break;
+		make_cpu_key(&pos_key, inode, next_pos, TYPE_DIRENTRY, 3);
+
+		search_res =
+		    search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry,
+					&de);
+		if (search_res == IO_ERROR) {
+			// FIXME: we could just skip part of directory which could
+			// not be read
+			pathrelse(&path_to_entry);
+			return -EIO;
+		}
 
-	set_de_name_and_namelen(&de);
-	entry_num = de.de_entry_num;
-	deh = &(de.de_deh[entry_num]);
+		if (search_res == NAME_NOT_FOUND)
+			de.de_entry_num--;
 
-	bh = de.de_bh;
-	ih = de.de_ih;
+		set_de_name_and_namelen(&de);
+		entry_num = de.de_entry_num;
+		deh = &(de.de_deh[entry_num]);
 
-	if (!is_direntry_le_ih(ih)) {
-            reiserfs_warning(inode->i_sb, "not direntry %h", ih);
-	    break;
-        }
-	copy_item_head(&tmp_ih, ih);
+		bh = de.de_bh;
+		ih = de.de_ih;
 
-	/* we must have found item, that is item of this directory, */
-	RFALSE( COMP_SHORT_KEYS (&(ih->ih_key), &pos_key),
-		"vs-9000: found item %h does not match to dir we readdir %K",
-		ih, &pos_key);
+		if (!is_direntry_le_ih(ih)) {
+			reiserfs_warning(inode->i_sb, "not direntry %h", ih);
+			break;
+		}
+		copy_item_head(&tmp_ih, ih);
 
-	if (deh_offset(deh) <= DOT_DOT_OFFSET) {
-	    break;
-	}
+		/* we must have found item, that is item of this directory, */
+		RFALSE(COMP_SHORT_KEYS(&(ih->ih_key), &pos_key),
+		       "vs-9000: found item %h does not match to dir we readdir %K",
+		       ih, &pos_key);
 
-	/* look for the previous entry in the directory */
-	next_pos = deh_offset (deh) - 1;
+		if (deh_offset(deh) <= DOT_DOT_OFFSET) {
+			break;
+		}
 
-	if (!de_visible (deh))
-	    /* it is hidden entry */
-	    continue;
+		/* look for the previous entry in the directory */
+		next_pos = deh_offset(deh) - 1;
 
-	d_reclen = entry_length(bh, ih, entry_num);
-	d_name = B_I_DEH_ENTRY_FILE_NAME (bh, ih, deh);
-	d_off = deh_offset (deh);
-	d_ino = deh_objectid (deh);
+		if (!de_visible(deh))
+			/* it is hidden entry */
+			continue;
 
-	if (!d_name[d_reclen - 1])
-	    d_reclen = strlen (d_name);
+		d_reclen = entry_length(bh, ih, entry_num);
+		d_name = B_I_DEH_ENTRY_FILE_NAME(bh, ih, deh);
+		d_off = deh_offset(deh);
+		d_ino = deh_objectid(deh);
 
-	if (d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)){
-	    /* too big to send back to VFS */
-	    continue ;
-	}
+		if (!d_name[d_reclen - 1])
+			d_reclen = strlen(d_name);
 
-        /* Ignore the .reiserfs_priv entry */
-        if (reiserfs_xattrs (inode->i_sb) &&
-            !old_format_only(inode->i_sb) &&
-            deh_objectid (deh) == le32_to_cpu (INODE_PKEY(REISERFS_SB(inode->i_sb)->priv_root->d_inode)->k_objectid))
-          continue;
+		if (d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)) {
+			/* too big to send back to VFS */
+			continue;
+		}
 
-	if (d_reclen <= 32) {
-	  local_buf = small_buf ;
-	} else {
-	    local_buf = reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb) ;
-	    if (!local_buf) {
-		pathrelse (&path_to_entry);
-		return -ENOMEM ;
-	    }
-	    if (item_moved (&tmp_ih, &path_to_entry)) {
-		reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ;
-
-		/* sigh, must retry.  Do this same offset again */
-		next_pos = d_off;
-		goto research;
-	    }
-	}
+		/* Ignore the .reiserfs_priv entry */
+		if (reiserfs_xattrs(inode->i_sb) &&
+		    !old_format_only(inode->i_sb) &&
+		    deh_objectid(deh) ==
+		    le32_to_cpu(INODE_PKEY
+				(REISERFS_SB(inode->i_sb)->priv_root->d_inode)->
+				k_objectid))
+			continue;
+
+		if (d_reclen <= 32) {
+			local_buf = small_buf;
+		} else {
+			local_buf =
+			    reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb);
+			if (!local_buf) {
+				pathrelse(&path_to_entry);
+				return -ENOMEM;
+			}
+			if (item_moved(&tmp_ih, &path_to_entry)) {
+				reiserfs_kfree(local_buf, d_reclen,
+					       inode->i_sb);
+
+				/* sigh, must retry.  Do this same offset again */
+				next_pos = d_off;
+				goto research;
+			}
+		}
 
-	// Note, that we copy name to user space via temporary
-	// buffer (local_buf) because filldir will block if
-	// user space buffer is swapped out. At that time
-	// entry can move to somewhere else
-	memcpy (local_buf, d_name, d_reclen);
+		// Note, that we copy name to user space via temporary
+		// buffer (local_buf) because filldir will block if
+		// user space buffer is swapped out. At that time
+		// entry can move to somewhere else
+		memcpy(local_buf, d_name, d_reclen);
 
-	/* the filldir function might need to start transactions,
-	 * or do who knows what.  Release the path now that we've
-	 * copied all the important stuff out of the deh
-	 */
-	pathrelse (&path_to_entry);
-
-	if (filldir (dirent, local_buf, d_reclen, d_off, d_ino,
-		     DT_UNKNOWN) < 0) {
-	    if (local_buf != small_buf) {
-		reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ;
-	    }
-	    goto end;
-	}
-	if (local_buf != small_buf) {
-	    reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ;
-	}
-    } /* while */
+		/* the filldir function might need to start transactions,
+		 * or do who knows what.  Release the path now that we've
+		 * copied all the important stuff out of the deh
+		 */
+		pathrelse(&path_to_entry);
+
+		if (filldir(dirent, local_buf, d_reclen, d_off, d_ino,
+			    DT_UNKNOWN) < 0) {
+			if (local_buf != small_buf) {
+				reiserfs_kfree(local_buf, d_reclen,
+					       inode->i_sb);
+			}
+			goto end;
+		}
+		if (local_buf != small_buf) {
+			reiserfs_kfree(local_buf, d_reclen, inode->i_sb);
+		}
+	}			/* while */
 
-end:
-    pathrelse (&path_to_entry);
-    return 0;
+      end:
+	pathrelse(&path_to_entry);
+	return 0;
 }
 
 /*
@@ -417,63 +423,59 @@ end:
 static
 int xattr_readdir(struct file *file, filldir_t filler, void *buf)
 {
-        struct inode *inode = file->f_dentry->d_inode;
-        int res = -ENOTDIR;
-        if (!file->f_op || !file->f_op->readdir)
-                goto out;
-        down(&inode->i_sem);
+	struct inode *inode = file->f_dentry->d_inode;
+	int res = -ENOTDIR;
+	if (!file->f_op || !file->f_op->readdir)
+		goto out;
+	down(&inode->i_sem);
 //        down(&inode->i_zombie);
-        res = -ENOENT;
-        if (!IS_DEADDIR(inode)) {
-                lock_kernel();
-                res = __xattr_readdir(file, buf, filler);
-                unlock_kernel();
-        }
+	res = -ENOENT;
+	if (!IS_DEADDIR(inode)) {
+		lock_kernel();
+		res = __xattr_readdir(file, buf, filler);
+		unlock_kernel();
+	}
 //        up(&inode->i_zombie);
-        up(&inode->i_sem);
-out:
-        return res;
+	up(&inode->i_sem);
+      out:
+	return res;
 }
 
-
 /* Internal operations on file data */
-static inline void
-reiserfs_put_page(struct page *page)
+static inline void reiserfs_put_page(struct page *page)
 {
-        kunmap(page);
-        page_cache_release(page);
+	kunmap(page);
+	page_cache_release(page);
 }
 
-static struct page *
-reiserfs_get_page(struct inode *dir, unsigned long n)
+static struct page *reiserfs_get_page(struct inode *dir, unsigned long n)
 {
-        struct address_space *mapping = dir->i_mapping;
-        struct page *page;
-        /* We can deadlock if we try to free dentries,
-           and an unlink/rmdir has just occured - GFP_NOFS avoids this */
-        mapping->flags = (mapping->flags & ~__GFP_BITS_MASK) | GFP_NOFS;
-        page = read_cache_page (mapping, n,
-                                (filler_t*)mapping->a_ops->readpage, NULL);
-        if (!IS_ERR(page)) {
-                wait_on_page_locked(page);
-                kmap(page);
-                if (!PageUptodate(page))
-                        goto fail;
-
-                if (PageError(page))
-                        goto fail;
-        }
-        return page;
-
-fail:
-        reiserfs_put_page(page);
-        return ERR_PTR(-EIO);
+	struct address_space *mapping = dir->i_mapping;
+	struct page *page;
+	/* We can deadlock if we try to free dentries,
+	   and an unlink/rmdir has just occurred - GFP_NOFS avoids this */
+	mapping->flags = (mapping->flags & ~__GFP_BITS_MASK) | GFP_NOFS;
+	page = read_cache_page(mapping, n,
+			       (filler_t *) mapping->a_ops->readpage, NULL);
+	if (!IS_ERR(page)) {
+		wait_on_page_locked(page);
+		kmap(page);
+		if (!PageUptodate(page))
+			goto fail;
+
+		if (PageError(page))
+			goto fail;
+	}
+	return page;
+
+      fail:
+	reiserfs_put_page(page);
+	return ERR_PTR(-EIO);
 }
 
-static inline __u32
-xattr_hash (const char *msg, int len)
+static inline __u32 xattr_hash(const char *msg, int len)
 {
-    return csum_partial (msg, len, 0);
+	return csum_partial(msg, len, 0);
 }
 
 /* Generic extended attribute operations that can be used by xa plugins */
@@ -482,294 +484,300 @@ xattr_hash (const char *msg, int len)
  * inode->i_sem: down
  */
 int
-reiserfs_xattr_set (struct inode *inode, const char *name, const void *buffer,
-                    size_t buffer_size, int flags)
+reiserfs_xattr_set(struct inode *inode, const char *name, const void *buffer,
+		   size_t buffer_size, int flags)
 {
-    int err = 0;
-    struct file *fp;
-    struct page *page;
-    char *data;
-    struct address_space *mapping;
-    size_t file_pos = 0;
-    size_t buffer_pos = 0;
-    struct inode *xinode;
-    struct iattr newattrs;
-    __u32 xahash = 0;
-
-    if (IS_RDONLY (inode))
-        return -EROFS;
-
-    if (IS_IMMUTABLE (inode) || IS_APPEND (inode))
-        return -EPERM;
-
-    if (get_inode_sd_version (inode) == STAT_DATA_V1)
-        return -EOPNOTSUPP;
-
-    /* Empty xattrs are ok, they're just empty files, no hash */
-    if (buffer && buffer_size)
-        xahash = xattr_hash (buffer, buffer_size);
-
-open_file:
-    fp = open_xa_file (inode, name, flags);
-    if (IS_ERR (fp)) {
-        err = PTR_ERR (fp);
-        goto out;
-    }
-
-    xinode = fp->f_dentry->d_inode;
-    REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
-
-    /* we need to copy it off.. */
-    if (xinode->i_nlink > 1) {
-	fput(fp);
-        err = reiserfs_xattr_del (inode, name);
-        if (err < 0)
-            goto out;
-        /* We just killed the old one, we're not replacing anymore */
-        if (flags & XATTR_REPLACE)
-            flags &= ~XATTR_REPLACE;
-        goto open_file;
-    }
-
-    /* Resize it so we're ok to write there */
-    newattrs.ia_size = buffer_size;
-    newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
-    down (&xinode->i_sem);
-    err = notify_change(fp->f_dentry, &newattrs);
-    if (err)
-        goto out_filp;
-
-    mapping = xinode->i_mapping;
-    while (buffer_pos < buffer_size || buffer_pos == 0) {
-        size_t chunk;
-        size_t skip = 0;
-        size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1));
-        if (buffer_size - buffer_pos > PAGE_CACHE_SIZE)
-            chunk = PAGE_CACHE_SIZE;
-        else
-            chunk = buffer_size - buffer_pos;
-
-        page = reiserfs_get_page (xinode, file_pos >> PAGE_CACHE_SHIFT);
-        if (IS_ERR (page)) {
-            err = PTR_ERR (page);
-            goto out_filp;
-        }
-
-        lock_page (page);
-        data = page_address (page);
-
-        if (file_pos == 0) {
-            struct reiserfs_xattr_header *rxh;
-            skip = file_pos = sizeof (struct reiserfs_xattr_header);
-            if (chunk + skip > PAGE_CACHE_SIZE)
-                chunk = PAGE_CACHE_SIZE - skip;
-            rxh = (struct reiserfs_xattr_header *)data;
-            rxh->h_magic = cpu_to_le32 (REISERFS_XATTR_MAGIC);
-            rxh->h_hash = cpu_to_le32 (xahash);
-        }
-
-        err = mapping->a_ops->prepare_write (fp, page, page_offset,
-                                             page_offset + chunk + skip);
-        if (!err) {
-	    if (buffer)
-		memcpy (data + skip, buffer + buffer_pos, chunk);
-            err = mapping->a_ops->commit_write (fp, page, page_offset,
-                                                page_offset + chunk + skip);
+	int err = 0;
+	struct file *fp;
+	struct page *page;
+	char *data;
+	struct address_space *mapping;
+	size_t file_pos = 0;
+	size_t buffer_pos = 0;
+	struct inode *xinode;
+	struct iattr newattrs;
+	__u32 xahash = 0;
+
+	if (IS_RDONLY(inode))
+		return -EROFS;
+
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		return -EPERM;
+
+	if (get_inode_sd_version(inode) == STAT_DATA_V1)
+		return -EOPNOTSUPP;
+
+	/* Empty xattrs are ok, they're just empty files, no hash */
+	if (buffer && buffer_size)
+		xahash = xattr_hash(buffer, buffer_size);
+
+      open_file:
+	fp = open_xa_file(inode, name, flags);
+	if (IS_ERR(fp)) {
+		err = PTR_ERR(fp);
+		goto out;
+	}
+
+	xinode = fp->f_dentry->d_inode;
+	REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
+
+	/* we need to copy it off.. */
+	if (xinode->i_nlink > 1) {
+		fput(fp);
+		err = reiserfs_xattr_del(inode, name);
+		if (err < 0)
+			goto out;
+		/* We just killed the old one, we're not replacing anymore */
+		if (flags & XATTR_REPLACE)
+			flags &= ~XATTR_REPLACE;
+		goto open_file;
+	}
+
+	/* Resize it so we're ok to write there */
+	newattrs.ia_size = buffer_size;
+	newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
+	down(&xinode->i_sem);
+	err = notify_change(fp->f_dentry, &newattrs);
+	if (err)
+		goto out_filp;
+
+	mapping = xinode->i_mapping;
+	while (buffer_pos < buffer_size || buffer_pos == 0) {
+		size_t chunk;
+		size_t skip = 0;
+		size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1));
+		if (buffer_size - buffer_pos > PAGE_CACHE_SIZE)
+			chunk = PAGE_CACHE_SIZE;
+		else
+			chunk = buffer_size - buffer_pos;
+
+		page = reiserfs_get_page(xinode, file_pos >> PAGE_CACHE_SHIFT);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto out_filp;
+		}
+
+		lock_page(page);
+		data = page_address(page);
+
+		if (file_pos == 0) {
+			struct reiserfs_xattr_header *rxh;
+			skip = file_pos = sizeof(struct reiserfs_xattr_header);
+			if (chunk + skip > PAGE_CACHE_SIZE)
+				chunk = PAGE_CACHE_SIZE - skip;
+			rxh = (struct reiserfs_xattr_header *)data;
+			rxh->h_magic = cpu_to_le32(REISERFS_XATTR_MAGIC);
+			rxh->h_hash = cpu_to_le32(xahash);
+		}
+
+		err = mapping->a_ops->prepare_write(fp, page, page_offset,
+						    page_offset + chunk + skip);
+		if (!err) {
+			if (buffer)
+				memcpy(data + skip, buffer + buffer_pos, chunk);
+			err =
+			    mapping->a_ops->commit_write(fp, page, page_offset,
+							 page_offset + chunk +
+							 skip);
+		}
+		unlock_page(page);
+		reiserfs_put_page(page);
+		buffer_pos += chunk;
+		file_pos += chunk;
+		skip = 0;
+		if (err || buffer_size == 0 || !buffer)
+			break;
+	}
+
+	/* We can't mark the inode dirty if it's not hashed. This is the case
+	 * when we're inheriting the default ACL. If we dirty it, the inode
+	 * gets marked dirty, but won't (ever) make it onto the dirty list until
+	 * it's synced explicitly to clear I_DIRTY. This is bad. */
+	if (!hlist_unhashed(&inode->i_hash)) {
+		inode->i_ctime = CURRENT_TIME_SEC;
+		mark_inode_dirty(inode);
 	}
-        unlock_page (page);
-        reiserfs_put_page (page);
-        buffer_pos += chunk;
-        file_pos += chunk;
-        skip = 0;
-        if (err || buffer_size == 0 || !buffer)
-            break;
-    }
-
-    /* We can't mark the inode dirty if it's not hashed. This is the case
-     * when we're inheriting the default ACL. If we dirty it, the inode
-     * gets marked dirty, but won't (ever) make it onto the dirty list until
-     * it's synced explicitly to clear I_DIRTY. This is bad. */
-    if (!hlist_unhashed(&inode->i_hash)) {
-        inode->i_ctime = CURRENT_TIME_SEC;
-        mark_inode_dirty (inode);
-    }
-
-out_filp:
-    up (&xinode->i_sem);
-    fput(fp);
-
-out:
-    return err;
+
+      out_filp:
+	up(&xinode->i_sem);
+	fput(fp);
+
+      out:
+	return err;
 }
 
 /*
  * inode->i_sem: down
  */
 int
-reiserfs_xattr_get (const struct inode *inode, const char *name, void *buffer,
-                    size_t buffer_size)
+reiserfs_xattr_get(const struct inode *inode, const char *name, void *buffer,
+		   size_t buffer_size)
 {
-    ssize_t err = 0;
-    struct file *fp;
-    size_t isize;
-    size_t file_pos = 0;
-    size_t buffer_pos = 0;
-    struct page *page;
-    struct inode *xinode;
-    __u32 hash = 0;
-
-    if (name == NULL)
-        return -EINVAL;
-
-    /* We can't have xattrs attached to v1 items since they don't have
-     * generation numbers */
-    if (get_inode_sd_version (inode) == STAT_DATA_V1)
-        return -EOPNOTSUPP;
-
-    fp = open_xa_file (inode, name, FL_READONLY);
-    if (IS_ERR (fp)) {
-        err = PTR_ERR (fp);
-        goto out;
-    }
-
-    xinode = fp->f_dentry->d_inode;
-    isize = xinode->i_size;
-    REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
-
-    /* Just return the size needed */
-    if (buffer == NULL) {
-        err = isize - sizeof (struct reiserfs_xattr_header);
-        goto out_dput;
-    }
-
-    if (buffer_size < isize - sizeof (struct reiserfs_xattr_header)) {
-        err = -ERANGE;
-        goto out_dput;
-    }
-
-    while (file_pos < isize) {
-        size_t chunk;
-        char *data;
-        size_t skip = 0;
-        if (isize - file_pos > PAGE_CACHE_SIZE)
-            chunk = PAGE_CACHE_SIZE;
-        else
-            chunk = isize - file_pos;
-
-        page = reiserfs_get_page (xinode, file_pos >> PAGE_CACHE_SHIFT);
-        if (IS_ERR (page)) {
-            err = PTR_ERR (page);
-            goto out_dput;
-        }
-
-        lock_page (page);
-        data = page_address (page);
-        if (file_pos == 0) {
-            struct reiserfs_xattr_header *rxh =
-                                        (struct reiserfs_xattr_header *)data;
-            skip = file_pos = sizeof (struct reiserfs_xattr_header);
-            chunk -= skip;
-            /* Magic doesn't match up.. */
-            if (rxh->h_magic != cpu_to_le32 (REISERFS_XATTR_MAGIC)) {
-                unlock_page (page);
-                reiserfs_put_page (page);
-                reiserfs_warning (inode->i_sb, "Invalid magic for xattr (%s) "
-                                  "associated with %k", name,
-                                  INODE_PKEY (inode));
-                err = -EIO;
-                goto out_dput;
-            }
-            hash = le32_to_cpu (rxh->h_hash);
-        }
-        memcpy (buffer + buffer_pos, data + skip, chunk);
-        unlock_page (page);
-        reiserfs_put_page (page);
-        file_pos += chunk;
-        buffer_pos += chunk;
-        skip = 0;
-    }
-    err = isize - sizeof (struct reiserfs_xattr_header);
-
-    if (xattr_hash (buffer, isize - sizeof (struct reiserfs_xattr_header)) != hash) {
-        reiserfs_warning (inode->i_sb, "Invalid hash for xattr (%s) associated "
-                          "with %k", name, INODE_PKEY (inode));
-        err = -EIO;
-    }
-
-out_dput:
-    fput(fp);
-
-out:
-    return err;
+	ssize_t err = 0;
+	struct file *fp;
+	size_t isize;
+	size_t file_pos = 0;
+	size_t buffer_pos = 0;
+	struct page *page;
+	struct inode *xinode;
+	__u32 hash = 0;
+
+	if (name == NULL)
+		return -EINVAL;
+
+	/* We can't have xattrs attached to v1 items since they don't have
+	 * generation numbers */
+	if (get_inode_sd_version(inode) == STAT_DATA_V1)
+		return -EOPNOTSUPP;
+
+	fp = open_xa_file(inode, name, FL_READONLY);
+	if (IS_ERR(fp)) {
+		err = PTR_ERR(fp);
+		goto out;
+	}
+
+	xinode = fp->f_dentry->d_inode;
+	isize = xinode->i_size;
+	REISERFS_I(inode)->i_flags |= i_has_xattr_dir;
+
+	/* Just return the size needed */
+	if (buffer == NULL) {
+		err = isize - sizeof(struct reiserfs_xattr_header);
+		goto out_dput;
+	}
+
+	if (buffer_size < isize - sizeof(struct reiserfs_xattr_header)) {
+		err = -ERANGE;
+		goto out_dput;
+	}
+
+	while (file_pos < isize) {
+		size_t chunk;
+		char *data;
+		size_t skip = 0;
+		if (isize - file_pos > PAGE_CACHE_SIZE)
+			chunk = PAGE_CACHE_SIZE;
+		else
+			chunk = isize - file_pos;
+
+		page = reiserfs_get_page(xinode, file_pos >> PAGE_CACHE_SHIFT);
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto out_dput;
+		}
+
+		lock_page(page);
+		data = page_address(page);
+		if (file_pos == 0) {
+			struct reiserfs_xattr_header *rxh =
+			    (struct reiserfs_xattr_header *)data;
+			skip = file_pos = sizeof(struct reiserfs_xattr_header);
+			chunk -= skip;
+			/* Magic doesn't match up.. */
+			if (rxh->h_magic != cpu_to_le32(REISERFS_XATTR_MAGIC)) {
+				unlock_page(page);
+				reiserfs_put_page(page);
+				reiserfs_warning(inode->i_sb,
+						 "Invalid magic for xattr (%s) "
+						 "associated with %k", name,
+						 INODE_PKEY(inode));
+				err = -EIO;
+				goto out_dput;
+			}
+			hash = le32_to_cpu(rxh->h_hash);
+		}
+		memcpy(buffer + buffer_pos, data + skip, chunk);
+		unlock_page(page);
+		reiserfs_put_page(page);
+		file_pos += chunk;
+		buffer_pos += chunk;
+		skip = 0;
+	}
+	err = isize - sizeof(struct reiserfs_xattr_header);
+
+	if (xattr_hash(buffer, isize - sizeof(struct reiserfs_xattr_header)) !=
+	    hash) {
+		reiserfs_warning(inode->i_sb,
+				 "Invalid hash for xattr (%s) associated "
+				 "with %k", name, INODE_PKEY(inode));
+		err = -EIO;
+	}
+
+      out_dput:
+	fput(fp);
+
+      out:
+	return err;
 }
 
 static int
-__reiserfs_xattr_del (struct dentry *xadir, const char *name, int namelen)
+__reiserfs_xattr_del(struct dentry *xadir, const char *name, int namelen)
 {
-    struct dentry *dentry;
-    struct inode *dir = xadir->d_inode;
-    int err = 0;
-
-    dentry = lookup_one_len (name, xadir, namelen);
-    if (IS_ERR (dentry)) {
-        err = PTR_ERR (dentry);
-        goto out;
-    } else if (!dentry->d_inode) {
-        err = -ENODATA;
-        goto out_file;
-    }
-
-    /* Skip directories.. */
-    if (S_ISDIR (dentry->d_inode->i_mode))
-        goto out_file;
-
-    if (!is_reiserfs_priv_object (dentry->d_inode)) {
-        reiserfs_warning (dir->i_sb, "OID %08x [%.*s/%.*s] doesn't have "
-                                     "priv flag set [parent is %sset].",
-                        le32_to_cpu (INODE_PKEY (dentry->d_inode)->k_objectid),
-                        xadir->d_name.len, xadir->d_name.name, namelen, name,
-                        is_reiserfs_priv_object (xadir->d_inode) ? "" : "not ");
-        dput (dentry);
-        return -EIO;
-    }
-
-    err = dir->i_op->unlink (dir, dentry);
-    if (!err)
-        d_delete (dentry);
-
-out_file:
-    dput (dentry);
-
-out:
-    return err;
-}
+	struct dentry *dentry;
+	struct inode *dir = xadir->d_inode;
+	int err = 0;
+
+	dentry = lookup_one_len(name, xadir, namelen);
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto out;
+	} else if (!dentry->d_inode) {
+		err = -ENODATA;
+		goto out_file;
+	}
+
+	/* Skip directories.. */
+	if (S_ISDIR(dentry->d_inode->i_mode))
+		goto out_file;
+
+	if (!is_reiserfs_priv_object(dentry->d_inode)) {
+		reiserfs_warning(dir->i_sb, "OID %08x [%.*s/%.*s] doesn't have "
+				 "priv flag set [parent is %sset].",
+				 le32_to_cpu(INODE_PKEY(dentry->d_inode)->
+					     k_objectid), xadir->d_name.len,
+				 xadir->d_name.name, namelen, name,
+				 is_reiserfs_priv_object(xadir->
+							 d_inode) ? "" :
+				 "not ");
+		dput(dentry);
+		return -EIO;
+	}
 
+	err = dir->i_op->unlink(dir, dentry);
+	if (!err)
+		d_delete(dentry);
 
-int
-reiserfs_xattr_del (struct inode *inode, const char *name)
+      out_file:
+	dput(dentry);
+
+      out:
+	return err;
+}
+
+int reiserfs_xattr_del(struct inode *inode, const char *name)
 {
-    struct dentry *dir;
-    int err;
+	struct dentry *dir;
+	int err;
 
-    if (IS_RDONLY (inode))
-        return -EROFS;
+	if (IS_RDONLY(inode))
+		return -EROFS;
 
-    dir = open_xa_dir (inode, FL_READONLY);
-    if (IS_ERR (dir)) {
-        err = PTR_ERR (dir);
-        goto out;
-    }
+	dir = open_xa_dir(inode, FL_READONLY);
+	if (IS_ERR(dir)) {
+		err = PTR_ERR(dir);
+		goto out;
+	}
 
-    err = __reiserfs_xattr_del (dir, name, strlen (name));
-    dput (dir);
+	err = __reiserfs_xattr_del(dir, name, strlen(name));
+	dput(dir);
 
-    if (!err) {
-        inode->i_ctime = CURRENT_TIME_SEC;
-        mark_inode_dirty (inode);
-    }
+	if (!err) {
+		inode->i_ctime = CURRENT_TIME_SEC;
+		mark_inode_dirty(inode);
+	}
 
-out:
-    return err;
+      out:
+	return err;
 }
 
 /* The following are side effects of other operations that aren't explicitly
@@ -777,167 +785,163 @@ out:
  * or ownership changes, object deletions, etc. */
 
 static int
-reiserfs_delete_xattrs_filler (void *buf, const char *name, int namelen,
-                               loff_t offset, ino_t ino, unsigned int d_type)
+reiserfs_delete_xattrs_filler(void *buf, const char *name, int namelen,
+			      loff_t offset, ino_t ino, unsigned int d_type)
 {
-    struct dentry *xadir = (struct dentry *)buf;
+	struct dentry *xadir = (struct dentry *)buf;
 
-    return __reiserfs_xattr_del (xadir, name, namelen);
+	return __reiserfs_xattr_del(xadir, name, namelen);
 
 }
 
 /* This is called w/ inode->i_sem downed */
-int
-reiserfs_delete_xattrs (struct inode *inode)
+int reiserfs_delete_xattrs(struct inode *inode)
 {
-    struct file *fp;
-    struct dentry *dir, *root;
-    int err = 0;
-
-    /* Skip out, an xattr has no xattrs associated with it */
-    if (is_reiserfs_priv_object (inode) ||
-        get_inode_sd_version (inode) == STAT_DATA_V1 ||
-        !reiserfs_xattrs(inode->i_sb))
-    {
-        return 0;
-    }
-    reiserfs_read_lock_xattrs (inode->i_sb);
-    dir = open_xa_dir (inode, FL_READONLY);
-    reiserfs_read_unlock_xattrs (inode->i_sb);
-    if (IS_ERR (dir)) {
-        err = PTR_ERR (dir);
-        goto out;
-    } else if (!dir->d_inode) {
-        dput (dir);
-        return 0;
-    }
-
-    fp = dentry_open (dir, NULL, O_RDWR);
-    if (IS_ERR (fp)) {
-        err = PTR_ERR (fp);
-        /* dentry_open dputs the dentry if it fails */
-        goto out;
-    }
-
-    lock_kernel ();
-    err = xattr_readdir (fp, reiserfs_delete_xattrs_filler, dir);
-    if (err) {
-        unlock_kernel ();
-        goto out_dir;
-    }
-
-    /* Leftovers besides . and .. -- that's not good. */
-    if (dir->d_inode->i_nlink <= 2) {
-        root = get_xa_root (inode->i_sb);
-        reiserfs_write_lock_xattrs (inode->i_sb);
-        err = vfs_rmdir (root->d_inode, dir);
-        reiserfs_write_unlock_xattrs (inode->i_sb);
-        dput (root);
-    } else {
-        reiserfs_warning (inode->i_sb,
-                          "Couldn't remove all entries in directory");
-    }
-    unlock_kernel ();
-
-out_dir:
-    fput(fp);
-
-out:
-    if (!err)
-        REISERFS_I(inode)->i_flags = REISERFS_I(inode)->i_flags & ~i_has_xattr_dir;
-    return err;
+	struct file *fp;
+	struct dentry *dir, *root;
+	int err = 0;
+
+	/* Skip out, an xattr has no xattrs associated with it */
+	if (is_reiserfs_priv_object(inode) ||
+	    get_inode_sd_version(inode) == STAT_DATA_V1 ||
+	    !reiserfs_xattrs(inode->i_sb)) {
+		return 0;
+	}
+	reiserfs_read_lock_xattrs(inode->i_sb);
+	dir = open_xa_dir(inode, FL_READONLY);
+	reiserfs_read_unlock_xattrs(inode->i_sb);
+	if (IS_ERR(dir)) {
+		err = PTR_ERR(dir);
+		goto out;
+	} else if (!dir->d_inode) {
+		dput(dir);
+		return 0;
+	}
+
+	fp = dentry_open(dir, NULL, O_RDWR);
+	if (IS_ERR(fp)) {
+		err = PTR_ERR(fp);
+		/* dentry_open dputs the dentry if it fails */
+		goto out;
+	}
+
+	lock_kernel();
+	err = xattr_readdir(fp, reiserfs_delete_xattrs_filler, dir);
+	if (err) {
+		unlock_kernel();
+		goto out_dir;
+	}
+
+	/* Leftovers besides . and .. -- that's not good. */
+	if (dir->d_inode->i_nlink <= 2) {
+		root = get_xa_root(inode->i_sb);
+		reiserfs_write_lock_xattrs(inode->i_sb);
+		err = vfs_rmdir(root->d_inode, dir);
+		reiserfs_write_unlock_xattrs(inode->i_sb);
+		dput(root);
+	} else {
+		reiserfs_warning(inode->i_sb,
+				 "Couldn't remove all entries in directory");
+	}
+	unlock_kernel();
+
+      out_dir:
+	fput(fp);
+
+      out:
+	if (!err)
+		REISERFS_I(inode)->i_flags =
+		    REISERFS_I(inode)->i_flags & ~i_has_xattr_dir;
+	return err;
 }
 
 struct reiserfs_chown_buf {
-    struct inode *inode;
-    struct dentry *xadir;
-    struct iattr *attrs;
+	struct inode *inode;
+	struct dentry *xadir;
+	struct iattr *attrs;
 };
 
 /* XXX: If there is a better way to do this, I'd love to hear about it */
 static int
-reiserfs_chown_xattrs_filler (void *buf, const char *name, int namelen,
-                               loff_t offset, ino_t ino, unsigned int d_type)
+reiserfs_chown_xattrs_filler(void *buf, const char *name, int namelen,
+			     loff_t offset, ino_t ino, unsigned int d_type)
 {
-    struct reiserfs_chown_buf *chown_buf = (struct reiserfs_chown_buf *)buf;
-    struct dentry *xafile, *xadir = chown_buf->xadir;
-    struct iattr *attrs = chown_buf->attrs;
-    int err = 0;
-
-    xafile = lookup_one_len (name, xadir, namelen);
-    if (IS_ERR (xafile))
-        return PTR_ERR (xafile);
-    else if (!xafile->d_inode) {
-        dput (xafile);
-        return -ENODATA;
-    }
-
-    if (!S_ISDIR (xafile->d_inode->i_mode))
-        err = notify_change (xafile, attrs);
-    dput (xafile);
-
-    return err;
+	struct reiserfs_chown_buf *chown_buf = (struct reiserfs_chown_buf *)buf;
+	struct dentry *xafile, *xadir = chown_buf->xadir;
+	struct iattr *attrs = chown_buf->attrs;
+	int err = 0;
+
+	xafile = lookup_one_len(name, xadir, namelen);
+	if (IS_ERR(xafile))
+		return PTR_ERR(xafile);
+	else if (!xafile->d_inode) {
+		dput(xafile);
+		return -ENODATA;
+	}
+
+	if (!S_ISDIR(xafile->d_inode->i_mode))
+		err = notify_change(xafile, attrs);
+	dput(xafile);
+
+	return err;
 }
 
-int
-reiserfs_chown_xattrs (struct inode *inode, struct iattr *attrs)
+int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs)
 {
-    struct file *fp;
-    struct dentry *dir;
-    int err = 0;
-    struct reiserfs_chown_buf buf;
-    unsigned int ia_valid = attrs->ia_valid;
-
-    /* Skip out, an xattr has no xattrs associated with it */
-    if (is_reiserfs_priv_object (inode) ||
-        get_inode_sd_version (inode) == STAT_DATA_V1 ||
-        !reiserfs_xattrs(inode->i_sb))
-    {
-        return 0;
-    }
-    reiserfs_read_lock_xattrs (inode->i_sb);
-    dir = open_xa_dir (inode, FL_READONLY);
-    reiserfs_read_unlock_xattrs (inode->i_sb);
-    if (IS_ERR (dir)) {
-        if (PTR_ERR (dir) != -ENODATA)
-            err = PTR_ERR (dir);
-        goto out;
-    } else if (!dir->d_inode) {
-        dput (dir);
-        goto out;
-    }
-
-    fp = dentry_open (dir, NULL, O_RDWR);
-    if (IS_ERR (fp)) {
-        err = PTR_ERR (fp);
-        /* dentry_open dputs the dentry if it fails */
-        goto out;
-    }
-
-    lock_kernel ();
-
-    attrs->ia_valid &= (ATTR_UID | ATTR_GID | ATTR_CTIME);
-    buf.xadir = dir;
-    buf.attrs = attrs;
-    buf.inode = inode;
-
-    err = xattr_readdir (fp, reiserfs_chown_xattrs_filler, &buf);
-    if (err) {
-        unlock_kernel ();
-        goto out_dir;
-    }
-
-    err = notify_change (dir, attrs);
-    unlock_kernel ();
-
-out_dir:
-    fput(fp);
-
-out:
-    attrs->ia_valid = ia_valid;
-    return err;
-}
+	struct file *fp;
+	struct dentry *dir;
+	int err = 0;
+	struct reiserfs_chown_buf buf;
+	unsigned int ia_valid = attrs->ia_valid;
+
+	/* Skip out, an xattr has no xattrs associated with it */
+	if (is_reiserfs_priv_object(inode) ||
+	    get_inode_sd_version(inode) == STAT_DATA_V1 ||
+	    !reiserfs_xattrs(inode->i_sb)) {
+		return 0;
+	}
+	reiserfs_read_lock_xattrs(inode->i_sb);
+	dir = open_xa_dir(inode, FL_READONLY);
+	reiserfs_read_unlock_xattrs(inode->i_sb);
+	if (IS_ERR(dir)) {
+		if (PTR_ERR(dir) != -ENODATA)
+			err = PTR_ERR(dir);
+		goto out;
+	} else if (!dir->d_inode) {
+		dput(dir);
+		goto out;
+	}
+
+	fp = dentry_open(dir, NULL, O_RDWR);
+	if (IS_ERR(fp)) {
+		err = PTR_ERR(fp);
+		/* dentry_open dputs the dentry if it fails */
+		goto out;
+	}
 
+	lock_kernel();
+
+	attrs->ia_valid &= (ATTR_UID | ATTR_GID | ATTR_CTIME);
+	buf.xadir = dir;
+	buf.attrs = attrs;
+	buf.inode = inode;
+
+	err = xattr_readdir(fp, reiserfs_chown_xattrs_filler, &buf);
+	if (err) {
+		unlock_kernel();
+		goto out_dir;
+	}
+
+	err = notify_change(dir, attrs);
+	unlock_kernel();
+
+      out_dir:
+	fput(fp);
+
+      out:
+	attrs->ia_valid = ia_valid;
+	return err;
+}
 
 /* Actual operations that are exported to VFS-land */
 
@@ -946,61 +950,60 @@ out:
  * Preliminary locking: we down dentry->d_inode->i_sem
  */
 ssize_t
-reiserfs_getxattr (struct dentry *dentry, const char *name, void *buffer,
-                   size_t size)
+reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
+		  size_t size)
 {
-    struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name);
-    int err;
-
-    if (!xah || !reiserfs_xattrs(dentry->d_sb) ||
-        get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1)
-        return -EOPNOTSUPP;
-
-    reiserfs_read_lock_xattr_i (dentry->d_inode);
-    reiserfs_read_lock_xattrs (dentry->d_sb);
-    err = xah->get (dentry->d_inode, name, buffer, size);
-    reiserfs_read_unlock_xattrs (dentry->d_sb);
-    reiserfs_read_unlock_xattr_i (dentry->d_inode);
-    return err;
+	struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix(name);
+	int err;
+
+	if (!xah || !reiserfs_xattrs(dentry->d_sb) ||
+	    get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
+		return -EOPNOTSUPP;
+
+	reiserfs_read_lock_xattr_i(dentry->d_inode);
+	reiserfs_read_lock_xattrs(dentry->d_sb);
+	err = xah->get(dentry->d_inode, name, buffer, size);
+	reiserfs_read_unlock_xattrs(dentry->d_sb);
+	reiserfs_read_unlock_xattr_i(dentry->d_inode);
+	return err;
 }
 
-
 /*
  * Inode operation setxattr()
  *
  * dentry->d_inode->i_sem down
  */
 int
-reiserfs_setxattr (struct dentry *dentry, const char *name, const void *value,
-                   size_t size, int flags)
+reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+		  size_t size, int flags)
 {
-    struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name);
-    int err;
-    int lock;
-
-    if (!xah || !reiserfs_xattrs(dentry->d_sb) ||
-        get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1)
-        return -EOPNOTSUPP;
-
-    if (IS_RDONLY (dentry->d_inode))
-        return -EROFS;
-
-    if (IS_IMMUTABLE (dentry->d_inode) || IS_APPEND (dentry->d_inode))
-        return -EROFS;
-
-    reiserfs_write_lock_xattr_i (dentry->d_inode);
-    lock = !has_xattr_dir (dentry->d_inode);
-    if (lock)
-        reiserfs_write_lock_xattrs (dentry->d_sb);
-    else
-        reiserfs_read_lock_xattrs (dentry->d_sb);
-    err = xah->set (dentry->d_inode, name, value, size, flags);
-    if (lock)
-        reiserfs_write_unlock_xattrs (dentry->d_sb);
-    else
-        reiserfs_read_unlock_xattrs (dentry->d_sb);
-    reiserfs_write_unlock_xattr_i (dentry->d_inode);
-    return err;
+	struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix(name);
+	int err;
+	int lock;
+
+	if (!xah || !reiserfs_xattrs(dentry->d_sb) ||
+	    get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
+		return -EOPNOTSUPP;
+
+	if (IS_RDONLY(dentry->d_inode))
+		return -EROFS;
+
+	if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode))
+		return -EROFS;
+
+	reiserfs_write_lock_xattr_i(dentry->d_inode);
+	lock = !has_xattr_dir(dentry->d_inode);
+	if (lock)
+		reiserfs_write_lock_xattrs(dentry->d_sb);
+	else
+		reiserfs_read_lock_xattrs(dentry->d_sb);
+	err = xah->set(dentry->d_inode, name, value, size, flags);
+	if (lock)
+		reiserfs_write_unlock_xattrs(dentry->d_sb);
+	else
+		reiserfs_read_unlock_xattrs(dentry->d_sb);
+	reiserfs_write_unlock_xattr_i(dentry->d_inode);
+	return err;
 }
 
 /*
@@ -1008,344 +1011,343 @@ reiserfs_setxattr (struct dentry *dentry, const char *name, const void *value,
  *
  * dentry->d_inode->i_sem down
  */
-int
-reiserfs_removexattr (struct dentry *dentry, const char *name)
+int reiserfs_removexattr(struct dentry *dentry, const char *name)
 {
-    int err;
-    struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name);
+	int err;
+	struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix(name);
 
-    if (!xah || !reiserfs_xattrs(dentry->d_sb) ||
-        get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1)
-        return -EOPNOTSUPP;
+	if (!xah || !reiserfs_xattrs(dentry->d_sb) ||
+	    get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
+		return -EOPNOTSUPP;
 
-    if (IS_RDONLY (dentry->d_inode))
-        return -EROFS;
+	if (IS_RDONLY(dentry->d_inode))
+		return -EROFS;
 
-    if (IS_IMMUTABLE (dentry->d_inode) || IS_APPEND (dentry->d_inode))
-        return -EPERM;
+	if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode))
+		return -EPERM;
 
-    reiserfs_write_lock_xattr_i (dentry->d_inode);
-    reiserfs_read_lock_xattrs (dentry->d_sb);
+	reiserfs_write_lock_xattr_i(dentry->d_inode);
+	reiserfs_read_lock_xattrs(dentry->d_sb);
 
-    /* Deletion pre-operation */
-    if (xah->del) {
-        err = xah->del (dentry->d_inode, name);
-        if (err)
-            goto out;
-    }
+	/* Deletion pre-operation */
+	if (xah->del) {
+		err = xah->del(dentry->d_inode, name);
+		if (err)
+			goto out;
+	}
 
-    err = reiserfs_xattr_del (dentry->d_inode, name);
+	err = reiserfs_xattr_del(dentry->d_inode, name);
 
-    dentry->d_inode->i_ctime = CURRENT_TIME_SEC;
-    mark_inode_dirty (dentry->d_inode);
+	dentry->d_inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(dentry->d_inode);
 
-out:
-    reiserfs_read_unlock_xattrs (dentry->d_sb);
-    reiserfs_write_unlock_xattr_i (dentry->d_inode);
-    return err;
+      out:
+	reiserfs_read_unlock_xattrs(dentry->d_sb);
+	reiserfs_write_unlock_xattr_i(dentry->d_inode);
+	return err;
 }
 
-
 /* This is what filldir will use:
  * r_pos will always contain the amount of space required for the entire
  * list. If r_pos becomes larger than r_size, we need more space and we
  * return an error indicating this. If r_pos is less than r_size, then we've
  * filled the buffer successfully and we return success */
 struct reiserfs_listxattr_buf {
-    int r_pos;
-    int r_size;
-    char *r_buf;
-    struct inode *r_inode;
+	int r_pos;
+	int r_size;
+	char *r_buf;
+	struct inode *r_inode;
 };
 
 static int
-reiserfs_listxattr_filler (void *buf, const char *name, int namelen,
-                           loff_t offset, ino_t ino, unsigned int d_type)
+reiserfs_listxattr_filler(void *buf, const char *name, int namelen,
+			  loff_t offset, ino_t ino, unsigned int d_type)
 {
-    struct reiserfs_listxattr_buf *b = (struct reiserfs_listxattr_buf *)buf;
-    int len = 0;
-    if (name[0] != '.' || (namelen != 1 && (name[1] != '.' || namelen != 2))) {
-        struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name);
-        if (!xah) return 0; /* Unsupported xattr name, skip it */
-
-        /* We call ->list() twice because the operation isn't required to just
-         * return the name back - we want to make sure we have enough space */
-        len += xah->list (b->r_inode, name, namelen, NULL);
-
-        if (len) {
-            if (b->r_pos + len + 1 <= b->r_size) {
-                char *p = b->r_buf + b->r_pos;
-                p += xah->list (b->r_inode, name, namelen, p);
-                *p++ = '\0';
-            }
-            b->r_pos += len + 1;
-        }
-    }
-
-    return 0;
+	struct reiserfs_listxattr_buf *b = (struct reiserfs_listxattr_buf *)buf;
+	int len = 0;
+	if (name[0] != '.'
+	    || (namelen != 1 && (name[1] != '.' || namelen != 2))) {
+		struct reiserfs_xattr_handler *xah =
+		    find_xattr_handler_prefix(name);
+		if (!xah)
+			return 0;	/* Unsupported xattr name, skip it */
+
+		/* We call ->list() twice because the operation isn't required to just
+		 * return the name back - we want to make sure we have enough space */
+		len += xah->list(b->r_inode, name, namelen, NULL);
+
+		if (len) {
+			if (b->r_pos + len + 1 <= b->r_size) {
+				char *p = b->r_buf + b->r_pos;
+				p += xah->list(b->r_inode, name, namelen, p);
+				*p++ = '\0';
+			}
+			b->r_pos += len + 1;
+		}
+	}
+
+	return 0;
 }
+
 /*
  * Inode operation listxattr()
  *
  * Preliminary locking: we down dentry->d_inode->i_sem
  */
-ssize_t
-reiserfs_listxattr (struct dentry *dentry, char *buffer, size_t size)
+ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
 {
-    struct file *fp;
-    struct dentry *dir;
-    int err = 0;
-    struct reiserfs_listxattr_buf buf;
-
-    if (!dentry->d_inode)
-        return -EINVAL;
-
-    if (!reiserfs_xattrs(dentry->d_sb) ||
-        get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1)
-        return -EOPNOTSUPP;
-
-    reiserfs_read_lock_xattr_i (dentry->d_inode);
-    reiserfs_read_lock_xattrs (dentry->d_sb);
-    dir = open_xa_dir (dentry->d_inode, FL_READONLY);
-    reiserfs_read_unlock_xattrs (dentry->d_sb);
-    if (IS_ERR (dir)) {
-        err = PTR_ERR (dir);
-        if (err == -ENODATA)
-            err = 0; /* Not an error if there aren't any xattrs */
-        goto out;
-    }
-
-    fp = dentry_open (dir, NULL, O_RDWR);
-    if (IS_ERR (fp)) {
-        err = PTR_ERR (fp);
-        /* dentry_open dputs the dentry if it fails */
-        goto out;
-    }
-
-    buf.r_buf = buffer;
-    buf.r_size = buffer ? size : 0;
-    buf.r_pos = 0;
-    buf.r_inode = dentry->d_inode;
-
-    REISERFS_I(dentry->d_inode)->i_flags |= i_has_xattr_dir;
-
-    err = xattr_readdir (fp, reiserfs_listxattr_filler, &buf);
-    if (err)
-        goto out_dir;
-
-    if (buf.r_pos > buf.r_size && buffer != NULL)
-        err = -ERANGE;
-    else
-        err = buf.r_pos;
-
-out_dir:
-    fput(fp);
-
-out:
-    reiserfs_read_unlock_xattr_i (dentry->d_inode);
-    return err;
+	struct file *fp;
+	struct dentry *dir;
+	int err = 0;
+	struct reiserfs_listxattr_buf buf;
+
+	if (!dentry->d_inode)
+		return -EINVAL;
+
+	if (!reiserfs_xattrs(dentry->d_sb) ||
+	    get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1)
+		return -EOPNOTSUPP;
+
+	reiserfs_read_lock_xattr_i(dentry->d_inode);
+	reiserfs_read_lock_xattrs(dentry->d_sb);
+	dir = open_xa_dir(dentry->d_inode, FL_READONLY);
+	reiserfs_read_unlock_xattrs(dentry->d_sb);
+	if (IS_ERR(dir)) {
+		err = PTR_ERR(dir);
+		if (err == -ENODATA)
+			err = 0;	/* Not an error if there aren't any xattrs */
+		goto out;
+	}
+
+	fp = dentry_open(dir, NULL, O_RDWR);
+	if (IS_ERR(fp)) {
+		err = PTR_ERR(fp);
+		/* dentry_open dputs the dentry if it fails */
+		goto out;
+	}
+
+	buf.r_buf = buffer;
+	buf.r_size = buffer ? size : 0;
+	buf.r_pos = 0;
+	buf.r_inode = dentry->d_inode;
+
+	REISERFS_I(dentry->d_inode)->i_flags |= i_has_xattr_dir;
+
+	err = xattr_readdir(fp, reiserfs_listxattr_filler, &buf);
+	if (err)
+		goto out_dir;
+
+	if (buf.r_pos > buf.r_size && buffer != NULL)
+		err = -ERANGE;
+	else
+		err = buf.r_pos;
+
+      out_dir:
+	fput(fp);
+
+      out:
+	reiserfs_read_unlock_xattr_i(dentry->d_inode);
+	return err;
 }
 
 /* This is the implementation for the xattr plugin infrastructure */
-static struct list_head xattr_handlers = LIST_HEAD_INIT (xattr_handlers);
+static struct list_head xattr_handlers = LIST_HEAD_INIT(xattr_handlers);
 static DEFINE_RWLOCK(handler_lock);
 
-static struct reiserfs_xattr_handler *
-find_xattr_handler_prefix (const char *prefix)
+static struct reiserfs_xattr_handler *find_xattr_handler_prefix(const char
+								*prefix)
 {
-    struct reiserfs_xattr_handler *xah = NULL;
-    struct list_head *p;
-
-    read_lock (&handler_lock);
-    list_for_each (p, &xattr_handlers) {
-        xah = list_entry (p, struct reiserfs_xattr_handler, handlers);
-        if (strncmp (xah->prefix, prefix, strlen (xah->prefix)) == 0)
-            break;
-        xah = NULL;
-    }
-
-    read_unlock (&handler_lock);
-    return xah;
+	struct reiserfs_xattr_handler *xah = NULL;
+	struct list_head *p;
+
+	read_lock(&handler_lock);
+	list_for_each(p, &xattr_handlers) {
+		xah = list_entry(p, struct reiserfs_xattr_handler, handlers);
+		if (strncmp(xah->prefix, prefix, strlen(xah->prefix)) == 0)
+			break;
+		xah = NULL;
+	}
+
+	read_unlock(&handler_lock);
+	return xah;
 }
 
-static void
-__unregister_handlers (void)
+static void __unregister_handlers(void)
 {
-    struct reiserfs_xattr_handler *xah;
-    struct list_head *p, *tmp;
+	struct reiserfs_xattr_handler *xah;
+	struct list_head *p, *tmp;
 
-    list_for_each_safe (p, tmp, &xattr_handlers) {
-        xah = list_entry (p, struct reiserfs_xattr_handler, handlers);
-        if (xah->exit)
-            xah->exit();
+	list_for_each_safe(p, tmp, &xattr_handlers) {
+		xah = list_entry(p, struct reiserfs_xattr_handler, handlers);
+		if (xah->exit)
+			xah->exit();
 
-        list_del_init (p);
-    }
-    INIT_LIST_HEAD (&xattr_handlers);
+		list_del_init(p);
+	}
+	INIT_LIST_HEAD(&xattr_handlers);
 }
 
-int __init
-reiserfs_xattr_register_handlers (void)
+int __init reiserfs_xattr_register_handlers(void)
 {
-    int err = 0;
-    struct reiserfs_xattr_handler *xah;
-    struct list_head *p;
+	int err = 0;
+	struct reiserfs_xattr_handler *xah;
+	struct list_head *p;
 
-    write_lock (&handler_lock);
+	write_lock(&handler_lock);
 
-    /* If we're already initialized, nothing to do */
-    if (!list_empty (&xattr_handlers)) {
-        write_unlock (&handler_lock);
-        return 0;
-    }
+	/* If we're already initialized, nothing to do */
+	if (!list_empty(&xattr_handlers)) {
+		write_unlock(&handler_lock);
+		return 0;
+	}
 
-    /* Add the handlers */
-    list_add_tail (&user_handler.handlers, &xattr_handlers);
-    list_add_tail (&trusted_handler.handlers, &xattr_handlers);
+	/* Add the handlers */
+	list_add_tail(&user_handler.handlers, &xattr_handlers);
+	list_add_tail(&trusted_handler.handlers, &xattr_handlers);
 #ifdef CONFIG_REISERFS_FS_SECURITY
-    list_add_tail (&security_handler.handlers, &xattr_handlers);
+	list_add_tail(&security_handler.handlers, &xattr_handlers);
 #endif
 #ifdef CONFIG_REISERFS_FS_POSIX_ACL
-    list_add_tail (&posix_acl_access_handler.handlers, &xattr_handlers);
-    list_add_tail (&posix_acl_default_handler.handlers, &xattr_handlers);
+	list_add_tail(&posix_acl_access_handler.handlers, &xattr_handlers);
+	list_add_tail(&posix_acl_default_handler.handlers, &xattr_handlers);
 #endif
 
-    /* Run initializers, if available */
-    list_for_each (p, &xattr_handlers) {
-        xah = list_entry (p, struct reiserfs_xattr_handler, handlers);
-        if (xah->init) {
-            err = xah->init ();
-            if (err) {
-                list_del_init (p);
-                break;
-            }
-        }
-    }
-
-    /* Clean up other handlers, if any failed */
-    if (err)
-        __unregister_handlers ();
-
-    write_unlock (&handler_lock);
-    return err;
+	/* Run initializers, if available */
+	list_for_each(p, &xattr_handlers) {
+		xah = list_entry(p, struct reiserfs_xattr_handler, handlers);
+		if (xah->init) {
+			err = xah->init();
+			if (err) {
+				list_del_init(p);
+				break;
+			}
+		}
+	}
+
+	/* Clean up other handlers, if any failed */
+	if (err)
+		__unregister_handlers();
+
+	write_unlock(&handler_lock);
+	return err;
 }
 
-void
-reiserfs_xattr_unregister_handlers (void)
+void reiserfs_xattr_unregister_handlers(void)
 {
-    write_lock (&handler_lock);
-    __unregister_handlers ();
-    write_unlock (&handler_lock);
+	write_lock(&handler_lock);
+	__unregister_handlers();
+	write_unlock(&handler_lock);
 }
 
 /* This will catch lookups from the fs root to .reiserfs_priv */
 static int
-xattr_lookup_poison (struct dentry *dentry, struct qstr *q1, struct qstr *name)
+xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name)
 {
-    struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root;
-    if (name->len == priv_root->d_name.len &&
-        name->hash == priv_root->d_name.hash &&
-        !memcmp (name->name, priv_root->d_name.name, name->len)) {
-            return -ENOENT;
-    } else if (q1->len == name->len &&
-               !memcmp(q1->name, name->name, name->len))
-        return 0;
-    return 1;
+	struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root;
+	if (name->len == priv_root->d_name.len &&
+	    name->hash == priv_root->d_name.hash &&
+	    !memcmp(name->name, priv_root->d_name.name, name->len)) {
+		return -ENOENT;
+	} else if (q1->len == name->len &&
+		   !memcmp(q1->name, name->name, name->len))
+		return 0;
+	return 1;
 }
 
 static struct dentry_operations xattr_lookup_poison_ops = {
-    .d_compare = xattr_lookup_poison,
+	.d_compare = xattr_lookup_poison,
 };
 
-
 /* We need to take a copy of the mount flags since things like
  * MS_RDONLY don't get set until *after* we're called.
  * mount_flags != mount_options */
-int
-reiserfs_xattr_init (struct super_block *s, int mount_flags)
+int reiserfs_xattr_init(struct super_block *s, int mount_flags)
 {
-  int err = 0;
-
-  /* We need generation numbers to ensure that the oid mapping is correct
-   * v3.5 filesystems don't have them. */
-  if (!old_format_only (s)) {
-    set_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt));
-  } else if (reiserfs_xattrs_optional (s)) {
-    /* Old format filesystem, but optional xattrs have been enabled
-     * at mount time. Error out. */
-    reiserfs_warning (s, "xattrs/ACLs not supported on pre v3.6 "
-                      "format filesystem. Failing mount.");
-    err = -EOPNOTSUPP;
-    goto error;
-  } else {
-    /* Old format filesystem, but no optional xattrs have been enabled. This
-     * means we silently disable xattrs on the filesystem. */
-    clear_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt));
-  }
-
-  /* If we don't have the privroot located yet - go find it */
-  if (reiserfs_xattrs (s) && !REISERFS_SB(s)->priv_root) {
-      struct dentry *dentry;
-      dentry = lookup_one_len (PRIVROOT_NAME, s->s_root,
-                               strlen (PRIVROOT_NAME));
-      if (!IS_ERR (dentry)) {
-        if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) {
-            struct inode *inode = dentry->d_parent->d_inode;
-            down (&inode->i_sem);
-            err = inode->i_op->mkdir (inode, dentry, 0700);
-            up (&inode->i_sem);
-            if (err) {
-                dput (dentry);
-                dentry = NULL;
-            }
-
-            if (dentry && dentry->d_inode)
-                reiserfs_warning (s, "Created %s on %s - reserved for "
-                                  "xattr storage.", PRIVROOT_NAME,
-                                  reiserfs_bdevname (inode->i_sb));
-        } else if (!dentry->d_inode) {
-            dput (dentry);
-            dentry = NULL;
-        }
-      } else
-        err = PTR_ERR (dentry);
-
-      if (!err && dentry) {
-          s->s_root->d_op = &xattr_lookup_poison_ops;
-          reiserfs_mark_inode_private (dentry->d_inode);
-          REISERFS_SB(s)->priv_root = dentry;
-      } else if (!(mount_flags & MS_RDONLY)) { /* xattrs are unavailable */
-          /* If we're read-only it just means that the dir hasn't been
-           * created. Not an error -- just no xattrs on the fs. We'll
-           * check again if we go read-write */
-          reiserfs_warning (s, "xattrs/ACLs enabled and couldn't "
-                            "find/create .reiserfs_priv. Failing mount.");
-          err = -EOPNOTSUPP;
-      }
-  }
-
-error:
-   /* This is only nonzero if there was an error initializing the xattr
-    * directory or if there is a condition where we don't support them. */
-    if (err) {
-          clear_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt));
-          clear_bit (REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt));
-          clear_bit (REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt));
-    }
-
-    /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */
-    s->s_flags = s->s_flags & ~MS_POSIXACL;
-    if (reiserfs_posixacl (s))
-	s->s_flags |= MS_POSIXACL;
-
-    return err;
+	int err = 0;
+
+	/* We need generation numbers to ensure that the oid mapping is correct
+	 * v3.5 filesystems don't have them. */
+	if (!old_format_only(s)) {
+		set_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt));
+	} else if (reiserfs_xattrs_optional(s)) {
+		/* Old format filesystem, but optional xattrs have been enabled
+		 * at mount time. Error out. */
+		reiserfs_warning(s, "xattrs/ACLs not supported on pre v3.6 "
+				 "format filesystem. Failing mount.");
+		err = -EOPNOTSUPP;
+		goto error;
+	} else {
+		/* Old format filesystem, but no optional xattrs have been enabled. This
+		 * means we silently disable xattrs on the filesystem. */
+		clear_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt));
+	}
+
+	/* If we don't have the privroot located yet - go find it */
+	if (reiserfs_xattrs(s) && !REISERFS_SB(s)->priv_root) {
+		struct dentry *dentry;
+		dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
+					strlen(PRIVROOT_NAME));
+		if (!IS_ERR(dentry)) {
+			if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) {
+				struct inode *inode = dentry->d_parent->d_inode;
+				down(&inode->i_sem);
+				err = inode->i_op->mkdir(inode, dentry, 0700);
+				up(&inode->i_sem);
+				if (err) {
+					dput(dentry);
+					dentry = NULL;
+				}
+
+				if (dentry && dentry->d_inode)
+					reiserfs_warning(s,
+							 "Created %s on %s - reserved for "
+							 "xattr storage.",
+							 PRIVROOT_NAME,
+							 reiserfs_bdevname
+							 (inode->i_sb));
+			} else if (!dentry->d_inode) {
+				dput(dentry);
+				dentry = NULL;
+			}
+		} else
+			err = PTR_ERR(dentry);
+
+		if (!err && dentry) {
+			s->s_root->d_op = &xattr_lookup_poison_ops;
+			reiserfs_mark_inode_private(dentry->d_inode);
+			REISERFS_SB(s)->priv_root = dentry;
+		} else if (!(mount_flags & MS_RDONLY)) {	/* xattrs are unavailable */
+			/* If we're read-only it just means that the dir hasn't been
+			 * created. Not an error -- just no xattrs on the fs. We'll
+			 * check again if we go read-write */
+			reiserfs_warning(s, "xattrs/ACLs enabled and couldn't "
+					 "find/create .reiserfs_priv. Failing mount.");
+			err = -EOPNOTSUPP;
+		}
+	}
+
+      error:
+	/* This is only nonzero if there was an error initializing the xattr
+	 * directory or if there is a condition where we don't support them. */
+	if (err) {
+		clear_bit(REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt));
+		clear_bit(REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt));
+		clear_bit(REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt));
+	}
+
+	/* The super_block MS_POSIXACL must mirror the (no)acl mount option. */
+	s->s_flags = s->s_flags & ~MS_POSIXACL;
+	if (reiserfs_posixacl(s))
+		s->s_flags |= MS_POSIXACL;
+
+	return err;
 }
 
 static int
-__reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd,
-                       int need_lock)
+__reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd,
+		      int need_lock)
 {
-	umode_t			mode = inode->i_mode;
+	umode_t mode = inode->i_mode;
 
 	if (mask & MAY_WRITE) {
 		/*
@@ -1363,50 +1365,50 @@ __reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd,
 	}
 
 	/* We don't do permission checks on the internal objects.
-	* Permissions are determined by the "owning" object. */
-        if (is_reiserfs_priv_object (inode))
+	 * Permissions are determined by the "owning" object. */
+	if (is_reiserfs_priv_object(inode))
 		return 0;
 
 	if (current->fsuid == inode->i_uid) {
 		mode >>= 6;
 #ifdef CONFIG_REISERFS_FS_POSIX_ACL
 	} else if (reiserfs_posixacl(inode->i_sb) &&
-                   get_inode_sd_version (inode) != STAT_DATA_V1) {
-                struct posix_acl *acl;
+		   get_inode_sd_version(inode) != STAT_DATA_V1) {
+		struct posix_acl *acl;
 
 		/* ACL can't contain additional permissions if
 		   the ACL_MASK entry is 0 */
 		if (!(mode & S_IRWXG))
 			goto check_groups;
 
-                if (need_lock) {
-		    reiserfs_read_lock_xattr_i (inode);
-                    reiserfs_read_lock_xattrs (inode->i_sb);
+		if (need_lock) {
+			reiserfs_read_lock_xattr_i(inode);
+			reiserfs_read_lock_xattrs(inode->i_sb);
+		}
+		acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
+		if (need_lock) {
+			reiserfs_read_unlock_xattrs(inode->i_sb);
+			reiserfs_read_unlock_xattr_i(inode);
 		}
-                acl = reiserfs_get_acl (inode, ACL_TYPE_ACCESS);
-                if (need_lock) {
-                    reiserfs_read_unlock_xattrs (inode->i_sb);
-		    reiserfs_read_unlock_xattr_i (inode);
+		if (IS_ERR(acl)) {
+			if (PTR_ERR(acl) == -ENODATA)
+				goto check_groups;
+			return PTR_ERR(acl);
 		}
-                if (IS_ERR (acl)) {
-                    if (PTR_ERR (acl) == -ENODATA)
-                        goto check_groups;
-                    return PTR_ERR (acl);
-                }
-
-                if (acl) {
-                    int err = posix_acl_permission (inode, acl, mask);
-                    posix_acl_release (acl);
-                    if (err == -EACCES) {
-                        goto check_capabilities;
-                    }
-                    return err;
+
+		if (acl) {
+			int err = posix_acl_permission(inode, acl, mask);
+			posix_acl_release(acl);
+			if (err == -EACCES) {
+				goto check_capabilities;
+			}
+			return err;
 		} else {
 			goto check_groups;
-                }
+		}
 #endif
 	} else {
-check_groups:
+	      check_groups:
 		if (in_group_p(inode->i_gid))
 			mode >>= 3;
 	}
@@ -1414,10 +1416,10 @@ check_groups:
 	/*
 	 * If the DACs are ok we don't need any capability check.
 	 */
-	if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask))
+	if (((mode & mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == mask))
 		return 0;
 
-check_capabilities:
+      check_capabilities:
 	/*
 	 * Read/write DACs are always overridable.
 	 * Executable DACs are overridable if at least one exec bit is set.
@@ -1437,14 +1439,13 @@ check_capabilities:
 	return -EACCES;
 }
 
-int
-reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd)
+int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd)
 {
-    return __reiserfs_permission (inode, mask, nd, 1);
+	return __reiserfs_permission(inode, mask, nd, 1);
 }
 
 int
-reiserfs_permission_locked (struct inode *inode, int mask, struct nameidata *nd)
+reiserfs_permission_locked(struct inode *inode, int mask, struct nameidata *nd)
 {
-    return __reiserfs_permission (inode, mask, nd, 0);
+	return __reiserfs_permission(inode, mask, nd, 0);
 }
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index c312881c5f53..6703efa3c430 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -9,7 +9,8 @@
 #include <linux/reiserfs_acl.h>
 #include <asm/uaccess.h>
 
-static int reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl);
+static int reiserfs_set_acl(struct inode *inode, int type,
+			    struct posix_acl *acl);
 
 static int
 xattr_set_acl(struct inode *inode, int type, const void *value, size_t size)
@@ -34,14 +35,13 @@ xattr_set_acl(struct inode *inode, int type, const void *value, size_t size)
 	} else
 		acl = NULL;
 
-	error = reiserfs_set_acl (inode, type, acl);
+	error = reiserfs_set_acl(inode, type, acl);
 
-release_and_out:
+      release_and_out:
 	posix_acl_release(acl);
 	return error;
 }
 
-
 static int
 xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
 {
@@ -51,7 +51,7 @@ xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
 	if (!reiserfs_posixacl(inode->i_sb))
 		return -EOPNOTSUPP;
 
-	acl = reiserfs_get_acl (inode, type);
+	acl = reiserfs_get_acl(inode, type);
 	if (IS_ERR(acl))
 		return PTR_ERR(acl);
 	if (acl == NULL)
@@ -62,12 +62,10 @@ xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size)
 	return error;
 }
 
-
 /*
  * Convert from filesystem to in-memory representation.
  */
-static struct posix_acl *
-posix_acl_from_disk(const void *value, size_t size)
+static struct posix_acl *posix_acl_from_disk(const void *value, size_t size)
 {
 	const char *end = (char *)value + size;
 	int n, count;
@@ -76,8 +74,8 @@ posix_acl_from_disk(const void *value, size_t size)
 	if (!value)
 		return NULL;
 	if (size < sizeof(reiserfs_acl_header))
-		 return ERR_PTR(-EINVAL);
-	if (((reiserfs_acl_header *)value)->a_version !=
+		return ERR_PTR(-EINVAL);
+	if (((reiserfs_acl_header *) value)->a_version !=
 	    cpu_to_le32(REISERFS_ACL_VERSION))
 		return ERR_PTR(-EINVAL);
 	value = (char *)value + sizeof(reiserfs_acl_header);
@@ -89,41 +87,39 @@ posix_acl_from_disk(const void *value, size_t size)
 	acl = posix_acl_alloc(count, GFP_NOFS);
 	if (!acl)
 		return ERR_PTR(-ENOMEM);
-	for (n=0; n < count; n++) {
-		reiserfs_acl_entry *entry =
-			(reiserfs_acl_entry *)value;
+	for (n = 0; n < count; n++) {
+		reiserfs_acl_entry *entry = (reiserfs_acl_entry *) value;
 		if ((char *)value + sizeof(reiserfs_acl_entry_short) > end)
 			goto fail;
-		acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
+		acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
 		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
-		switch(acl->a_entries[n].e_tag) {
-			case ACL_USER_OBJ:
-			case ACL_GROUP_OBJ:
-			case ACL_MASK:
-			case ACL_OTHER:
-				value = (char *)value +
-					sizeof(reiserfs_acl_entry_short);
-				acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
-				break;
-
-			case ACL_USER:
-			case ACL_GROUP:
-				value = (char *)value + sizeof(reiserfs_acl_entry);
-				if ((char *)value > end)
-					goto fail;
-				acl->a_entries[n].e_id =
-					le32_to_cpu(entry->e_id);
-				break;
-
-			default:
+		switch (acl->a_entries[n].e_tag) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			value = (char *)value +
+			    sizeof(reiserfs_acl_entry_short);
+			acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
+			break;
+
+		case ACL_USER:
+		case ACL_GROUP:
+			value = (char *)value + sizeof(reiserfs_acl_entry);
+			if ((char *)value > end)
 				goto fail;
+			acl->a_entries[n].e_id = le32_to_cpu(entry->e_id);
+			break;
+
+		default:
+			goto fail;
 		}
 	}
 	if (value != end)
 		goto fail;
 	return acl;
 
-fail:
+      fail:
 	posix_acl_release(acl);
 	return ERR_PTR(-EINVAL);
 }
@@ -131,46 +127,46 @@ fail:
 /*
  * Convert from in-memory to filesystem representation.
  */
-static void *
-posix_acl_to_disk(const struct posix_acl *acl, size_t *size)
+static void *posix_acl_to_disk(const struct posix_acl *acl, size_t * size)
 {
 	reiserfs_acl_header *ext_acl;
 	char *e;
 	int n;
 
 	*size = reiserfs_acl_size(acl->a_count);
-	ext_acl = (reiserfs_acl_header *)kmalloc(sizeof(reiserfs_acl_header) +
-		acl->a_count * sizeof(reiserfs_acl_entry), GFP_NOFS);
+	ext_acl = (reiserfs_acl_header *) kmalloc(sizeof(reiserfs_acl_header) +
+						  acl->a_count *
+						  sizeof(reiserfs_acl_entry),
+						  GFP_NOFS);
 	if (!ext_acl)
 		return ERR_PTR(-ENOMEM);
 	ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION);
 	e = (char *)ext_acl + sizeof(reiserfs_acl_header);
-	for (n=0; n < acl->a_count; n++) {
-		reiserfs_acl_entry *entry = (reiserfs_acl_entry *)e;
-		entry->e_tag  = cpu_to_le16(acl->a_entries[n].e_tag);
+	for (n = 0; n < acl->a_count; n++) {
+		reiserfs_acl_entry *entry = (reiserfs_acl_entry *) e;
+		entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
 		entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
-		switch(acl->a_entries[n].e_tag) {
-			case ACL_USER:
-			case ACL_GROUP:
-				entry->e_id =
-					cpu_to_le32(acl->a_entries[n].e_id);
-				e += sizeof(reiserfs_acl_entry);
-				break;
-
-			case ACL_USER_OBJ:
-			case ACL_GROUP_OBJ:
-			case ACL_MASK:
-			case ACL_OTHER:
-				e += sizeof(reiserfs_acl_entry_short);
-				break;
-
-			default:
-				goto fail;
+		switch (acl->a_entries[n].e_tag) {
+		case ACL_USER:
+		case ACL_GROUP:
+			entry->e_id = cpu_to_le32(acl->a_entries[n].e_id);
+			e += sizeof(reiserfs_acl_entry);
+			break;
+
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			e += sizeof(reiserfs_acl_entry_short);
+			break;
+
+		default:
+			goto fail;
 		}
 	}
 	return (char *)ext_acl;
 
-fail:
+      fail:
 	kfree(ext_acl);
 	return ERR_PTR(-EINVAL);
 }
@@ -181,59 +177,58 @@ fail:
  * inode->i_sem: down
  * BKL held [before 2.5.x]
  */
-struct posix_acl *
-reiserfs_get_acl(struct inode *inode, int type)
+struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
 {
 	char *name, *value;
 	struct posix_acl *acl, **p_acl;
 	size_t size;
 	int retval;
-        struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
-
-        switch (type) {
-            case ACL_TYPE_ACCESS:
-                name = POSIX_ACL_XATTR_ACCESS;
-                p_acl = &reiserfs_i->i_acl_access;
-                break;
-            case ACL_TYPE_DEFAULT:
-                name = POSIX_ACL_XATTR_DEFAULT;
-                p_acl = &reiserfs_i->i_acl_default;
-                break;
-            default:
-                return ERR_PTR (-EINVAL);
-        }
-
-        if (IS_ERR (*p_acl)) {
-            if (PTR_ERR (*p_acl) == -ENODATA)
-                return NULL;
-        } else if (*p_acl != NULL)
-            return posix_acl_dup (*p_acl);
-
-        size = reiserfs_xattr_get (inode, name, NULL, 0);
-        if ((int)size < 0) {
-            if (size == -ENODATA || size == -ENOSYS) {
-		*p_acl = ERR_PTR (-ENODATA);
-		return NULL;
-            }
-            return ERR_PTR (size);
-        }
+	struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		p_acl = &reiserfs_i->i_acl_access;
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		p_acl = &reiserfs_i->i_acl_default;
+		break;
+	default:
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (IS_ERR(*p_acl)) {
+		if (PTR_ERR(*p_acl) == -ENODATA)
+			return NULL;
+	} else if (*p_acl != NULL)
+		return posix_acl_dup(*p_acl);
+
+	size = reiserfs_xattr_get(inode, name, NULL, 0);
+	if ((int)size < 0) {
+		if (size == -ENODATA || size == -ENOSYS) {
+			*p_acl = ERR_PTR(-ENODATA);
+			return NULL;
+		}
+		return ERR_PTR(size);
+	}
 
-        value = kmalloc (size, GFP_NOFS);
-        if (!value)
-            return ERR_PTR (-ENOMEM);
+	value = kmalloc(size, GFP_NOFS);
+	if (!value)
+		return ERR_PTR(-ENOMEM);
 
 	retval = reiserfs_xattr_get(inode, name, value, size);
 	if (retval == -ENODATA || retval == -ENOSYS) {
 		/* This shouldn't actually happen as it should have
 		   been caught above.. but just in case */
 		acl = NULL;
-		*p_acl = ERR_PTR (-ENODATA);
-        } else if (retval < 0) {
+		*p_acl = ERR_PTR(-ENODATA);
+	} else if (retval < 0) {
 		acl = ERR_PTR(retval);
 	} else {
 		acl = posix_acl_from_disk(value, retval);
-		*p_acl = posix_acl_dup (acl);
-        }
+		*p_acl = posix_acl_dup(acl);
+	}
 
 	kfree(value);
 	return acl;
@@ -248,72 +243,72 @@ reiserfs_get_acl(struct inode *inode, int type)
 static int
 reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 {
-        char *name;
+	char *name;
 	void *value = NULL;
 	struct posix_acl **p_acl;
 	size_t size;
 	int error;
-        struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
+	struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
 
 	if (S_ISLNK(inode->i_mode))
 		return -EOPNOTSUPP;
 
-        switch (type) {
-            case ACL_TYPE_ACCESS:
-                name = POSIX_ACL_XATTR_ACCESS;
-                p_acl = &reiserfs_i->i_acl_access;
-                if (acl) {
-                    mode_t mode = inode->i_mode;
-                    error = posix_acl_equiv_mode (acl, &mode);
-                    if (error < 0)
-                        return error;
-                    else {
-                        inode->i_mode = mode;
-                        if (error == 0)
-                            acl = NULL;
-                    }
-                }
-                break;
-            case ACL_TYPE_DEFAULT:
-                name = POSIX_ACL_XATTR_DEFAULT;
-                p_acl = &reiserfs_i->i_acl_default;
-                if (!S_ISDIR (inode->i_mode))
-                    return acl ? -EACCES : 0;
-                break;
-            default:
-                return -EINVAL;
-        }
-
- 	if (acl) {
-            value = posix_acl_to_disk(acl, &size);
-            if (IS_ERR(value))
-                return (int)PTR_ERR(value);
-            error = reiserfs_xattr_set(inode, name, value, size, 0);
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		name = POSIX_ACL_XATTR_ACCESS;
+		p_acl = &reiserfs_i->i_acl_access;
+		if (acl) {
+			mode_t mode = inode->i_mode;
+			error = posix_acl_equiv_mode(acl, &mode);
+			if (error < 0)
+				return error;
+			else {
+				inode->i_mode = mode;
+				if (error == 0)
+					acl = NULL;
+			}
+		}
+		break;
+	case ACL_TYPE_DEFAULT:
+		name = POSIX_ACL_XATTR_DEFAULT;
+		p_acl = &reiserfs_i->i_acl_default;
+		if (!S_ISDIR(inode->i_mode))
+			return acl ? -EACCES : 0;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (acl) {
+		value = posix_acl_to_disk(acl, &size);
+		if (IS_ERR(value))
+			return (int)PTR_ERR(value);
+		error = reiserfs_xattr_set(inode, name, value, size, 0);
 	} else {
-            error = reiserfs_xattr_del (inode, name);
-            if (error == -ENODATA) {
-                /* This may seem odd here, but it means that the ACL was set
-                 * with a value representable with mode bits. If there was
-                 * an ACL before, reiserfs_xattr_del already dirtied the inode.
-                 */
-                mark_inode_dirty (inode);
-                error = 0;
-            }
-        }
+		error = reiserfs_xattr_del(inode, name);
+		if (error == -ENODATA) {
+			/* This may seem odd here, but it means that the ACL was set
+			 * with a value representable with mode bits. If there was
+			 * an ACL before, reiserfs_xattr_del already dirtied the inode.
+			 */
+			mark_inode_dirty(inode);
+			error = 0;
+		}
+	}
 
 	if (value)
 		kfree(value);
 
-        if (!error) {
-            /* Release the old one */
-            if (!IS_ERR (*p_acl) && *p_acl)
-                posix_acl_release (*p_acl);
+	if (!error) {
+		/* Release the old one */
+		if (!IS_ERR(*p_acl) && *p_acl)
+			posix_acl_release(*p_acl);
 
-            if (acl == NULL)
-                *p_acl = ERR_PTR (-ENODATA);
-            else
-                *p_acl = posix_acl_dup (acl);
-        }
+		if (acl == NULL)
+			*p_acl = ERR_PTR(-ENODATA);
+		else
+			*p_acl = posix_acl_dup(acl);
+	}
 
 	return error;
 }
@@ -321,192 +316,190 @@ reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 /* dir->i_sem: down,
  * inode is new and not released into the wild yet */
 int
-reiserfs_inherit_default_acl (struct inode *dir, struct dentry *dentry, struct inode *inode)
+reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry,
+			     struct inode *inode)
 {
-    struct posix_acl *acl;
-    int err = 0;
-
-    /* ACLs only get applied to files and directories */
-    if (S_ISLNK (inode->i_mode))
-        return 0;
-
-    /* ACLs can only be used on "new" objects, so if it's an old object
-     * there is nothing to inherit from */
-    if (get_inode_sd_version (dir) == STAT_DATA_V1)
-        goto apply_umask;
-
-    /* Don't apply ACLs to objects in the .reiserfs_priv tree.. This
-     * would be useless since permissions are ignored, and a pain because
-     * it introduces locking cycles */
-    if (is_reiserfs_priv_object (dir)) {
-        reiserfs_mark_inode_private (inode);
-        goto apply_umask;
-    }
-
-    acl = reiserfs_get_acl (dir, ACL_TYPE_DEFAULT);
-    if (IS_ERR (acl)) {
-        if (PTR_ERR (acl) == -ENODATA)
-            goto apply_umask;
-        return PTR_ERR (acl);
-    }
-
-    if (acl) {
-        struct posix_acl *acl_copy;
-        mode_t mode = inode->i_mode;
-        int need_acl;
-
-        /* Copy the default ACL to the default ACL of a new directory */
-        if (S_ISDIR (inode->i_mode)) {
-            err = reiserfs_set_acl (inode, ACL_TYPE_DEFAULT, acl);
-            if (err)
-                goto cleanup;
-        }
-
-        /* Now we reconcile the new ACL and the mode,
-           potentially modifying both */
-        acl_copy = posix_acl_clone (acl, GFP_NOFS);
-        if (!acl_copy) {
-            err = -ENOMEM;
-            goto cleanup;
-        }
-
-
-        need_acl = posix_acl_create_masq (acl_copy, &mode);
-        if (need_acl >= 0) {
-            if (mode != inode->i_mode) {
-                inode->i_mode = mode;
-            }
-
-            /* If we need an ACL.. */
-            if (need_acl > 0) {
-                err = reiserfs_set_acl (inode, ACL_TYPE_ACCESS, acl_copy);
-                if (err)
-                    goto cleanup_copy;
-            }
-        }
-cleanup_copy:
-        posix_acl_release (acl_copy);
-cleanup:
-        posix_acl_release (acl);
-    } else {
-apply_umask:
-        /* no ACL, apply umask */
-        inode->i_mode &= ~current->fs->umask;
-    }
-
-    return err;
+	struct posix_acl *acl;
+	int err = 0;
+
+	/* ACLs only get applied to files and directories */
+	if (S_ISLNK(inode->i_mode))
+		return 0;
+
+	/* ACLs can only be used on "new" objects, so if it's an old object
+	 * there is nothing to inherit from */
+	if (get_inode_sd_version(dir) == STAT_DATA_V1)
+		goto apply_umask;
+
+	/* Don't apply ACLs to objects in the .reiserfs_priv tree.. This
+	 * would be useless since permissions are ignored, and a pain because
+	 * it introduces locking cycles */
+	if (is_reiserfs_priv_object(dir)) {
+		reiserfs_mark_inode_private(inode);
+		goto apply_umask;
+	}
+
+	acl = reiserfs_get_acl(dir, ACL_TYPE_DEFAULT);
+	if (IS_ERR(acl)) {
+		if (PTR_ERR(acl) == -ENODATA)
+			goto apply_umask;
+		return PTR_ERR(acl);
+	}
+
+	if (acl) {
+		struct posix_acl *acl_copy;
+		mode_t mode = inode->i_mode;
+		int need_acl;
+
+		/* Copy the default ACL to the default ACL of a new directory */
+		if (S_ISDIR(inode->i_mode)) {
+			err = reiserfs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
+			if (err)
+				goto cleanup;
+		}
+
+		/* Now we reconcile the new ACL and the mode,
+		   potentially modifying both */
+		acl_copy = posix_acl_clone(acl, GFP_NOFS);
+		if (!acl_copy) {
+			err = -ENOMEM;
+			goto cleanup;
+		}
+
+		need_acl = posix_acl_create_masq(acl_copy, &mode);
+		if (need_acl >= 0) {
+			if (mode != inode->i_mode) {
+				inode->i_mode = mode;
+			}
+
+			/* If we need an ACL.. */
+			if (need_acl > 0) {
+				err =
+				    reiserfs_set_acl(inode, ACL_TYPE_ACCESS,
+						     acl_copy);
+				if (err)
+					goto cleanup_copy;
+			}
+		}
+	      cleanup_copy:
+		posix_acl_release(acl_copy);
+	      cleanup:
+		posix_acl_release(acl);
+	} else {
+	      apply_umask:
+		/* no ACL, apply umask */
+		inode->i_mode &= ~current->fs->umask;
+	}
+
+	return err;
 }
 
 /* Looks up and caches the result of the default ACL.
  * We do this so that we don't need to carry the xattr_sem into
  * reiserfs_new_inode if we don't need to */
-int
-reiserfs_cache_default_acl (struct inode *inode)
+int reiserfs_cache_default_acl(struct inode *inode)
 {
-    int ret = 0;
-    if (reiserfs_posixacl (inode->i_sb) &&
-        !is_reiserfs_priv_object (inode)) {
-        struct posix_acl *acl;
-        reiserfs_read_lock_xattr_i (inode);
-        reiserfs_read_lock_xattrs (inode->i_sb);
-        acl = reiserfs_get_acl (inode, ACL_TYPE_DEFAULT);
-        reiserfs_read_unlock_xattrs (inode->i_sb);
-        reiserfs_read_unlock_xattr_i (inode);
-        ret = acl ? 1 : 0;
-        posix_acl_release (acl);
-    }
-
-    return ret;
+	int ret = 0;
+	if (reiserfs_posixacl(inode->i_sb) && !is_reiserfs_priv_object(inode)) {
+		struct posix_acl *acl;
+		reiserfs_read_lock_xattr_i(inode);
+		reiserfs_read_lock_xattrs(inode->i_sb);
+		acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT);
+		reiserfs_read_unlock_xattrs(inode->i_sb);
+		reiserfs_read_unlock_xattr_i(inode);
+		ret = acl ? 1 : 0;
+		posix_acl_release(acl);
+	}
+
+	return ret;
 }
 
-int
-reiserfs_acl_chmod (struct inode *inode)
+int reiserfs_acl_chmod(struct inode *inode)
 {
-        struct posix_acl *acl, *clone;
-        int error;
+	struct posix_acl *acl, *clone;
+	int error;
 
-        if (S_ISLNK(inode->i_mode))
-                return -EOPNOTSUPP;
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
 
-	if (get_inode_sd_version (inode) == STAT_DATA_V1 ||
-	    !reiserfs_posixacl(inode->i_sb))
-        {
-	    return 0;
+	if (get_inode_sd_version(inode) == STAT_DATA_V1 ||
+	    !reiserfs_posixacl(inode->i_sb)) {
+		return 0;
 	}
 
-        reiserfs_read_lock_xattrs (inode->i_sb);
-        acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
-        reiserfs_read_unlock_xattrs (inode->i_sb);
-        if (!acl)
-                return 0;
-        if (IS_ERR(acl))
-                return PTR_ERR(acl);
-        clone = posix_acl_clone(acl, GFP_NOFS);
-        posix_acl_release(acl);
-        if (!clone)
-                return -ENOMEM;
-        error = posix_acl_chmod_masq(clone, inode->i_mode);
-        if (!error) {
-                int lock = !has_xattr_dir (inode);
-                reiserfs_write_lock_xattr_i (inode);
-                if (lock)
-                    reiserfs_write_lock_xattrs (inode->i_sb);
-                else
-                    reiserfs_read_lock_xattrs (inode->i_sb);
-                error = reiserfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
-                if (lock)
-                    reiserfs_write_unlock_xattrs (inode->i_sb);
-                else
-                    reiserfs_read_unlock_xattrs (inode->i_sb);
-                reiserfs_write_unlock_xattr_i (inode);
-        }
-        posix_acl_release(clone);
-        return error;
+	reiserfs_read_lock_xattrs(inode->i_sb);
+	acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS);
+	reiserfs_read_unlock_xattrs(inode->i_sb);
+	if (!acl)
+		return 0;
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	clone = posix_acl_clone(acl, GFP_NOFS);
+	posix_acl_release(acl);
+	if (!clone)
+		return -ENOMEM;
+	error = posix_acl_chmod_masq(clone, inode->i_mode);
+	if (!error) {
+		int lock = !has_xattr_dir(inode);
+		reiserfs_write_lock_xattr_i(inode);
+		if (lock)
+			reiserfs_write_lock_xattrs(inode->i_sb);
+		else
+			reiserfs_read_lock_xattrs(inode->i_sb);
+		error = reiserfs_set_acl(inode, ACL_TYPE_ACCESS, clone);
+		if (lock)
+			reiserfs_write_unlock_xattrs(inode->i_sb);
+		else
+			reiserfs_read_unlock_xattrs(inode->i_sb);
+		reiserfs_write_unlock_xattr_i(inode);
+	}
+	posix_acl_release(clone);
+	return error;
 }
 
 static int
 posix_acl_access_get(struct inode *inode, const char *name,
-			  void *buffer, size_t size)
+		     void *buffer, size_t size)
 {
-	if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS)-1)
+	if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1)
 		return -EINVAL;
 	return xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
 }
 
 static int
 posix_acl_access_set(struct inode *inode, const char *name,
-			  const void *value, size_t size, int flags)
+		     const void *value, size_t size, int flags)
 {
-	if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS)-1)
+	if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1)
 		return -EINVAL;
 	return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
 }
 
-static int
-posix_acl_access_del (struct inode *inode, const char *name)
+static int posix_acl_access_del(struct inode *inode, const char *name)
 {
-    struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
-    struct posix_acl **acl = &reiserfs_i->i_acl_access;
-    if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS)-1)
-	return -EINVAL;
-    if (!IS_ERR (*acl) && *acl) {
-        posix_acl_release (*acl);
-        *acl = ERR_PTR (-ENODATA);
-    }
-
-    return 0;
+	struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
+	struct posix_acl **acl = &reiserfs_i->i_acl_access;
+	if (strlen(name) != sizeof(POSIX_ACL_XATTR_ACCESS) - 1)
+		return -EINVAL;
+	if (!IS_ERR(*acl) && *acl) {
+		posix_acl_release(*acl);
+		*acl = ERR_PTR(-ENODATA);
+	}
+
+	return 0;
 }
 
 static int
-posix_acl_access_list (struct inode *inode, const char *name, int namelen, char *out)
+posix_acl_access_list(struct inode *inode, const char *name, int namelen,
+		      char *out)
 {
-    int len = namelen;
-    if (!reiserfs_posixacl (inode->i_sb))
-        return 0;
-    if (out)
-        memcpy (out, name, len);
+	int len = namelen;
+	if (!reiserfs_posixacl(inode->i_sb))
+		return 0;
+	if (out)
+		memcpy(out, name, len);
 
-    return len;
+	return len;
 }
 
 struct reiserfs_xattr_handler posix_acl_access_handler = {
@@ -518,48 +511,48 @@ struct reiserfs_xattr_handler posix_acl_access_handler = {
 };
 
 static int
-posix_acl_default_get (struct inode *inode, const char *name,
-			   void *buffer, size_t size)
+posix_acl_default_get(struct inode *inode, const char *name,
+		      void *buffer, size_t size)
 {
-	if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT)-1)
+	if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1)
 		return -EINVAL;
 	return xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
 }
 
 static int
 posix_acl_default_set(struct inode *inode, const char *name,
-			   const void *value, size_t size, int flags)
+		      const void *value, size_t size, int flags)
 {
-	if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT)-1)
+	if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1)
 		return -EINVAL;
 	return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
 }
 
-static int
-posix_acl_default_del (struct inode *inode, const char *name)
+static int posix_acl_default_del(struct inode *inode, const char *name)
 {
-    struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
-    struct posix_acl **acl = &reiserfs_i->i_acl_default;
-    if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT)-1)
-	return -EINVAL;
-    if (!IS_ERR (*acl) && *acl) {
-        posix_acl_release (*acl);
-        *acl = ERR_PTR (-ENODATA);
-    }
-
-    return 0;
+	struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode);
+	struct posix_acl **acl = &reiserfs_i->i_acl_default;
+	if (strlen(name) != sizeof(POSIX_ACL_XATTR_DEFAULT) - 1)
+		return -EINVAL;
+	if (!IS_ERR(*acl) && *acl) {
+		posix_acl_release(*acl);
+		*acl = ERR_PTR(-ENODATA);
+	}
+
+	return 0;
 }
 
 static int
-posix_acl_default_list (struct inode *inode, const char *name, int namelen, char *out)
+posix_acl_default_list(struct inode *inode, const char *name, int namelen,
+		       char *out)
 {
-    int len = namelen;
-    if (!reiserfs_posixacl (inode->i_sb))
-        return 0;
-    if (out)
-        memcpy (out, name, len);
+	int len = namelen;
+	if (!reiserfs_posixacl(inode->i_sb))
+		return 0;
+	if (out)
+		memcpy(out, name, len);
 
-    return len;
+	return len;
 }
 
 struct reiserfs_xattr_handler posix_acl_default_handler = {
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index e044d5117117..5e90a95ad60b 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -9,57 +9,55 @@
 #define XATTR_SECURITY_PREFIX "security."
 
 static int
-security_get (struct inode *inode, const char *name, void *buffer, size_t size)
+security_get(struct inode *inode, const char *name, void *buffer, size_t size)
 {
-    if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
-        return -EINVAL;
+	if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
+		return -EINVAL;
 
-    if (is_reiserfs_priv_object(inode))
-        return -EPERM;
+	if (is_reiserfs_priv_object(inode))
+		return -EPERM;
 
-    return reiserfs_xattr_get (inode, name, buffer, size);
+	return reiserfs_xattr_get(inode, name, buffer, size);
 }
 
 static int
-security_set (struct inode *inode, const char *name, const void *buffer,
-          size_t size, int flags)
+security_set(struct inode *inode, const char *name, const void *buffer,
+	     size_t size, int flags)
 {
-    if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
-        return -EINVAL;
+	if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
+		return -EINVAL;
 
-    if (is_reiserfs_priv_object(inode))
-        return -EPERM;
+	if (is_reiserfs_priv_object(inode))
+		return -EPERM;
 
-    return reiserfs_xattr_set (inode, name, buffer, size, flags);
+	return reiserfs_xattr_set(inode, name, buffer, size, flags);
 }
 
-static int
-security_del (struct inode *inode, const char *name)
+static int security_del(struct inode *inode, const char *name)
 {
-    if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
-        return -EINVAL;
+	if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
+		return -EINVAL;
 
-    if (is_reiserfs_priv_object(inode))
-        return -EPERM;
+	if (is_reiserfs_priv_object(inode))
+		return -EPERM;
 
-    return 0;
+	return 0;
 }
 
 static int
-security_list (struct inode *inode, const char *name, int namelen, char *out)
+security_list(struct inode *inode, const char *name, int namelen, char *out)
 {
-    int len = namelen;
+	int len = namelen;
 
-    if (is_reiserfs_priv_object(inode))
-        return 0;
+	if (is_reiserfs_priv_object(inode))
+		return 0;
 
-    if (out)
-        memcpy (out, name, len);
+	if (out)
+		memcpy(out, name, len);
 
-    return len;
+	return len;
 }
 
-
 struct reiserfs_xattr_handler security_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
 	.get = security_get,
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 43762197fb0a..2501f7e66ab9 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -9,69 +9,67 @@
 #define XATTR_TRUSTED_PREFIX "trusted."
 
 static int
-trusted_get (struct inode *inode, const char *name, void *buffer, size_t size)
+trusted_get(struct inode *inode, const char *name, void *buffer, size_t size)
 {
-    if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
-        return -EINVAL;
+	if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
+		return -EINVAL;
 
-    if (!reiserfs_xattrs (inode->i_sb))
-        return -EOPNOTSUPP;
+	if (!reiserfs_xattrs(inode->i_sb))
+		return -EOPNOTSUPP;
 
-    if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
-        return -EPERM;
+	if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
+		return -EPERM;
 
-    return reiserfs_xattr_get (inode, name, buffer, size);
+	return reiserfs_xattr_get(inode, name, buffer, size);
 }
 
 static int
-trusted_set (struct inode *inode, const char *name, const void *buffer,
-          size_t size, int flags)
+trusted_set(struct inode *inode, const char *name, const void *buffer,
+	    size_t size, int flags)
 {
-    if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
-        return -EINVAL;
+	if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
+		return -EINVAL;
 
-    if (!reiserfs_xattrs (inode->i_sb))
-        return -EOPNOTSUPP;
+	if (!reiserfs_xattrs(inode->i_sb))
+		return -EOPNOTSUPP;
 
-    if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
-        return -EPERM;
+	if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
+		return -EPERM;
 
-    return reiserfs_xattr_set (inode, name, buffer, size, flags);
+	return reiserfs_xattr_set(inode, name, buffer, size, flags);
 }
 
-static int
-trusted_del (struct inode *inode, const char *name)
+static int trusted_del(struct inode *inode, const char *name)
 {
-    if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
-        return -EINVAL;
+	if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
+		return -EINVAL;
 
-    if (!reiserfs_xattrs (inode->i_sb))
-        return -EOPNOTSUPP;
+	if (!reiserfs_xattrs(inode->i_sb))
+		return -EOPNOTSUPP;
 
-    if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
-        return -EPERM;
+	if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
+		return -EPERM;
 
-    return 0;
+	return 0;
 }
 
 static int
-trusted_list (struct inode *inode, const char *name, int namelen, char *out)
+trusted_list(struct inode *inode, const char *name, int namelen, char *out)
 {
-    int len = namelen;
+	int len = namelen;
 
-    if (!reiserfs_xattrs (inode->i_sb))
-        return 0;
+	if (!reiserfs_xattrs(inode->i_sb))
+		return 0;
 
-    if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
-        return 0;
+	if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode)))
+		return 0;
 
-    if (out)
-        memcpy (out, name, len);
+	if (out)
+		memcpy(out, name, len);
 
-    return len;
+	return len;
 }
 
-
 struct reiserfs_xattr_handler trusted_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.get = trusted_get,
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 0772806466a8..51458048ca66 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -13,81 +13,80 @@
 #define XATTR_USER_PREFIX "user."
 
 static int
-user_get (struct inode *inode, const char *name, void *buffer, size_t size)
+user_get(struct inode *inode, const char *name, void *buffer, size_t size)
 {
 
-    int error;
+	int error;
 
-    if (strlen(name) < sizeof(XATTR_USER_PREFIX))
-        return -EINVAL;
+	if (strlen(name) < sizeof(XATTR_USER_PREFIX))
+		return -EINVAL;
 
-    if (!reiserfs_xattrs_user (inode->i_sb))
-        return -EOPNOTSUPP;
+	if (!reiserfs_xattrs_user(inode->i_sb))
+		return -EOPNOTSUPP;
 
-    error = reiserfs_permission_locked (inode, MAY_READ, NULL);
-    if (error)
-        return error;
+	error = reiserfs_permission_locked(inode, MAY_READ, NULL);
+	if (error)
+		return error;
 
-    return reiserfs_xattr_get (inode, name, buffer, size);
+	return reiserfs_xattr_get(inode, name, buffer, size);
 }
 
 static int
-user_set (struct inode *inode, const char *name, const void *buffer,
-          size_t size, int flags)
+user_set(struct inode *inode, const char *name, const void *buffer,
+	 size_t size, int flags)
 {
 
-    int error;
+	int error;
 
-    if (strlen(name) < sizeof(XATTR_USER_PREFIX))
-        return -EINVAL;
+	if (strlen(name) < sizeof(XATTR_USER_PREFIX))
+		return -EINVAL;
 
-    if (!reiserfs_xattrs_user (inode->i_sb))
-        return -EOPNOTSUPP;
+	if (!reiserfs_xattrs_user(inode->i_sb))
+		return -EOPNOTSUPP;
 
-    if (!S_ISREG (inode->i_mode) &&
-        (!S_ISDIR (inode->i_mode) || inode->i_mode & S_ISVTX))
-        return -EPERM;
+	if (!S_ISREG(inode->i_mode) &&
+	    (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
+		return -EPERM;
 
-    error = reiserfs_permission_locked (inode, MAY_WRITE, NULL);
-    if (error)
-        return error;
+	error = reiserfs_permission_locked(inode, MAY_WRITE, NULL);
+	if (error)
+		return error;
 
-    return reiserfs_xattr_set (inode, name, buffer, size, flags);
+	return reiserfs_xattr_set(inode, name, buffer, size, flags);
 }
 
-static int
-user_del (struct inode *inode, const char *name)
+static int user_del(struct inode *inode, const char *name)
 {
-    int error;
+	int error;
 
-    if (strlen(name) < sizeof(XATTR_USER_PREFIX))
-        return -EINVAL;
+	if (strlen(name) < sizeof(XATTR_USER_PREFIX))
+		return -EINVAL;
 
-    if (!reiserfs_xattrs_user (inode->i_sb))
-        return -EOPNOTSUPP;
+	if (!reiserfs_xattrs_user(inode->i_sb))
+		return -EOPNOTSUPP;
 
-    if (!S_ISREG (inode->i_mode) &&
-        (!S_ISDIR (inode->i_mode) || inode->i_mode & S_ISVTX))
-        return -EPERM;
+	if (!S_ISREG(inode->i_mode) &&
+	    (!S_ISDIR(inode->i_mode) || inode->i_mode & S_ISVTX))
+		return -EPERM;
 
-    error = reiserfs_permission_locked (inode, MAY_WRITE, NULL);
-    if (error)
-        return error;
+	error = reiserfs_permission_locked(inode, MAY_WRITE, NULL);
+	if (error)
+		return error;
 
-    return 0;
+	return 0;
 }
 
 static int
-user_list (struct inode *inode, const char *name, int namelen, char *out)
+user_list(struct inode *inode, const char *name, int namelen, char *out)
 {
-    int len = namelen;
-    if (!reiserfs_xattrs_user (inode->i_sb))
-        return 0;
+	int len = namelen;
+	if (!reiserfs_xattrs_user(inode->i_sb))
+		return 0;
 
-    if (out)
-        memcpy (out, name, len);
+	if (out)
+		memcpy(out, name, len);
 
-    return len;
+	return len;
 }
 
 struct reiserfs_xattr_handler user_handler = {
diff --git a/include/linux/reiserfs_acl.h b/include/linux/reiserfs_acl.h
index 0760507a545b..0a3605099c44 100644
--- a/include/linux/reiserfs_acl.h
+++ b/include/linux/reiserfs_acl.h
@@ -4,29 +4,29 @@
 #define REISERFS_ACL_VERSION	0x0001
 
 typedef struct {
-	__le16		e_tag;
-	__le16		e_perm;
-	__le32		e_id;
+	__le16 e_tag;
+	__le16 e_perm;
+	__le32 e_id;
 } reiserfs_acl_entry;
 
 typedef struct {
-	__le16		e_tag;
-	__le16		e_perm;
+	__le16 e_tag;
+	__le16 e_perm;
 } reiserfs_acl_entry_short;
 
 typedef struct {
-	__le32		a_version;
+	__le32 a_version;
 } reiserfs_acl_header;
 
 static inline size_t reiserfs_acl_size(int count)
 {
 	if (count <= 4) {
 		return sizeof(reiserfs_acl_header) +
-		       count * sizeof(reiserfs_acl_entry_short);
+		    count * sizeof(reiserfs_acl_entry_short);
 	} else {
 		return sizeof(reiserfs_acl_header) +
-		       4 * sizeof(reiserfs_acl_entry_short) +
-		       (count - 4) * sizeof(reiserfs_acl_entry);
+		    4 * sizeof(reiserfs_acl_entry_short) +
+		    (count - 4) * sizeof(reiserfs_acl_entry);
 	}
 }
 
@@ -46,14 +46,14 @@ static inline int reiserfs_acl_count(size_t size)
 	}
 }
 
-
 #ifdef CONFIG_REISERFS_FS_POSIX_ACL
-struct posix_acl * reiserfs_get_acl(struct inode *inode, int type);
-int reiserfs_acl_chmod (struct inode *inode);
-int reiserfs_inherit_default_acl (struct inode *dir, struct dentry *dentry, struct inode *inode);
-int reiserfs_cache_default_acl (struct inode *dir);
-extern int reiserfs_xattr_posix_acl_init (void) __init;
-extern int reiserfs_xattr_posix_acl_exit (void);
+struct posix_acl *reiserfs_get_acl(struct inode *inode, int type);
+int reiserfs_acl_chmod(struct inode *inode);
+int reiserfs_inherit_default_acl(struct inode *dir, struct dentry *dentry,
+				 struct inode *inode);
+int reiserfs_cache_default_acl(struct inode *dir);
+extern int reiserfs_xattr_posix_acl_init(void) __init;
+extern int reiserfs_xattr_posix_acl_exit(void);
 extern struct reiserfs_xattr_handler posix_acl_default_handler;
 extern struct reiserfs_xattr_handler posix_acl_access_handler;
 #else
@@ -61,28 +61,26 @@ extern struct reiserfs_xattr_handler posix_acl_access_handler;
 #define reiserfs_get_acl NULL
 #define reiserfs_cache_default_acl(inode) 0
 
-static inline int
-reiserfs_xattr_posix_acl_init (void)
+static inline int reiserfs_xattr_posix_acl_init(void)
 {
-    return 0;
+	return 0;
 }
 
-static inline int
-reiserfs_xattr_posix_acl_exit (void)
+static inline int reiserfs_xattr_posix_acl_exit(void)
 {
-    return 0;
+	return 0;
 }
 
-static inline int
-reiserfs_acl_chmod (struct inode *inode)
+static inline int reiserfs_acl_chmod(struct inode *inode)
 {
-    return 0;
+	return 0;
 }
 
 static inline int
-reiserfs_inherit_default_acl (const struct inode *dir, struct dentry *dentry, struct inode *inode)
+reiserfs_inherit_default_acl(const struct inode *dir, struct dentry *dentry,
+			     struct inode *inode)
 {
-    return 0;
+	return 0;
 }
 
 #endif
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 4c7c5689ad93..17e458e17e2b 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -3,11 +3,10 @@
  */
 
 				/* this file has an amazingly stupid
-                                   name, yura please fix it to be
-                                   reiserfs.h, and merge all the rest
-                                   of our .h files that are in this
-                                   directory into it.  */
-
+				   name, yura please fix it to be
+				   reiserfs.h, and merge all the rest
+				   of our .h files that are in this
+				   directory into it.  */
 
 #ifndef _LINUX_REISER_FS_H
 #define _LINUX_REISER_FS_H
@@ -74,9 +73,9 @@
 /* debug levels.  Right now, CONFIG_REISERFS_CHECK means print all debug
 ** messages.
 */
-#define REISERFS_DEBUG_CODE 5 /* extra messages to help find/debug errors */ 
+#define REISERFS_DEBUG_CODE 5	/* extra messages to help find/debug errors */
 
-void reiserfs_warning (struct super_block *s, const char * fmt, ...);
+void reiserfs_warning(struct super_block *s, const char *fmt, ...);
 /* assertions handling */
 
 /** always check a condition and panic if it's false. */
@@ -105,82 +104,78 @@ if( !( cond ) ) 								\
  * Structure of super block on disk, a version of which in RAM is often accessed as REISERFS_SB(s)->s_rs
  * the version in RAM is part of a larger structure containing fields never written to disk.
  */
-#define UNSET_HASH 0 // read_super will guess about, what hash names
-                     // in directories were sorted with
+#define UNSET_HASH 0		// read_super will guess which hash the names
+		     // in directories were sorted with
 #define TEA_HASH  1
 #define YURA_HASH 2
 #define R5_HASH   3
 #define DEFAULT_HASH R5_HASH
 
-
 struct journal_params {
-    __le32 jp_journal_1st_block;	      /* where does journal start from on its
-				       * device */
-    __le32 jp_journal_dev;	      /* journal device st_rdev */
-    __le32 jp_journal_size;	      /* size of the journal */
-    __le32 jp_journal_trans_max;	      /* max number of blocks in a transaction. */
-    __le32 jp_journal_magic; 	      /* random value made on fs creation (this
-				       * was sb_journal_block_count) */
-    __le32 jp_journal_max_batch;	      /* max number of blocks to batch into a
-				       * trans */
-    __le32 jp_journal_max_commit_age;  /* in seconds, how old can an async
-				       * commit be */
-    __le32 jp_journal_max_trans_age;   /* in seconds, how old can a transaction
-				       * be */
+	__le32 jp_journal_1st_block;	/* where does journal start from on its
+					 * device */
+	__le32 jp_journal_dev;	/* journal device st_rdev */
+	__le32 jp_journal_size;	/* size of the journal */
+	__le32 jp_journal_trans_max;	/* max number of blocks in a transaction. */
+	__le32 jp_journal_magic;	/* random value made on fs creation (this
+					 * was sb_journal_block_count) */
+	__le32 jp_journal_max_batch;	/* max number of blocks to batch into a
+					 * trans */
+	__le32 jp_journal_max_commit_age;	/* in seconds, how old can an async
+						 * commit be */
+	__le32 jp_journal_max_trans_age;	/* in seconds, how old can a transaction
+						 * be */
 };
 
 /* this is the super from 3.5.X, where X >= 10 */
-struct reiserfs_super_block_v1
-{
-    __le32 s_block_count;	   /* blocks count         */
-    __le32 s_free_blocks;           /* free blocks count    */
-    __le32 s_root_block;            /* root block number    */
-    struct journal_params s_journal;
-    __le16 s_blocksize;             /* block size */
-    __le16 s_oid_maxsize;	   /* max size of object id array, see
-				    * get_objectid() commentary  */
-    __le16 s_oid_cursize;	   /* current size of object id array */
-    __le16 s_umount_state;          /* this is set to 1 when filesystem was
-				    * umounted, to 2 - when not */    
-    char s_magic[10];              /* reiserfs magic string indicates that
-				    * file system is reiserfs:
-				    * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs" */
-    __le16 s_fs_state;	           /* it is set to used by fsck to mark which
-				    * phase of rebuilding is done */
-    __le32 s_hash_function_code;    /* indicate, what hash function is being use
-				    * to sort names in a directory*/
-    __le16 s_tree_height;           /* height of disk tree */
-    __le16 s_bmap_nr;               /* amount of bitmap blocks needed to address
-				    * each block of file system */
-    __le16 s_version;               /* this field is only reliable on filesystem
-				    * with non-standard journal */
-    __le16 s_reserved_for_journal;  /* size in blocks of journal area on main
-				    * device, we need to keep after
-				    * making fs with non-standard journal */	
+struct reiserfs_super_block_v1 {
+	__le32 s_block_count;	/* blocks count         */
+	__le32 s_free_blocks;	/* free blocks count    */
+	__le32 s_root_block;	/* root block number    */
+	struct journal_params s_journal;
+	__le16 s_blocksize;	/* block size */
+	__le16 s_oid_maxsize;	/* max size of object id array, see
+				 * get_objectid() commentary  */
+	__le16 s_oid_cursize;	/* current size of object id array */
+	__le16 s_umount_state;	/* this is set to 1 when filesystem was
+				 * umounted, to 2 - when not */
+	char s_magic[10];	/* reiserfs magic string indicates that
+				 * file system is reiserfs:
+				 * "ReIsErFs" or "ReIsEr2Fs" or "ReIsEr3Fs" */
+	__le16 s_fs_state;	/* used by fsck to mark which
+				 * phase of rebuilding is done */
+	__le32 s_hash_function_code;	/* indicates which hash function is being used
+					 * to sort names in a directory */
+	__le16 s_tree_height;	/* height of disk tree */
+	__le16 s_bmap_nr;	/* amount of bitmap blocks needed to address
+				 * each block of file system */
+	__le16 s_version;	/* this field is only reliable on filesystem
+				 * with non-standard journal */
+	__le16 s_reserved_for_journal;	/* size in blocks of journal area on main
+					 * device, we need to keep after
+					 * making fs with non-standard journal */
 } __attribute__ ((__packed__));
 
 #define SB_SIZE_V1 (sizeof(struct reiserfs_super_block_v1))
 
 /* this is the on disk super block */
-struct reiserfs_super_block
-{
-    struct reiserfs_super_block_v1 s_v1;
-    __le32 s_inode_generation;
-    __le32 s_flags;                  /* Right now used only by inode-attributes, if enabled */
-    unsigned char s_uuid[16];       /* filesystem unique identifier */
-    unsigned char s_label[16];      /* filesystem volume label */
-    char s_unused[88] ;             /* zero filled by mkreiserfs and
-				     * reiserfs_convert_objectid_map_v1()
-				     * so any additions must be updated
-				     * there as well. */
-}  __attribute__ ((__packed__));
+struct reiserfs_super_block {
+	struct reiserfs_super_block_v1 s_v1;
+	__le32 s_inode_generation;
+	__le32 s_flags;		/* Right now used only by inode-attributes, if enabled */
+	unsigned char s_uuid[16];	/* filesystem unique identifier */
+	unsigned char s_label[16];	/* filesystem volume label */
+	char s_unused[88];	/* zero filled by mkreiserfs and
+				 * reiserfs_convert_objectid_map_v1()
+				 * so any additions must be updated
+				 * there as well. */
+} __attribute__ ((__packed__));
 
 #define SB_SIZE (sizeof(struct reiserfs_super_block))
 
 #define REISERFS_VERSION_1 0
 #define REISERFS_VERSION_2 2
 
-
 // on-disk super block fields converted to cpu form
 #define SB_DISK_SUPER_BLOCK(s) (REISERFS_SB(s)->s_rs)
 #define SB_V1_DISK_SUPER_BLOCK(s) (&(SB_DISK_SUPER_BLOCK(s)->s_v1))
@@ -210,13 +205,12 @@ struct reiserfs_super_block
 #define PUT_SB_TREE_HEIGHT(s, val) \
    do { SB_V1_DISK_SUPER_BLOCK(s)->s_tree_height = cpu_to_le16(val); } while (0)
 #define PUT_SB_REISERFS_STATE(s, val) \
-   do { SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state = cpu_to_le16(val); } while (0) 
+   do { SB_V1_DISK_SUPER_BLOCK(s)->s_umount_state = cpu_to_le16(val); } while (0)
 #define PUT_SB_VERSION(s, val) \
    do { SB_V1_DISK_SUPER_BLOCK(s)->s_version = cpu_to_le16(val); } while (0)
 #define PUT_SB_BMAP_NR(s, val) \
    do { SB_V1_DISK_SUPER_BLOCK(s)->s_bmap_nr = cpu_to_le16 (val); } while (0)
 
-
 #define SB_ONDISK_JP(s) (&SB_V1_DISK_SUPER_BLOCK(s)->s_journal)
 #define SB_ONDISK_JOURNAL_SIZE(s) \
          le32_to_cpu ((SB_ONDISK_JP(s)->jp_journal_size))
@@ -231,21 +225,19 @@ struct reiserfs_super_block
          block >= SB_JOURNAL_1st_RESERVED_BLOCK(s) \
          && block < SB_JOURNAL_1st_RESERVED_BLOCK(s) +  \
          ((!is_reiserfs_jr(SB_DISK_SUPER_BLOCK(s)) ? \
-         SB_ONDISK_JOURNAL_SIZE(s) + 1 : SB_ONDISK_RESERVED_FOR_JOURNAL(s))) 
-
-
+         SB_ONDISK_JOURNAL_SIZE(s) + 1 : SB_ONDISK_RESERVED_FOR_JOURNAL(s)))
 
 				/* used by gcc */
 #define REISERFS_SUPER_MAGIC 0x52654973
 				/* used by file system utilities that
-                                   look at the superblock, etc. */
+				   look at the superblock, etc. */
 #define REISERFS_SUPER_MAGIC_STRING "ReIsErFs"
 #define REISER2FS_SUPER_MAGIC_STRING "ReIsEr2Fs"
 #define REISER2FS_JR_SUPER_MAGIC_STRING "ReIsEr3Fs"
 
-int is_reiserfs_3_5 (struct reiserfs_super_block * rs);
-int is_reiserfs_3_6 (struct reiserfs_super_block * rs);
-int is_reiserfs_jr (struct reiserfs_super_block * rs);
+int is_reiserfs_3_5(struct reiserfs_super_block *rs);
+int is_reiserfs_3_6(struct reiserfs_super_block *rs);
+int is_reiserfs_jr(struct reiserfs_super_block *rs);
 
 /* ReiserFS leaves the first 64k unused, so that partition labels have
    enough space.  If someone wants to write a fancy bootloader that
@@ -272,8 +264,8 @@ typedef __u32 b_blocknr_t;
 typedef __le32 unp_t;
 
 struct unfm_nodeinfo {
-    unp_t unfm_nodenum;
-    unsigned short unfm_freespace;
+	unp_t unfm_nodenum;
+	unsigned short unfm_freespace;
 };
 
 /* there are two formats of keys: 3.5 and 3.6
@@ -285,7 +277,6 @@ struct unfm_nodeinfo {
 #define STAT_DATA_V1 0
 #define STAT_DATA_V2 1
 
-
 static inline struct reiserfs_inode_info *REISERFS_I(const struct inode *inode)
 {
 	return container_of(inode, struct reiserfs_inode_info, vfs_inode);
@@ -343,15 +334,13 @@ static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb)
    file would fit into one DIRECT item.
    Primary intention for this one is to increase performance by decreasing
    seeking.
-*/   
+*/
 #define STORE_TAIL_IN_UNFM_S2(n_file_size,n_tail_size,n_block_size) \
 (\
   (!(n_tail_size)) || \
   (((n_file_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) ) \
 )
 
-
-
 /*
  * values for s_umount_state field
  */
@@ -364,9 +353,9 @@ static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb)
 #define TYPE_STAT_DATA 0
 #define TYPE_INDIRECT 1
 #define TYPE_DIRECT 2
-#define TYPE_DIRENTRY 3 
-#define TYPE_MAXTYPE 3 
-#define TYPE_ANY 15 // FIXME: comment is required
+#define TYPE_DIRENTRY 3
+#define TYPE_MAXTYPE 3
+#define TYPE_ANY 15		// FIXME: comment is required
 
 /***************************************************************************/
 /*                       KEY & ITEM HEAD                                   */
@@ -376,60 +365,62 @@ static inline struct reiserfs_sb_info *REISERFS_SB(const struct super_block *sb)
 // directories use this key as well as old files
 //
 struct offset_v1 {
-    __le32 k_offset;
-    __le32 k_uniqueness;
+	__le32 k_offset;
+	__le32 k_uniqueness;
 } __attribute__ ((__packed__));
 
 struct offset_v2 {
 	__le64 v;
 } __attribute__ ((__packed__));
 
-static inline __u16 offset_v2_k_type( const struct offset_v2 *v2 )
+static inline __u16 offset_v2_k_type(const struct offset_v2 *v2)
 {
 	__u8 type = le64_to_cpu(v2->v) >> 60;
-	return (type <= TYPE_MAXTYPE)?type:TYPE_ANY;
+	return (type <= TYPE_MAXTYPE) ? type : TYPE_ANY;
 }
- 
-static inline void set_offset_v2_k_type( struct offset_v2 *v2, int type )
+
+static inline void set_offset_v2_k_type(struct offset_v2 *v2, int type)
 {
-	v2->v = (v2->v & cpu_to_le64(~0ULL>>4)) | cpu_to_le64((__u64)type<<60);
+	v2->v =
+	    (v2->v & cpu_to_le64(~0ULL >> 4)) | cpu_to_le64((__u64) type << 60);
 }
- 
-static inline loff_t offset_v2_k_offset( const struct offset_v2 *v2 )
+
+static inline loff_t offset_v2_k_offset(const struct offset_v2 *v2)
 {
-	return le64_to_cpu(v2->v) & (~0ULL>>4);
+	return le64_to_cpu(v2->v) & (~0ULL >> 4);
 }
 
-static inline void set_offset_v2_k_offset( struct offset_v2 *v2, loff_t offset ){
-	offset &= (~0ULL>>4);
-	v2->v = (v2->v & cpu_to_le64(15ULL<<60)) | cpu_to_le64(offset);
+static inline void set_offset_v2_k_offset(struct offset_v2 *v2, loff_t offset)
+{
+	offset &= (~0ULL >> 4);
+	v2->v = (v2->v & cpu_to_le64(15ULL << 60)) | cpu_to_le64(offset);
 }
 
 /* Key of an item determines its location in the S+tree, and
    is composed of 4 components */
 struct reiserfs_key {
-    __le32 k_dir_id;    /* packing locality: by default parent
-			  directory object id */
-    __le32 k_objectid;  /* object identifier */
-    union {
-	struct offset_v1 k_offset_v1;
-	struct offset_v2 k_offset_v2;
-    } __attribute__ ((__packed__)) u;
+	__le32 k_dir_id;	/* packing locality: by default parent
+				   directory object id */
+	__le32 k_objectid;	/* object identifier */
+	union {
+		struct offset_v1 k_offset_v1;
+		struct offset_v2 k_offset_v2;
+	} __attribute__ ((__packed__)) u;
 } __attribute__ ((__packed__));
 
 struct in_core_key {
-    __u32 k_dir_id;    /* packing locality: by default parent
-			  directory object id */
-    __u32 k_objectid;  /* object identifier */
-    __u64 k_offset;
-    __u8 k_type;
+	__u32 k_dir_id;		/* packing locality: by default parent
+				   directory object id */
+	__u32 k_objectid;	/* object identifier */
+	__u64 k_offset;
+	__u8 k_type;
 };
 
 struct cpu_key {
-    struct in_core_key on_disk_key;
-    int version;
-    int key_length; /* 3 in all cases but direct2indirect and
-		       indirect2direct conversion */
+	struct in_core_key on_disk_key;
+	int version;
+	int key_length;		/* 3 in all cases but direct2indirect and
+				   indirect2direct conversion */
 };
 
 /* Our function for comparing keys can compare keys of different
@@ -475,8 +466,7 @@ struct cpu_key {
     indirect items) and specifies the location of the item itself
     within the block.  */
 
-struct item_head
-{
+struct item_head {
 	/* Everything in the tree is found by searching for it based on
 	 * its key.*/
 	struct reiserfs_key ih_key;
@@ -492,13 +482,13 @@ struct item_head
 		   number of directory entries in the directory item. */
 		__le16 ih_entry_count;
 	} __attribute__ ((__packed__)) u;
-	__le16 ih_item_len;           /* total size of the item body */
-	__le16 ih_item_location;      /* an offset to the item body
-				      * within the block */
-	__le16 ih_version;	     /* 0 for all old items, 2 for new
-					ones. Highest bit is set by fsck
-					temporary, cleaned after all
-					done */
+	__le16 ih_item_len;	/* total size of the item body */
+	__le16 ih_item_location;	/* an offset to the item body
+					 * within the block */
+	__le16 ih_version;	/* 0 for all old items, 2 for new
+				   ones. Highest bit is set by fsck
+				   temporarily, cleared after all
+				   done */
 } __attribute__ ((__packed__));
 /* size of item header     */
 #define IH_SIZE (sizeof(struct item_head))
@@ -515,7 +505,6 @@ struct item_head
 #define put_ih_location(ih, val)     do { (ih)->ih_item_location = cpu_to_le16(val); } while (0)
 #define put_ih_item_len(ih, val)     do { (ih)->ih_item_len = cpu_to_le16(val); } while (0)
 
-
 #define unreachable_item(ih) (ih_version(ih) & (1 << 15))
 
 #define get_ih_free_space(ih) (ih_version (ih) == KEY_FORMAT_3_6 ? 0 : ih_free_space (ih))
@@ -537,40 +526,48 @@ struct item_head
 #define V1_INDIRECT_UNIQUENESS 0xfffffffe
 #define V1_DIRECT_UNIQUENESS 0xffffffff
 #define V1_DIRENTRY_UNIQUENESS 500
-#define V1_ANY_UNIQUENESS 555 // FIXME: comment is required
+#define V1_ANY_UNIQUENESS 555	// FIXME: comment is required
 
 //
 // here are conversion routines
 //
-static inline int uniqueness2type (__u32 uniqueness) CONSTF;
-static inline int uniqueness2type (__u32 uniqueness)
+static inline int uniqueness2type(__u32 uniqueness) CONSTF;
+static inline int uniqueness2type(__u32 uniqueness)
 {
-    switch ((int)uniqueness) {
-    case V1_SD_UNIQUENESS: return TYPE_STAT_DATA;
-    case V1_INDIRECT_UNIQUENESS: return TYPE_INDIRECT;
-    case V1_DIRECT_UNIQUENESS: return TYPE_DIRECT;
-    case V1_DIRENTRY_UNIQUENESS: return TYPE_DIRENTRY;
-    default:
-	    reiserfs_warning (NULL, "vs-500: unknown uniqueness %d",
-			      uniqueness);
+	switch ((int)uniqueness) {
+	case V1_SD_UNIQUENESS:
+		return TYPE_STAT_DATA;
+	case V1_INDIRECT_UNIQUENESS:
+		return TYPE_INDIRECT;
+	case V1_DIRECT_UNIQUENESS:
+		return TYPE_DIRECT;
+	case V1_DIRENTRY_UNIQUENESS:
+		return TYPE_DIRENTRY;
+	default:
+		reiserfs_warning(NULL, "vs-500: unknown uniqueness %d",
+				 uniqueness);
 	case V1_ANY_UNIQUENESS:
-	    return TYPE_ANY;
-    }
+		return TYPE_ANY;
+	}
 }
 
-static inline __u32 type2uniqueness (int type) CONSTF;
-static inline __u32 type2uniqueness (int type)
+static inline __u32 type2uniqueness(int type) CONSTF;
+static inline __u32 type2uniqueness(int type)
 {
-    switch (type) {
-    case TYPE_STAT_DATA: return V1_SD_UNIQUENESS;
-    case TYPE_INDIRECT: return V1_INDIRECT_UNIQUENESS;
-    case TYPE_DIRECT: return V1_DIRECT_UNIQUENESS;
-    case TYPE_DIRENTRY: return V1_DIRENTRY_UNIQUENESS;
-    default:
-	    reiserfs_warning (NULL, "vs-501: unknown type %d", type);
+	switch (type) {
+	case TYPE_STAT_DATA:
+		return V1_SD_UNIQUENESS;
+	case TYPE_INDIRECT:
+		return V1_INDIRECT_UNIQUENESS;
+	case TYPE_DIRECT:
+		return V1_DIRECT_UNIQUENESS;
+	case TYPE_DIRENTRY:
+		return V1_DIRENTRY_UNIQUENESS;
+	default:
+		reiserfs_warning(NULL, "vs-501: unknown type %d", type);
 	case TYPE_ANY:
-	    return V1_ANY_UNIQUENESS;
-    }
+		return V1_ANY_UNIQUENESS;
+	}
 }
 
 //
@@ -578,57 +575,56 @@ static inline __u32 type2uniqueness (int type)
 // there is no way to get version of object from key, so, provide
 // version to these defines
 //
-static inline loff_t le_key_k_offset (int version, const struct reiserfs_key * key)
+static inline loff_t le_key_k_offset(int version,
+				     const struct reiserfs_key *key)
 {
-    return (version == KEY_FORMAT_3_5) ?
-        le32_to_cpu( key->u.k_offset_v1.k_offset ) :
-	offset_v2_k_offset( &(key->u.k_offset_v2) );
+	return (version == KEY_FORMAT_3_5) ?
+	    le32_to_cpu(key->u.k_offset_v1.k_offset) :
+	    offset_v2_k_offset(&(key->u.k_offset_v2));
 }
 
-static inline loff_t le_ih_k_offset (const struct item_head * ih)
+static inline loff_t le_ih_k_offset(const struct item_head *ih)
 {
-    return le_key_k_offset (ih_version (ih), &(ih->ih_key));
+	return le_key_k_offset(ih_version(ih), &(ih->ih_key));
 }
 
-static inline loff_t le_key_k_type (int version, const struct reiserfs_key * key)
+static inline loff_t le_key_k_type(int version, const struct reiserfs_key *key)
 {
-    return (version == KEY_FORMAT_3_5) ?
-        uniqueness2type( le32_to_cpu( key->u.k_offset_v1.k_uniqueness)) :
-	offset_v2_k_type( &(key->u.k_offset_v2) );
+	return (version == KEY_FORMAT_3_5) ?
+	    uniqueness2type(le32_to_cpu(key->u.k_offset_v1.k_uniqueness)) :
+	    offset_v2_k_type(&(key->u.k_offset_v2));
 }
 
-static inline loff_t le_ih_k_type (const struct item_head * ih)
+static inline loff_t le_ih_k_type(const struct item_head *ih)
 {
-    return le_key_k_type (ih_version (ih), &(ih->ih_key));
+	return le_key_k_type(ih_version(ih), &(ih->ih_key));
 }
 
-
-static inline void set_le_key_k_offset (int version, struct reiserfs_key * key, loff_t offset)
+static inline void set_le_key_k_offset(int version, struct reiserfs_key *key,
+				       loff_t offset)
 {
-    (version == KEY_FORMAT_3_5) ?
-        (void)(key->u.k_offset_v1.k_offset = cpu_to_le32 (offset)) : /* jdm check */
-	(void)(set_offset_v2_k_offset( &(key->u.k_offset_v2), offset ));
+	(version == KEY_FORMAT_3_5) ? (void)(key->u.k_offset_v1.k_offset = cpu_to_le32(offset)) :	/* jdm check */
+	    (void)(set_offset_v2_k_offset(&(key->u.k_offset_v2), offset));
 }
 
-
-static inline void set_le_ih_k_offset (struct item_head * ih, loff_t offset)
+static inline void set_le_ih_k_offset(struct item_head *ih, loff_t offset)
 {
-    set_le_key_k_offset (ih_version (ih), &(ih->ih_key), offset);
+	set_le_key_k_offset(ih_version(ih), &(ih->ih_key), offset);
 }
 
-
-static inline void set_le_key_k_type (int version, struct reiserfs_key * key, int type)
+static inline void set_le_key_k_type(int version, struct reiserfs_key *key,
+				     int type)
 {
-    (version == KEY_FORMAT_3_5) ?
-        (void)(key->u.k_offset_v1.k_uniqueness = cpu_to_le32(type2uniqueness(type))):
-	(void)(set_offset_v2_k_type( &(key->u.k_offset_v2), type ));
+	(version == KEY_FORMAT_3_5) ?
+	    (void)(key->u.k_offset_v1.k_uniqueness =
+		   cpu_to_le32(type2uniqueness(type)))
+	    : (void)(set_offset_v2_k_type(&(key->u.k_offset_v2), type));
 }
-static inline void set_le_ih_k_type (struct item_head * ih, int type)
+static inline void set_le_ih_k_type(struct item_head *ih, int type)
 {
-    set_le_key_k_type (ih_version (ih), &(ih->ih_key), type);
+	set_le_key_k_type(ih_version(ih), &(ih->ih_key), type);
 }
 
-
 #define is_direntry_le_key(version,key) (le_key_k_type (version, key) == TYPE_DIRENTRY)
 #define is_direct_le_key(version,key) (le_key_k_type (version, key) == TYPE_DIRECT)
 #define is_indirect_le_key(version,key) (le_key_k_type (version, key) == TYPE_INDIRECT)
@@ -642,34 +638,32 @@ static inline void set_le_ih_k_type (struct item_head * ih, int type)
 #define is_indirect_le_ih(ih) is_indirect_le_key (ih_version(ih), &((ih)->ih_key))
 #define is_statdata_le_ih(ih) is_statdata_le_key (ih_version (ih), &((ih)->ih_key))
 
-
-
 //
 // key is pointer to cpu key, result is cpu
 //
-static inline loff_t cpu_key_k_offset (const struct cpu_key * key)
+static inline loff_t cpu_key_k_offset(const struct cpu_key *key)
 {
-    return key->on_disk_key.k_offset;
+	return key->on_disk_key.k_offset;
 }
 
-static inline loff_t cpu_key_k_type (const struct cpu_key * key)
+static inline loff_t cpu_key_k_type(const struct cpu_key *key)
 {
-    return key->on_disk_key.k_type;
+	return key->on_disk_key.k_type;
 }
 
-static inline void set_cpu_key_k_offset (struct cpu_key * key, loff_t offset)
+static inline void set_cpu_key_k_offset(struct cpu_key *key, loff_t offset)
 {
 	key->on_disk_key.k_offset = offset;
 }
 
-static inline void set_cpu_key_k_type (struct cpu_key * key, int type)
+static inline void set_cpu_key_k_type(struct cpu_key *key, int type)
 {
 	key->on_disk_key.k_type = type;
 }
 
-static inline void cpu_key_k_offset_dec (struct cpu_key * key)
+static inline void cpu_key_k_offset_dec(struct cpu_key *key)
 {
-	key->on_disk_key.k_offset --;
+	key->on_disk_key.k_offset--;
 }
 
 #define is_direntry_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRENTRY)
@@ -677,34 +671,25 @@ static inline void cpu_key_k_offset_dec (struct cpu_key * key)
 #define is_indirect_cpu_key(key) (cpu_key_k_type (key) == TYPE_INDIRECT)
 #define is_statdata_cpu_key(key) (cpu_key_k_type (key) == TYPE_STAT_DATA)
 
-
 /* are these used ? */
 #define is_direntry_cpu_ih(ih) (is_direntry_cpu_key (&((ih)->ih_key)))
 #define is_direct_cpu_ih(ih) (is_direct_cpu_key (&((ih)->ih_key)))
 #define is_indirect_cpu_ih(ih) (is_indirect_cpu_key (&((ih)->ih_key)))
 #define is_statdata_cpu_ih(ih) (is_statdata_cpu_key (&((ih)->ih_key)))
 
-
-
-
-
 #define I_K_KEY_IN_ITEM(p_s_ih, p_s_key, n_blocksize) \
     ( ! COMP_SHORT_KEYS(p_s_ih, p_s_key) && \
           I_OFF_BYTE_IN_ITEM(p_s_ih, k_offset (p_s_key), n_blocksize) )
 
-/* maximal length of item */ 
+/* maximal length of item */
 #define MAX_ITEM_LEN(block_size) (block_size - BLKH_SIZE - IH_SIZE)
 #define MIN_ITEM_LEN 1
 
-
 /* object identifier for root dir */
 #define REISERFS_ROOT_OBJECTID 2
 #define REISERFS_ROOT_PARENT_OBJECTID 1
 extern struct reiserfs_key root_key;
 
-
-
-
 /* 
  * Picture represents a leaf of the S+tree
  *  ______________________________________________________
@@ -716,13 +701,13 @@ extern struct reiserfs_key root_key;
 
 /* Header of a disk block.  More precisely, header of a formatted leaf
    or internal node, and not the header of an unformatted node. */
-struct block_head {       
-  __le16 blk_level;        /* Level of a block in the tree. */
-  __le16 blk_nr_item;      /* Number of keys/items in a block. */
-  __le16 blk_free_space;   /* Block free space in bytes. */
-  __le16 blk_reserved;
-				/* dump this in v4/planA */
-  struct reiserfs_key  blk_right_delim_key; /* kept only for compatibility */
+struct block_head {
+	__le16 blk_level;	/* Level of a block in the tree. */
+	__le16 blk_nr_item;	/* Number of keys/items in a block. */
+	__le16 blk_free_space;	/* Block free space in bytes. */
+	__le16 blk_reserved;
+	/* dump this in v4/planA */
+	struct reiserfs_key blk_right_delim_key;	/* kept only for compatibility */
 };
 
 #define BLKH_SIZE                     (sizeof(struct block_head))
@@ -741,12 +726,12 @@ struct block_head {
  * values for blk_level field of the struct block_head
  */
 
-#define FREE_LEVEL 0 /* when node gets removed from the tree its
-			blk_level is set to FREE_LEVEL. It is then
-			used to see whether the node is still in the
-			tree */
+#define FREE_LEVEL 0		/* when node gets removed from the tree its
+				   blk_level is set to FREE_LEVEL. It is then
+				   used to see whether the node is still in the
+				   tree */
 
-#define DISK_LEAF_NODE_LEVEL  1 /* Leaf node level.*/
+#define DISK_LEAF_NODE_LEVEL  1	/* Leaf node level. */
 
 /* Given the buffer head of a formatted node, resolve to the block head of that node. */
 #define B_BLK_HEAD(p_s_bh)            ((struct block_head *)((p_s_bh)->b_data))
@@ -759,7 +744,6 @@ struct block_head {
 #define PUT_B_LEVEL(p_s_bh,val)       do { set_blkh_level(B_BLK_HEAD(p_s_bh),val); } while (0)
 #define PUT_B_FREE_SPACE(p_s_bh,val)  do { set_blkh_free_space(B_BLK_HEAD(p_s_bh),val); } while (0)
 
-
 /* Get right delimiting key. -- little endian */
 #define B_PRIGHT_DELIM_KEY(p_s_bh)   (&(blk_right_delim_key(B_BLK_HEAD(p_s_bh))
 
@@ -770,41 +754,36 @@ struct block_head {
 #define B_IS_KEYS_LEVEL(p_s_bh)      (B_LEVEL(p_s_bh) > DISK_LEAF_NODE_LEVEL \
                                             && B_LEVEL(p_s_bh) <= MAX_HEIGHT)
 
-
-
-
 /***************************************************************************/
 /*                             STAT DATA                                   */
 /***************************************************************************/
 
-
 //
 // old stat data is 32 bytes long. We are going to distinguish new one by
 // different size
 //
-struct stat_data_v1
-{
-    __le16 sd_mode;	/* file type, permissions */
-    __le16 sd_nlink;	/* number of hard links */
-    __le16 sd_uid;		/* owner */
-    __le16 sd_gid;		/* group */
-    __le32 sd_size;	/* file size */
-    __le32 sd_atime;	/* time of last access */
-    __le32 sd_mtime;	/* time file was last modified  */
-    __le32 sd_ctime;	/* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */
-    union {
-	__le32 sd_rdev;
-	__le32 sd_blocks;	/* number of blocks file uses */
-    } __attribute__ ((__packed__)) u;
-    __le32 sd_first_direct_byte; /* first byte of file which is stored
-				   in a direct item: except that if it
-				   equals 1 it is a symlink and if it
-				   equals ~(__u32)0 there is no
-				   direct item.  The existence of this
-				   field really grates on me. Let's
-				   replace it with a macro based on
-				   sd_size and our tail suppression
-				   policy.  Someday.  -Hans */
+struct stat_data_v1 {
+	__le16 sd_mode;		/* file type, permissions */
+	__le16 sd_nlink;	/* number of hard links */
+	__le16 sd_uid;		/* owner */
+	__le16 sd_gid;		/* group */
+	__le32 sd_size;		/* file size */
+	__le32 sd_atime;	/* time of last access */
+	__le32 sd_mtime;	/* time file was last modified  */
+	__le32 sd_ctime;	/* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */
+	union {
+		__le32 sd_rdev;
+		__le32 sd_blocks;	/* number of blocks file uses */
+	} __attribute__ ((__packed__)) u;
+	__le32 sd_first_direct_byte;	/* first byte of file which is stored
+					   in a direct item: except that if it
+					   equals 1 it is a symlink and if it
+					   equals ~(__u32)0 there is no
+					   direct item.  The existence of this
+					   field really grates on me. Let's
+					   replace it with a macro based on
+					   sd_size and our tail suppression
+					   policy.  Someday.  -Hans */
 } __attribute__ ((__packed__));
 
 #define SD_V1_SIZE              (sizeof(struct stat_data_v1))
@@ -862,29 +841,29 @@ struct stat_data_v1
 /* Stat Data on disk (reiserfs version of UFS disk inode minus the
    address blocks) */
 struct stat_data {
-    __le16 sd_mode;	/* file type, permissions */
-    __le16 sd_attrs;     /* persistent inode flags */
-    __le32 sd_nlink;	/* number of hard links */
-    __le64 sd_size;	/* file size */
-    __le32 sd_uid;		/* owner */
-    __le32 sd_gid;		/* group */
-    __le32 sd_atime;	/* time of last access */
-    __le32 sd_mtime;	/* time file was last modified  */
-    __le32 sd_ctime;	/* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */
-    __le32 sd_blocks;
-    union {
-	__le32 sd_rdev;
-	__le32 sd_generation;
-      //__le32 sd_first_direct_byte;
-      /* first byte of file which is stored in a
-				       direct item: except that if it equals 1
-				       it is a symlink and if it equals
-				       ~(__u32)0 there is no direct item.  The
-				       existence of this field really grates
-				       on me. Let's replace it with a macro
-				       based on sd_size and our tail
-				       suppression policy? */
-  } __attribute__ ((__packed__)) u;
+	__le16 sd_mode;		/* file type, permissions */
+	__le16 sd_attrs;	/* persistent inode flags */
+	__le32 sd_nlink;	/* number of hard links */
+	__le64 sd_size;		/* file size */
+	__le32 sd_uid;		/* owner */
+	__le32 sd_gid;		/* group */
+	__le32 sd_atime;	/* time of last access */
+	__le32 sd_mtime;	/* time file was last modified  */
+	__le32 sd_ctime;	/* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */
+	__le32 sd_blocks;
+	union {
+		__le32 sd_rdev;
+		__le32 sd_generation;
+		//__le32 sd_first_direct_byte;
+		/* first byte of file which is stored in a
+		   direct item: except that if it equals 1
+		   it is a symlink and if it equals
+		   ~(__u32)0 there is no direct item.  The
+		   existence of this field really grates
+		   on me. Let's replace it with a macro
+		   based on sd_size and our tail
+		   suppression policy? */
+	} __attribute__ ((__packed__)) u;
 } __attribute__ ((__packed__));
 //
 // this is 44 bytes long
@@ -919,7 +898,6 @@ struct stat_data {
 #define sd_v2_attrs(sdp)         (le16_to_cpu((sdp)->sd_attrs))
 #define set_sd_v2_attrs(sdp,v)   ((sdp)->sd_attrs = cpu_to_le16(v))
 
-
 /***************************************************************************/
 /*                      DIRECTORY STRUCTURE                                */
 /***************************************************************************/
@@ -954,17 +932,14 @@ struct stat_data {
 /* NOT IMPLEMENTED:   
    Directory will someday contain stat data of object */
 
-
-
-struct reiserfs_de_head
-{
-  __le32 deh_offset;		/* third component of the directory entry key */
-  __le32 deh_dir_id;		/* objectid of the parent directory of the object, that is referenced
-					   by directory entry */
-  __le32 deh_objectid;		/* objectid of the object, that is referenced by directory entry */
-  __le16 deh_location;		/* offset of name in the whole item */
-  __le16 deh_state;		/* whether 1) entry contains stat data (for future), and 2) whether
-					   entry is hidden (unlinked) */
+struct reiserfs_de_head {
+	__le32 deh_offset;	/* third component of the directory entry key */
+	__le32 deh_dir_id;	/* objectid of the parent directory of the object, that is referenced
+				   by directory entry */
+	__le32 deh_objectid;	/* objectid of the object, that is referenced by directory entry */
+	__le16 deh_location;	/* offset of name in the whole item */
+	__le16 deh_state;	/* whether 1) entry contains stat data (for future), and 2) whether
+				   entry is hidden (unlinked) */
 } __attribute__ ((__packed__));
 #define DEH_SIZE                  sizeof(struct reiserfs_de_head)
 #define deh_offset(p_deh)         (le32_to_cpu((p_deh)->deh_offset))
@@ -986,7 +961,7 @@ struct reiserfs_de_head
 /* old format directories have this size when empty */
 #define EMPTY_DIR_SIZE_V1 (DEH_SIZE * 2 + 3)
 
-#define DEH_Statdata 0			/* not used now */
+#define DEH_Statdata 0		/* not used now */
 #define DEH_Visible 2
 
 /* 64 bit systems (and the S/390) need to be aligned explicitly -jdm */
@@ -1023,10 +998,10 @@ struct reiserfs_de_head
 #define de_visible(deh)	    	    test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
 #define de_hidden(deh)	    	    !test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
 
-extern void make_empty_dir_item_v1 (char * body, __le32 dirid, __le32 objid,
-				    __le32 par_dirid, __le32 par_objid);
-extern void make_empty_dir_item (char * body, __le32 dirid, __le32 objid,
-				 __le32 par_dirid, __le32 par_objid);
+extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
+				   __le32 par_dirid, __le32 par_objid);
+extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
+				__le32 par_dirid, __le32 par_objid);
 
 /* array of the entry headers */
  /* get item body */
@@ -1043,53 +1018,48 @@ extern void make_empty_dir_item (char * body, __le32 dirid, __le32 objid,
 #define I_DEH_N_ENTRY_LENGTH(ih,deh,i) \
 ((i) ? (deh_location((deh)-1) - deh_location((deh))) : (ih_item_len((ih)) - deh_location((deh))))
 */
-static inline int entry_length (const struct buffer_head * bh, 
-								const struct item_head * ih, int pos_in_item)
+static inline int entry_length(const struct buffer_head *bh,
+			       const struct item_head *ih, int pos_in_item)
 {
-    struct reiserfs_de_head * deh;
+	struct reiserfs_de_head *deh;
 
-    deh = B_I_DEH (bh, ih) + pos_in_item;
-    if (pos_in_item)
-	return deh_location(deh-1) - deh_location(deh);
+	deh = B_I_DEH(bh, ih) + pos_in_item;
+	if (pos_in_item)
+		return deh_location(deh - 1) - deh_location(deh);
 
-    return ih_item_len(ih) - deh_location(deh);
+	return ih_item_len(ih) - deh_location(deh);
 }
 
-
-
 /* number of entries in the directory item, depends on ENTRY_COUNT being at the start of directory dynamic data. */
 #define I_ENTRY_COUNT(ih) (ih_entry_count((ih)))
 
-
 /* name by bh, ih and entry_num */
 #define B_I_E_NAME(bh,ih,entry_num) ((char *)(bh->b_data + ih_location(ih) + deh_location(B_I_DEH(bh,ih)+(entry_num))))
 
 // two entries per block (at least)
 #define REISERFS_MAX_NAME(block_size) 255
 
-
 /* this structure is used for operations on directory entries. It is
    not a disk structure. */
 /* When reiserfs_find_entry or search_by_entry_key find directory
    entry, they return filled reiserfs_dir_entry structure */
-struct reiserfs_dir_entry
-{
-  struct buffer_head * de_bh;
-  int de_item_num;
-  struct item_head * de_ih;
-  int de_entry_num;
-  struct reiserfs_de_head * de_deh;
-  int de_entrylen;
-  int de_namelen;
-  char * de_name;
-  char * de_gen_number_bit_string;
-
-  __u32 de_dir_id;
-  __u32 de_objectid;
-
-  struct cpu_key de_entry_key;
+struct reiserfs_dir_entry {
+	struct buffer_head *de_bh;
+	int de_item_num;
+	struct item_head *de_ih;
+	int de_entry_num;
+	struct reiserfs_de_head *de_deh;
+	int de_entrylen;
+	int de_namelen;
+	char *de_name;
+	char *de_gen_number_bit_string;
+
+	__u32 de_dir_id;
+	__u32 de_objectid;
+
+	struct cpu_key de_entry_key;
 };
-   
+
 /* these defines are useful when a particular member of a reiserfs_dir_entry is needed */
 
 /* pointer to file name, stored in entry */
@@ -1099,8 +1069,6 @@ struct reiserfs_dir_entry
 #define I_DEH_N_ENTRY_FILE_NAME_LENGTH(ih,deh,entry_num) \
 (I_DEH_N_ENTRY_LENGTH (ih, deh, entry_num) - (de_with_sd (deh) ? SD_SIZE : 0))
 
-
-
 /* hash value occupies bits from 7 up to 30 */
 #define GET_HASH_VALUE(offset) ((offset) & 0x7fffff80LL)
 /* generation number occupies 7 bits starting from 0 up to 6 */
@@ -1109,7 +1077,6 @@ struct reiserfs_dir_entry
 
 #define SET_GENERATION_NUMBER(offset,gen_number) (GET_HASH_VALUE(offset)|(gen_number))
 
-
 /*
  * Picture represents an internal node of the reiserfs tree
  *  ______________________________________________________
@@ -1125,9 +1092,9 @@ struct reiserfs_dir_entry
 /* Disk child pointer: The pointer from an internal node of the tree
    to a node that is on disk. */
 struct disk_child {
-  __le32       dc_block_number;              /* Disk child's block number. */
-  __le16       dc_size;		            /* Disk child's used space.   */
-  __le16       dc_reserved;
+	__le32 dc_block_number;	/* Disk child's block number. */
+	__le16 dc_size;		/* Disk child's used space.   */
+	__le16 dc_reserved;
 };
 
 #define DC_SIZE (sizeof(struct disk_child))
@@ -1144,7 +1111,7 @@ struct disk_child {
 #define B_N_CHILD_NUM(p_s_bh,n_pos) (dc_block_number(B_N_CHILD(p_s_bh,n_pos)))
 #define PUT_B_N_CHILD_NUM(p_s_bh,n_pos, val) (put_dc_block_number(B_N_CHILD(p_s_bh,n_pos), val ))
 
- /* maximal value of field child_size in structure disk_child */ 
+ /* maximal value of field child_size in structure disk_child */
  /* child size is the combined size of all items and their headers */
 #define MAX_CHILD_SIZE(bh) ((int)( (bh)->b_size - BLKH_SIZE ))
 
@@ -1159,7 +1126,6 @@ struct disk_child {
 /*                      PATH STRUCTURES AND DEFINES                        */
 /***************************************************************************/
 
-
 /* Search_by_key fills up the path from the root to the leaf as it descends the tree looking for the
    key.  It uses reiserfs_bread to try to find buffers in the cache given their block number.  If it
    does not find them in the cache it reads them from disk.  For each node search_by_key finds using
@@ -1168,20 +1134,18 @@ struct disk_child {
    is looking through a leaf node bin_search will find the position of the item which has key either
    equal to given key, or which is the maximal key less than the given key. */
 
-struct  path_element  {
-  struct buffer_head *	pe_buffer;    /* Pointer to the buffer at the path in the tree. */
-  int         		pe_position;  /* Position in the tree node which is placed in the */
-                                      /* buffer above.                                  */
+struct path_element {
+	struct buffer_head *pe_buffer;	/* Pointer to the buffer at the path in the tree. */
+	int pe_position;	/* Position in the tree node which is placed in the */
+	/* buffer above.                                  */
 };
 
-#define MAX_HEIGHT 5 /* maximal height of a tree. don't change this without changing JOURNAL_PER_BALANCE_CNT */
-#define EXTENDED_MAX_HEIGHT         7 /* Must be equals MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */
-#define FIRST_PATH_ELEMENT_OFFSET   2 /* Must be equal to at least 2. */
-
-#define ILLEGAL_PATH_ELEMENT_OFFSET 1 /* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */
-#define MAX_FEB_SIZE 6   /* this MUST be MAX_HEIGHT + 1. See about FEB below */
-
+#define MAX_HEIGHT 5		/* maximal height of a tree. don't change this without changing JOURNAL_PER_BALANCE_CNT */
+#define EXTENDED_MAX_HEIGHT         7	/* Must be equal to MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */
+#define FIRST_PATH_ELEMENT_OFFSET   2	/* Must be equal to at least 2. */
 
+#define ILLEGAL_PATH_ELEMENT_OFFSET 1	/* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */
+#define MAX_FEB_SIZE 6		/* this MUST be MAX_HEIGHT + 1. See about FEB below */
 
 /* We need to keep track of who the ancestors of nodes are.  When we
    perform a search we record which nodes were visited while
@@ -1200,14 +1164,14 @@ excessive effort to avoid disturbing the precious VFS code.:-( The
 gods only know how we are going to SMP the code that uses them.
 znodes are the way! */
 
-#define PATH_READA	0x1 /* do read ahead */
-#define PATH_READA_BACK 0x2 /* read backwards */
+#define PATH_READA	0x1	/* do read ahead */
+#define PATH_READA_BACK 0x2	/* read backwards */
 
-struct  path {
-  int                   path_length;                      	/* Length of the array above.   */
-  int			reada;
-  struct  path_element  path_elements[EXTENDED_MAX_HEIGHT];	/* Array of the path elements.  */
-  int			pos_in_item;
+struct path {
+	int path_length;	/* Length of the array above.   */
+	int reada;
+	struct path_element path_elements[EXTENDED_MAX_HEIGHT];	/* Array of the path elements.  */
+	int pos_in_item;
 };
 
 #define pos_in_item(path) ((path)->pos_in_item)
@@ -1224,25 +1188,23 @@ struct path var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,}
 /* Get position in the element at the path by path and path position. */
 #define PATH_OFFSET_POSITION(p_s_path,n_offset) (PATH_OFFSET_PELEMENT(p_s_path,n_offset)->pe_position)
 
-
 #define PATH_PLAST_BUFFER(p_s_path) (PATH_OFFSET_PBUFFER((p_s_path), (p_s_path)->path_length))
 				/* you know, to the person who didn't
-                                   write this the macro name does not
-                                   at first suggest what it does.
-                                   Maybe POSITION_FROM_PATH_END? Or
-                                   maybe we should just focus on
-                                   dumping paths... -Hans */
+				   write this the macro name does not
+				   at first suggest what it does.
+				   Maybe POSITION_FROM_PATH_END? Or
+				   maybe we should just focus on
+				   dumping paths... -Hans */
 #define PATH_LAST_POSITION(p_s_path) (PATH_OFFSET_POSITION((p_s_path), (p_s_path)->path_length))
 
-
 #define PATH_PITEM_HEAD(p_s_path)    B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_path),PATH_LAST_POSITION(p_s_path))
 
 /* in do_balance leaf has h == 0 in contrast with path structure,
    where root has level == 0. That is why we need these defines */
 #define PATH_H_PBUFFER(p_s_path, h) PATH_OFFSET_PBUFFER (p_s_path, p_s_path->path_length - (h))	/* tb->S[h] */
-#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER (path, (h) + 1)			/* tb->F[h] or tb->S[0]->b_parent */
-#define PATH_H_POSITION(path, h) PATH_OFFSET_POSITION (path, path->path_length - (h))	
-#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1)		/* tb->S[h]->b_item_order */
+#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER (path, (h) + 1)	/* tb->F[h] or tb->S[0]->b_parent */
+#define PATH_H_POSITION(path, h) PATH_OFFSET_POSITION (path, path->path_length - (h))
+#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1)	/* tb->S[h]->b_item_order */
 
 #define PATH_H_PATH_OFFSET(p_s_path, n_h) ((p_s_path)->path_length - (n_h))
 
@@ -1253,7 +1215,6 @@ struct path var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,}
 #define item_moved(ih,path) comp_items(ih, path)
 #define path_changed(ih,path) comp_items (ih, path)
 
-
 /***************************************************************************/
 /*                       MISC                                              */
 /***************************************************************************/
@@ -1272,30 +1233,26 @@ struct path var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET, .reada = 0,}
 // reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset
 #define U32_MAX (~(__u32)0)
 
-static inline loff_t max_reiserfs_offset (struct inode * inode)
+static inline loff_t max_reiserfs_offset(struct inode *inode)
 {
-    if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5)
-	return (loff_t)U32_MAX;
+	if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5)
+		return (loff_t) U32_MAX;
 
-    return (loff_t)((~(__u64)0) >> 4);
+	return (loff_t) ((~(__u64) 0) >> 4);
 }
 
-
 /*#define MAX_KEY_UNIQUENESS	MAX_UL_INT*/
 #define MAX_KEY_OBJECTID	MAX_UL_INT
 
-
 #define MAX_B_NUM  MAX_UL_INT
 #define MAX_FC_NUM MAX_US_INT
 
-
 /* the purpose is to detect overflow of an unsigned short */
 #define REISERFS_LINK_MAX (MAX_US_INT - 1000)
 
-
 /* The following defines are used in reiserfs_insert_item and reiserfs_append_item  */
-#define REISERFS_KERNEL_MEM		0	/* reiserfs kernel memory mode	*/
-#define REISERFS_USER_MEM		1	/* reiserfs user memory mode		*/
+#define REISERFS_KERNEL_MEM		0	/* reiserfs kernel memory mode  */
+#define REISERFS_USER_MEM		1	/* reiserfs user memory mode            */
 
 #define fs_generation(s) (REISERFS_SB(s)->s_generation_counter)
 #define get_generation(s) atomic_read (&fs_generation(s))
@@ -1303,7 +1260,6 @@ static inline loff_t max_reiserfs_offset (struct inode * inode)
 #define __fs_changed(gen,s) (gen != get_generation (s))
 #define fs_changed(gen,s) ({cond_resched(); __fs_changed(gen, s);})
 
-
 /***************************************************************************/
 /*                  FIXATE NODES                                           */
 /***************************************************************************/
@@ -1324,38 +1280,34 @@ static inline loff_t max_reiserfs_offset (struct inode * inode)
    calculating what we can shift to neighbors and how many nodes we
    have to have if we do not any shiftings, if we shift to left/right
    neighbor or to both. */
-struct virtual_item
-{
-    int vi_index; // index in the array of item operations
-    unsigned short vi_type;	// left/right mergeability
-    unsigned short vi_item_len;           /* length of item that it will have after balancing */
-    struct item_head * vi_ih;
-    const char * vi_item;     // body of item (old or new)
-    const void * vi_new_data; // 0 always but paste mode
-    void * vi_uarea;    // item specific area
+struct virtual_item {
+	int vi_index;		// index in the array of item operations
+	unsigned short vi_type;	// left/right mergeability
+	unsigned short vi_item_len;	/* length of item that it will have after balancing */
+	struct item_head *vi_ih;
+	const char *vi_item;	// body of item (old or new)
+	const void *vi_new_data;	// 0 always but paste mode
+	void *vi_uarea;		// item specific area
 };
 
-
-struct virtual_node
-{
-  char * vn_free_ptr;		/* this is a pointer to the free space in the buffer */
-  unsigned short vn_nr_item;	/* number of items in virtual node */
-  short vn_size;        	/* size of node , that node would have if it has unlimited size and no balancing is performed */
-  short vn_mode;		/* mode of balancing (paste, insert, delete, cut) */
-  short vn_affected_item_num; 
-  short vn_pos_in_item;
-  struct item_head * vn_ins_ih;	/* item header of inserted item, 0 for other modes */
-  const void * vn_data;
-  struct virtual_item * vn_vi;	/* array of items (including a new one, excluding item to be deleted) */
+struct virtual_node {
+	char *vn_free_ptr;	/* this is a pointer to the free space in the buffer */
+	unsigned short vn_nr_item;	/* number of items in virtual node */
+	short vn_size;		/* size of node , that node would have if it has unlimited size and no balancing is performed */
+	short vn_mode;		/* mode of balancing (paste, insert, delete, cut) */
+	short vn_affected_item_num;
+	short vn_pos_in_item;
+	struct item_head *vn_ins_ih;	/* item header of inserted item, 0 for other modes */
+	const void *vn_data;
+	struct virtual_item *vn_vi;	/* array of items (including a new one, excluding item to be deleted) */
 };
 
 /* used by directory items when creating virtual nodes */
 struct direntry_uarea {
-    int flags;
-    __u16 entry_count;
-    __u16 entry_sizes[1];
-} __attribute__ ((__packed__)) ;
-
+	int flags;
+	__u16 entry_count;
+	__u16 entry_sizes[1];
+} __attribute__ ((__packed__));
 
 /***************************************************************************/
 /*                  TREE BALANCE                                           */
@@ -1378,73 +1330,72 @@ struct direntry_uarea {
 #define MAX_AMOUNT_NEEDED 2
 
 /* someday somebody will prefix every field in this struct with tb_ */
-struct tree_balance
-{
-  int tb_mode;
-  int need_balance_dirty;
-  struct super_block * tb_sb;
-  struct reiserfs_transaction_handle *transaction_handle ;
-  struct path * tb_path;
-  struct buffer_head * L[MAX_HEIGHT];        /* array of left neighbors of nodes in the path */
-  struct buffer_head * R[MAX_HEIGHT];        /* array of right neighbors of nodes in the path*/
-  struct buffer_head * FL[MAX_HEIGHT];       /* array of fathers of the left  neighbors      */
-  struct buffer_head * FR[MAX_HEIGHT];       /* array of fathers of the right neighbors      */
-  struct buffer_head * CFL[MAX_HEIGHT];      /* array of common parents of center node and its left neighbor  */
-  struct buffer_head * CFR[MAX_HEIGHT];      /* array of common parents of center node and its right neighbor */
-
-  struct buffer_head * FEB[MAX_FEB_SIZE]; /* array of empty buffers. Number of buffers in array equals
-					     cur_blknum. */
-  struct buffer_head * used[MAX_FEB_SIZE];
-  struct buffer_head * thrown[MAX_FEB_SIZE];
-  int lnum[MAX_HEIGHT];	/* array of number of items which must be
-			   shifted to the left in order to balance the
-			   current node; for leaves includes item that
-			   will be partially shifted; for internal
-			   nodes, it is the number of child pointers
-			   rather than items. It includes the new item
-			   being created. The code sometimes subtracts
-			   one to get the number of wholly shifted
-			   items for other purposes. */
-  int rnum[MAX_HEIGHT];	/* substitute right for left in comment above */
-  int lkey[MAX_HEIGHT];               /* array indexed by height h mapping the key delimiting L[h] and
-					       S[h] to its item number within the node CFL[h] */
-  int rkey[MAX_HEIGHT];               /* substitute r for l in comment above */
-  int insert_size[MAX_HEIGHT];        /* the number of bytes by we are trying to add or remove from
-					       S[h]. A negative value means removing.  */
-  int blknum[MAX_HEIGHT];             /* number of nodes that will replace node S[h] after
-					       balancing on the level h of the tree.  If 0 then S is
-					       being deleted, if 1 then S is remaining and no new nodes
-					       are being created, if 2 or 3 then 1 or 2 new nodes is
-					       being created */
-
-  /* fields that are used only for balancing leaves of the tree */
-  int cur_blknum;	/* number of empty blocks having been already allocated			*/
-  int s0num;             /* number of items that fall into left most  node when S[0] splits	*/
-  int s1num;             /* number of items that fall into first  new node when S[0] splits	*/
-  int s2num;             /* number of items that fall into second new node when S[0] splits	*/
-  int lbytes;            /* number of bytes which can flow to the left neighbor from the	left	*/
-  /* most liquid item that cannot be shifted from S[0] entirely		*/
-  /* if -1 then nothing will be partially shifted */
-  int rbytes;            /* number of bytes which will flow to the right neighbor from the right	*/
-  /* most liquid item that cannot be shifted from S[0] entirely		*/
-  /* if -1 then nothing will be partially shifted                           */
-  int s1bytes;		/* number of bytes which flow to the first  new node when S[0] splits	*/
-            			/* note: if S[0] splits into 3 nodes, then items do not need to be cut	*/
-  int s2bytes;
-  struct buffer_head * buf_to_free[MAX_FREE_BLOCK]; /* buffers which are to be freed after do_balance finishes by unfix_nodes */
-  char * vn_buf;		/* kmalloced memory. Used to create
+struct tree_balance {
+	int tb_mode;
+	int need_balance_dirty;
+	struct super_block *tb_sb;
+	struct reiserfs_transaction_handle *transaction_handle;
+	struct path *tb_path;
+	struct buffer_head *L[MAX_HEIGHT];	/* array of left neighbors of nodes in the path */
+	struct buffer_head *R[MAX_HEIGHT];	/* array of right neighbors of nodes in the path */
+	struct buffer_head *FL[MAX_HEIGHT];	/* array of fathers of the left  neighbors      */
+	struct buffer_head *FR[MAX_HEIGHT];	/* array of fathers of the right neighbors      */
+	struct buffer_head *CFL[MAX_HEIGHT];	/* array of common parents of center node and its left neighbor  */
+	struct buffer_head *CFR[MAX_HEIGHT];	/* array of common parents of center node and its right neighbor */
+
+	struct buffer_head *FEB[MAX_FEB_SIZE];	/* array of empty buffers. Number of buffers in array equals
+						   cur_blknum. */
+	struct buffer_head *used[MAX_FEB_SIZE];
+	struct buffer_head *thrown[MAX_FEB_SIZE];
+	int lnum[MAX_HEIGHT];	/* array of number of items which must be
+				   shifted to the left in order to balance the
+				   current node; for leaves includes item that
+				   will be partially shifted; for internal
+				   nodes, it is the number of child pointers
+				   rather than items. It includes the new item
+				   being created. The code sometimes subtracts
+				   one to get the number of wholly shifted
+				   items for other purposes. */
+	int rnum[MAX_HEIGHT];	/* substitute right for left in comment above */
+	int lkey[MAX_HEIGHT];	/* array indexed by height h mapping the key delimiting L[h] and
+				   S[h] to its item number within the node CFL[h] */
+	int rkey[MAX_HEIGHT];	/* substitute r for l in comment above */
+	int insert_size[MAX_HEIGHT];	/* the number of bytes we are trying to add to or remove from
+					   S[h]. A negative value means removing.  */
+	int blknum[MAX_HEIGHT];	/* number of nodes that will replace node S[h] after
+				   balancing on the level h of the tree.  If 0 then S is
+				   being deleted, if 1 then S is remaining and no new nodes
+				   are being created, if 2 or 3 then 1 or 2 new nodes are
+				   being created */
+
+	/* fields that are used only for balancing leaves of the tree */
+	int cur_blknum;		/* number of empty blocks having been already allocated                 */
+	int s0num;		/* number of items that fall into left most  node when S[0] splits     */
+	int s1num;		/* number of items that fall into first  new node when S[0] splits     */
+	int s2num;		/* number of items that fall into second new node when S[0] splits     */
+	int lbytes;		/* number of bytes which can flow to the left neighbor from the leftmost */
+	/* liquid item that cannot be shifted from S[0] entirely */
+	/* if -1 then nothing will be partially shifted */
+	int rbytes;		/* number of bytes which will flow to the right neighbor from the rightmost */
+	/* liquid item that cannot be shifted from S[0] entirely */
+	/* if -1 then nothing will be partially shifted */
+	int s1bytes;		/* number of bytes which flow to the first  new node when S[0] splits   */
+	/* note: if S[0] splits into 3 nodes, then items do not need to be cut  */
+	int s2bytes;
+	struct buffer_head *buf_to_free[MAX_FREE_BLOCK];	/* buffers which are to be freed after do_balance finishes by unfix_nodes */
+	char *vn_buf;		/* kmalloced memory. Used to create
 				   virtual node and keep map of
 				   dirtied bitmap blocks */
-  int vn_buf_size;		/* size of the vn_buf */
-  struct virtual_node * tb_vn;	/* VN starts after bitmap of bitmap blocks */
+	int vn_buf_size;	/* size of the vn_buf */
+	struct virtual_node *tb_vn;	/* VN starts after bitmap of bitmap blocks */
 
-  int fs_gen;                  /* saved value of `reiserfs_generation' counter
-			          see FILESYSTEM_CHANGED() macro in reiserfs_fs.h */
+	int fs_gen;		/* saved value of `reiserfs_generation' counter
+				   see FILESYSTEM_CHANGED() macro in reiserfs_fs.h */
 #ifdef DISPLACE_NEW_PACKING_LOCALITIES
-  struct in_core_key  key;	      /* key pointer, to pass to block allocator or
-				 another low-level subsystem */
+	struct in_core_key key;	/* key pointer, to pass to block allocator or
+				   another low-level subsystem */
 #endif
-} ;
+};
 
 /* These are modes of balancing */
 
@@ -1479,13 +1430,12 @@ struct tree_balance
 /* used in do_balance for passing parent of node information that has
    been gotten from tb struct */
 struct buffer_info {
-    struct tree_balance * tb;
-    struct buffer_head * bi_bh;
-    struct buffer_head * bi_parent;
-    int bi_position;
+	struct tree_balance *tb;
+	struct buffer_head *bi_bh;
+	struct buffer_head *bi_parent;
+	int bi_position;
 };
 
-
 /* there are 4 types of items: stat data, directory item, indirect, direct.
 +-------------------+------------+--------------+------------+
 |	            |  k_offset  | k_uniqueness | mergeable? |
@@ -1503,24 +1453,24 @@ struct buffer_info {
 */
 
 struct item_operations {
-    int (*bytes_number) (struct item_head * ih, int block_size);
-    void (*decrement_key) (struct cpu_key *);
-    int (*is_left_mergeable) (struct reiserfs_key * ih, unsigned long bsize);
-    void (*print_item) (struct item_head *, char * item);
-    void (*check_item) (struct item_head *, char * item);
-
-    int (*create_vi) (struct virtual_node * vn, struct virtual_item * vi, 
-		      int is_affected, int insert_size);
-    int (*check_left) (struct virtual_item * vi, int free, 
-			    int start_skip, int end_skip);
-    int (*check_right) (struct virtual_item * vi, int free);
-    int (*part_size) (struct virtual_item * vi, int from, int to);
-    int (*unit_num) (struct virtual_item * vi);
-    void (*print_vi) (struct virtual_item * vi);
+	int (*bytes_number) (struct item_head * ih, int block_size);
+	void (*decrement_key) (struct cpu_key *);
+	int (*is_left_mergeable) (struct reiserfs_key * ih,
+				  unsigned long bsize);
+	void (*print_item) (struct item_head *, char *item);
+	void (*check_item) (struct item_head *, char *item);
+
+	int (*create_vi) (struct virtual_node * vn, struct virtual_item * vi,
+			  int is_affected, int insert_size);
+	int (*check_left) (struct virtual_item * vi, int free,
+			   int start_skip, int end_skip);
+	int (*check_right) (struct virtual_item * vi, int free);
+	int (*part_size) (struct virtual_item * vi, int from, int to);
+	int (*unit_num) (struct virtual_item * vi);
+	void (*print_vi) (struct virtual_item * vi);
 };
 
-
-extern struct item_operations * item_ops [TYPE_ANY + 1];
+extern struct item_operations *item_ops[TYPE_ANY + 1];
 
 #define op_bytes_number(ih,bsize)                    item_ops[le_ih_k_type (ih)]->bytes_number (ih, bsize)
 #define op_is_left_mergeable(key,bsize)              item_ops[le_key_k_type (le_key_version (key), key)]->is_left_mergeable (key, bsize)
@@ -1533,8 +1483,6 @@ extern struct item_operations * item_ops [TYPE_ANY + 1];
 #define op_unit_num(vi)				     item_ops[(vi)->vi_index]->unit_num (vi)
 #define op_print_vi(vi)                              item_ops[(vi)->vi_index]->print_vi (vi)
 
-
-
 #define COMP_SHORT_KEYS comp_short_keys
 
 /* number of blocks pointed to by the indirect item */
@@ -1545,8 +1493,7 @@ extern struct item_operations * item_ops [TYPE_ANY + 1];
 
 /* number of bytes contained by the direct item or the unformatted nodes the indirect item points to */
 
-
-/* get the item header */ 
+/* get the item header */
 #define B_N_PITEM_HEAD(bh,item_num) ( (struct item_head * )((bh)->b_data + BLKH_SIZE) + (item_num) )
 
 /* get key */
@@ -1577,9 +1524,9 @@ extern struct item_operations * item_ops [TYPE_ANY + 1];
 #define PUT_B_I_POS_UNFM_POINTER(bh,ih,pos, val) do {*(((unp_t *)B_I_PITEM(bh,ih)) + (pos)) = cpu_to_le32(val); } while (0)
 
 struct reiserfs_iget_args {
-    __u32 objectid ;
-    __u32 dirid ;
-} ;
+	__u32 objectid;
+	__u32 dirid;
+};
 
 /***************************************************************************/
 /*                    FUNCTION DECLARATIONS                                */
@@ -1595,11 +1542,11 @@ struct reiserfs_iget_args {
 
 /* first block written in a commit.  */
 struct reiserfs_journal_desc {
-  __le32 j_trans_id ;			/* id of commit */
-  __le32 j_len ;			/* length of commit. len +1 is the commit block */
-  __le32 j_mount_id ;				/* mount id of this trans*/
-  __le32 j_realblock[1] ; /* real locations for each block */
-} ;
+	__le32 j_trans_id;	/* id of commit */
+	__le32 j_len;		/* length of commit. len +1 is the commit block */
+	__le32 j_mount_id;	/* mount id of this trans */
+	__le32 j_realblock[1];	/* real locations for each block */
+};
 
 #define get_desc_trans_id(d)   le32_to_cpu((d)->j_trans_id)
 #define get_desc_trans_len(d)  le32_to_cpu((d)->j_len)
@@ -1611,10 +1558,10 @@ struct reiserfs_journal_desc {
 
 /* last block written in a commit */
 struct reiserfs_journal_commit {
-  __le32 j_trans_id ;			/* must match j_trans_id from the desc block */
-  __le32 j_len ;			/* ditto */
-  __le32 j_realblock[1] ; /* real locations for each block */
-} ;
+	__le32 j_trans_id;	/* must match j_trans_id from the desc block */
+	__le32 j_len;		/* ditto */
+	__le32 j_realblock[1];	/* real locations for each block */
+};
 
 #define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id)
 #define get_commit_trans_len(c)        le32_to_cpu((c)->j_len)
@@ -1628,19 +1575,19 @@ struct reiserfs_journal_commit {
 ** and this transaction does not need to be replayed.
 */
 struct reiserfs_journal_header {
-  __le32 j_last_flush_trans_id ;		/* id of last fully flushed transaction */
-  __le32 j_first_unflushed_offset ;      /* offset in the log of where to start replay after a crash */
-  __le32 j_mount_id ;
-  /* 12 */ struct journal_params jh_journal;
-} ;
+	__le32 j_last_flush_trans_id;	/* id of last fully flushed transaction */
+	__le32 j_first_unflushed_offset;	/* offset in the log of where to start replay after a crash */
+	__le32 j_mount_id;
+	/* 12 */ struct journal_params jh_journal;
+};
 
 /* biggest tunable defines are right here */
-#define JOURNAL_BLOCK_COUNT 8192 /* number of blocks in the journal */
-#define JOURNAL_TRANS_MAX_DEFAULT 1024   /* biggest possible single transaction, don't change for now (8/3/99) */
+#define JOURNAL_BLOCK_COUNT 8192	/* number of blocks in the journal */
+#define JOURNAL_TRANS_MAX_DEFAULT 1024	/* biggest possible single transaction, don't change for now (8/3/99) */
 #define JOURNAL_TRANS_MIN_DEFAULT 256
-#define JOURNAL_MAX_BATCH_DEFAULT   900 /* max blocks to batch into one transaction, don't make this any bigger than 900 */
+#define JOURNAL_MAX_BATCH_DEFAULT   900	/* max blocks to batch into one transaction, don't make this any bigger than 900 */
 #define JOURNAL_MIN_RATIO 2
-#define JOURNAL_MAX_COMMIT_AGE 30 
+#define JOURNAL_MAX_COMMIT_AGE 30
 #define JOURNAL_MAX_TRANS_AGE 30
 #define JOURNAL_PER_BALANCE_CNT (3 * (MAX_HEIGHT-2) + 9)
 #ifdef CONFIG_QUOTA
@@ -1664,10 +1611,10 @@ struct reiserfs_journal_header {
 ** the current number of nodes is > max, the node is freed, otherwise, 
 ** it is put on a free list for faster use later.
 */
-#define REISERFS_MIN_BITMAP_NODES 10 
-#define REISERFS_MAX_BITMAP_NODES 100 
+#define REISERFS_MIN_BITMAP_NODES 10
+#define REISERFS_MAX_BITMAP_NODES 100
 
-#define JBH_HASH_SHIFT 13 /* these are based on journal hash size of 8192 */
+#define JBH_HASH_SHIFT 13	/* these are based on journal hash size of 8192 */
 #define JBH_HASH_MASK 8191
 
 #define _jhashfn(sb,block)	\
@@ -1681,14 +1628,14 @@ struct reiserfs_journal_header {
 #define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 
 enum reiserfs_bh_state_bits {
-    BH_JDirty = BH_PrivateStart, /* buffer is in current transaction */
-    BH_JDirty_wait,
-    BH_JNew,                     /* disk block was taken off free list before
-                                  * being in a finished transaction, or
-                                  * written to disk. Can be reused immed. */
-    BH_JPrepared,
-    BH_JRestore_dirty,
-    BH_JTest, // debugging only will go away
+	BH_JDirty = BH_PrivateStart,	/* buffer is in current transaction */
+	BH_JDirty_wait,
+	BH_JNew,		/* disk block was taken off free list before
+				 * being in a finished transaction, or
+				 * written to disk. Can be reused immed. */
+	BH_JPrepared,
+	BH_JRestore_dirty,
+	BH_JTest,		// debugging only will go away
 };
 
 BUFFER_FNS(JDirty, journaled);
@@ -1708,175 +1655,192 @@ TAS_BUFFER_FNS(JTest, journal_test);
 ** transaction handle which is passed around for all journal calls
 */
 struct reiserfs_transaction_handle {
-  struct super_block *t_super ; /* super for this FS when journal_begin was
-				   called. saves calls to reiserfs_get_super
-				   also used by nested transactions to make
-				   sure they are nesting on the right FS
-				   _must_ be first in the handle
-				*/
-  int t_refcount;
-  int t_blocks_logged ;         /* number of blocks this writer has logged */
-  int t_blocks_allocated ;      /* number of blocks this writer allocated */
-  unsigned long t_trans_id ;    /* sanity check, equals the current trans id */
-  void *t_handle_save ;		/* save existing current->journal_info */
-  unsigned displace_new_blocks:1; /* if new block allocation occurres, that block
-				   should be displaced from others */
-  struct list_head t_list;
-} ;
+	struct super_block *t_super;	/* super for this FS when journal_begin was
+					   called. saves calls to reiserfs_get_super
+					   also used by nested transactions to make
+					   sure they are nesting on the right FS
+					   _must_ be first in the handle
+					 */
+	int t_refcount;
+	int t_blocks_logged;	/* number of blocks this writer has logged */
+	int t_blocks_allocated;	/* number of blocks this writer allocated */
+	unsigned long t_trans_id;	/* sanity check, equals the current trans id */
+	void *t_handle_save;	/* save existing current->journal_info */
+	unsigned displace_new_blocks:1;	/* if new block allocation occurs, that block
+					   should be displaced from others */
+	struct list_head t_list;
+};
 
 /* used to keep track of ordered and tail writes, attached to the buffer
  * head through b_journal_head.
  */
 struct reiserfs_jh {
-    struct reiserfs_journal_list *jl;
-    struct buffer_head *bh;
-    struct list_head list;
+	struct reiserfs_journal_list *jl;
+	struct buffer_head *bh;
+	struct list_head list;
 };
 
 void reiserfs_free_jh(struct buffer_head *bh);
 int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh);
 int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh);
-int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
-
-static inline int
-reiserfs_file_data_log(struct inode *inode) {
-    if (reiserfs_data_log(inode->i_sb) ||
-       (REISERFS_I(inode)->i_flags & i_data_log))
-        return 1 ;
-    return 0 ;
+int journal_mark_dirty(struct reiserfs_transaction_handle *,
+		       struct super_block *, struct buffer_head *bh);
+
+static inline int reiserfs_file_data_log(struct inode *inode)
+{
+	if (reiserfs_data_log(inode->i_sb) ||
+	    (REISERFS_I(inode)->i_flags & i_data_log))
+		return 1;
+	return 0;
 }
 
-static inline int reiserfs_transaction_running(struct super_block *s) {
-    struct reiserfs_transaction_handle *th = current->journal_info ;
-    if (th && th->t_super == s)
-        return 1 ;
-    if (th && th->t_super == NULL)
-        BUG();
-    return 0 ;
+static inline int reiserfs_transaction_running(struct super_block *s)
+{
+	struct reiserfs_transaction_handle *th = current->journal_info;
+	if (th && th->t_super == s)
+		return 1;
+	if (th && th->t_super == NULL)
+		BUG();
+	return 0;
 }
 
 int reiserfs_async_progress_wait(struct super_block *s);
 
-struct reiserfs_transaction_handle *
-reiserfs_persistent_transaction(struct super_block *, int count);
+struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
+								    super_block
+								    *,
+								    int count);
 int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
 int reiserfs_commit_page(struct inode *inode, struct page *page,
-		unsigned from, unsigned to);
+			 unsigned from, unsigned to);
 int reiserfs_flush_old_commits(struct super_block *);
-int reiserfs_commit_for_inode(struct inode *) ;
-int  reiserfs_inode_needs_commit(struct inode *) ;
-void reiserfs_update_inode_transaction(struct inode *) ;
-void reiserfs_wait_on_write_block(struct super_block *s) ;
-void reiserfs_block_writes(struct reiserfs_transaction_handle *th) ;
-void reiserfs_allow_writes(struct super_block *s) ;
-void reiserfs_check_lock_depth(struct super_block *s, char *caller) ;
-int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, int wait) ;
-void reiserfs_restore_prepared_buffer(struct super_block *, struct buffer_head *bh) ;
-int journal_init(struct super_block *, const char * j_dev_name, int old_format, unsigned int) ;
-int journal_release(struct reiserfs_transaction_handle*, struct super_block *) ;
-int journal_release_error(struct reiserfs_transaction_handle*, struct super_block *) ;
-int journal_end(struct reiserfs_transaction_handle *, struct super_block *, unsigned long) ;
-int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *, unsigned long) ;
-int journal_mark_freed(struct reiserfs_transaction_handle *, struct super_block *, b_blocknr_t blocknr) ;
-int journal_transaction_should_end(struct reiserfs_transaction_handle *, int) ;
-int reiserfs_in_journal(struct super_block *p_s_sb, int bmap_nr, int bit_nr, int searchall, b_blocknr_t *next) ;
-int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long) ;
-int journal_join_abort(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long) ;
-void reiserfs_journal_abort (struct super_block *sb, int errno);
-void reiserfs_abort (struct super_block *sb, int errno, const char *fmt, ...);
-int reiserfs_allocate_list_bitmaps(struct super_block *s, struct reiserfs_list_bitmap *, int) ;
-
-void add_save_link (struct reiserfs_transaction_handle * th,
-					struct inode * inode, int truncate);
-int remove_save_link (struct inode * inode, int truncate);
+int reiserfs_commit_for_inode(struct inode *);
+int reiserfs_inode_needs_commit(struct inode *);
+void reiserfs_update_inode_transaction(struct inode *);
+void reiserfs_wait_on_write_block(struct super_block *s);
+void reiserfs_block_writes(struct reiserfs_transaction_handle *th);
+void reiserfs_allow_writes(struct super_block *s);
+void reiserfs_check_lock_depth(struct super_block *s, char *caller);
+int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh,
+				 int wait);
+void reiserfs_restore_prepared_buffer(struct super_block *,
+				      struct buffer_head *bh);
+int journal_init(struct super_block *, const char *j_dev_name, int old_format,
+		 unsigned int);
+int journal_release(struct reiserfs_transaction_handle *, struct super_block *);
+int journal_release_error(struct reiserfs_transaction_handle *,
+			  struct super_block *);
+int journal_end(struct reiserfs_transaction_handle *, struct super_block *,
+		unsigned long);
+int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *,
+		     unsigned long);
+int journal_mark_freed(struct reiserfs_transaction_handle *,
+		       struct super_block *, b_blocknr_t blocknr);
+int journal_transaction_should_end(struct reiserfs_transaction_handle *, int);
+int reiserfs_in_journal(struct super_block *p_s_sb, int bmap_nr, int bit_nr,
+			int searchall, b_blocknr_t * next);
+int journal_begin(struct reiserfs_transaction_handle *,
+		  struct super_block *p_s_sb, unsigned long);
+int journal_join_abort(struct reiserfs_transaction_handle *,
+		       struct super_block *p_s_sb, unsigned long);
+void reiserfs_journal_abort(struct super_block *sb, int errno);
+void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...);
+int reiserfs_allocate_list_bitmaps(struct super_block *s,
+				   struct reiserfs_list_bitmap *, int);
+
+void add_save_link(struct reiserfs_transaction_handle *th,
+		   struct inode *inode, int truncate);
+int remove_save_link(struct inode *inode, int truncate);
 
 /* objectid.c */
-__u32 reiserfs_get_unused_objectid (struct reiserfs_transaction_handle *th);
-void reiserfs_release_objectid (struct reiserfs_transaction_handle *th, __u32 objectid_to_release);
-int reiserfs_convert_objectid_map_v1(struct super_block *) ;
+__u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th);
+void reiserfs_release_objectid(struct reiserfs_transaction_handle *th,
+			       __u32 objectid_to_release);
+int reiserfs_convert_objectid_map_v1(struct super_block *);
 
 /* stree.c */
 int B_IS_IN_TREE(const struct buffer_head *);
-extern void copy_item_head(struct item_head * p_v_to,
-								  const struct item_head * p_v_from);
+extern void copy_item_head(struct item_head *p_v_to,
+			   const struct item_head *p_v_from);
 
 // first key is in cpu form, second - le
-extern int  comp_short_keys (const struct reiserfs_key * le_key,
-				    const struct cpu_key * cpu_key);
-extern void le_key2cpu_key (struct cpu_key * to, const struct reiserfs_key * from);
+extern int comp_short_keys(const struct reiserfs_key *le_key,
+			   const struct cpu_key *cpu_key);
+extern void le_key2cpu_key(struct cpu_key *to, const struct reiserfs_key *from);
 
 // both are in le form
-extern int comp_le_keys (const struct reiserfs_key *, const struct reiserfs_key *);
-extern int comp_short_le_keys (const struct reiserfs_key *, const struct reiserfs_key *);
+extern int comp_le_keys(const struct reiserfs_key *,
+			const struct reiserfs_key *);
+extern int comp_short_le_keys(const struct reiserfs_key *,
+			      const struct reiserfs_key *);
 
 //
 // get key version from on disk key - kludge
 //
-static inline int le_key_version (const struct reiserfs_key * key)
+static inline int le_key_version(const struct reiserfs_key *key)
 {
-    int type;
-    
-    type = offset_v2_k_type( &(key->u.k_offset_v2));
-    if (type != TYPE_DIRECT && type != TYPE_INDIRECT && type != TYPE_DIRENTRY)
-	return KEY_FORMAT_3_5;
-
-    return KEY_FORMAT_3_6;
-	
-}
+	int type;
 
+	type = offset_v2_k_type(&(key->u.k_offset_v2));
+	if (type != TYPE_DIRECT && type != TYPE_INDIRECT
+	    && type != TYPE_DIRENTRY)
+		return KEY_FORMAT_3_5;
+
+	return KEY_FORMAT_3_6;
 
-static inline void copy_key (struct reiserfs_key *to, const struct reiserfs_key *from)
-{
-    memcpy (to, from, KEY_SIZE);
 }
 
+static inline void copy_key(struct reiserfs_key *to,
+			    const struct reiserfs_key *from)
+{
+	memcpy(to, from, KEY_SIZE);
+}
 
-int comp_items (const struct item_head * stored_ih, const struct path * p_s_path);
-const struct reiserfs_key * get_rkey (const struct path * p_s_chk_path,
-							 const struct super_block  * p_s_sb);
-int search_by_key (struct super_block *, const struct cpu_key *, 
-				   struct path *, int);
+int comp_items(const struct item_head *stored_ih, const struct path *p_s_path);
+const struct reiserfs_key *get_rkey(const struct path *p_s_chk_path,
+				    const struct super_block *p_s_sb);
+int search_by_key(struct super_block *, const struct cpu_key *,
+		  struct path *, int);
 #define search_item(s,key,path) search_by_key (s, key, path, DISK_LEAF_NODE_LEVEL)
-int search_for_position_by_key (struct super_block * p_s_sb, 
-								const struct cpu_key * p_s_cpu_key, 
-								struct path * p_s_search_path);
-extern void decrement_bcount (struct buffer_head * p_s_bh);
-void decrement_counters_in_path (struct path * p_s_search_path);
-void pathrelse (struct path * p_s_search_path);
-int reiserfs_check_path(struct path *p) ;
-void pathrelse_and_restore (struct super_block *s, struct path * p_s_search_path);
-
-int reiserfs_insert_item (struct reiserfs_transaction_handle *th, 
-			  struct path * path, 
-			  const struct cpu_key * key,
-			  struct item_head * ih,
-			  struct inode *inode, const char * body);
-
-int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th,
-			      struct path * path,
-			      const struct cpu_key * key,
-			      struct inode *inode,
-			      const char * body, int paste_size);
-
-int reiserfs_cut_from_item (struct reiserfs_transaction_handle *th,
-			    struct path * path,
-			    struct cpu_key * key,
-			    struct inode * inode,
-			    struct page *page,
-			    loff_t new_file_size);
-
-int reiserfs_delete_item (struct reiserfs_transaction_handle *th,
-			  struct path * path, 
-			  const struct cpu_key * key,
-			  struct inode * inode, 
-			  struct buffer_head  * p_s_un_bh);
-
-void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th,
-			struct inode *inode, struct reiserfs_key * key);
-int reiserfs_delete_object (struct reiserfs_transaction_handle *th, struct inode * p_s_inode);
-int reiserfs_do_truncate (struct reiserfs_transaction_handle *th,
-			   struct  inode * p_s_inode, struct page *, 
-			   int update_timestamps);
+int search_for_position_by_key(struct super_block *p_s_sb,
+			       const struct cpu_key *p_s_cpu_key,
+			       struct path *p_s_search_path);
+extern void decrement_bcount(struct buffer_head *p_s_bh);
+void decrement_counters_in_path(struct path *p_s_search_path);
+void pathrelse(struct path *p_s_search_path);
+int reiserfs_check_path(struct path *p);
+void pathrelse_and_restore(struct super_block *s, struct path *p_s_search_path);
+
+int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
+			 struct path *path,
+			 const struct cpu_key *key,
+			 struct item_head *ih,
+			 struct inode *inode, const char *body);
+
+int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th,
+			     struct path *path,
+			     const struct cpu_key *key,
+			     struct inode *inode,
+			     const char *body, int paste_size);
+
+int reiserfs_cut_from_item(struct reiserfs_transaction_handle *th,
+			   struct path *path,
+			   struct cpu_key *key,
+			   struct inode *inode,
+			   struct page *page, loff_t new_file_size);
+
+int reiserfs_delete_item(struct reiserfs_transaction_handle *th,
+			 struct path *path,
+			 const struct cpu_key *key,
+			 struct inode *inode, struct buffer_head *p_s_un_bh);
+
+void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
+				struct inode *inode, struct reiserfs_key *key);
+int reiserfs_delete_object(struct reiserfs_transaction_handle *th,
+			   struct inode *p_s_inode);
+int reiserfs_do_truncate(struct reiserfs_transaction_handle *th,
+			 struct inode *p_s_inode, struct page *,
+			 int update_timestamps);
 
 #define i_block_size(inode) ((inode)->i_sb->s_blocksize)
 #define file_size(inode) ((inode)->i_size)
@@ -1885,66 +1849,67 @@ int reiserfs_do_truncate (struct reiserfs_transaction_handle *th,
 #define tail_has_to_be_packed(inode) (have_large_tails ((inode)->i_sb)?\
 !STORE_TAIL_IN_UNFM_S1(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):have_small_tails ((inode)->i_sb)?!STORE_TAIL_IN_UNFM_S2(file_size (inode), tail_size(inode), inode->i_sb->s_blocksize):0 )
 
-void padd_item (char * item, int total_length, int length);
+void padd_item(char *item, int total_length, int length);
 
 /* inode.c */
 /* args for the create parameter of reiserfs_get_block */
-#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */
-#define GET_BLOCK_CREATE 1    /* add anything you need to find block */
-#define GET_BLOCK_NO_HOLE 2   /* return -ENOENT for file holes */
-#define GET_BLOCK_READ_DIRECT 4  /* read the tail if indirect item not found */
-#define GET_BLOCK_NO_ISEM     8 /* i_sem is not held, don't preallocate */
-#define GET_BLOCK_NO_DANGLE   16 /* don't leave any transactions running */
-
-int restart_transaction(struct reiserfs_transaction_handle *th, struct inode *inode, struct path *path);
-void reiserfs_read_locked_inode(struct inode * inode, struct reiserfs_iget_args *args) ;
-int reiserfs_find_actor(struct inode * inode, void *p) ;
-int reiserfs_init_locked_inode(struct inode * inode, void *p) ;
-void reiserfs_delete_inode (struct inode * inode);
-int reiserfs_write_inode (struct inode * inode, int) ;
-int reiserfs_get_block (struct inode * inode, sector_t block, struct buffer_head * bh_result, int create);
-struct dentry *reiserfs_get_dentry(struct super_block *, void *) ;
-struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 *data,
-                                     int len, int fhtype,
-				  int (*acceptable)(void *contect, struct dentry *de),
-				  void *context) ;
-int reiserfs_encode_fh( struct dentry *dentry, __u32 *data, int *lenp, 
-						int connectable );
-
-int reiserfs_truncate_file(struct inode *, int update_timestamps) ;
-void make_cpu_key (struct cpu_key * cpu_key, struct inode * inode, loff_t offset,
-		   int type, int key_length);
-void make_le_item_head (struct item_head * ih, const struct cpu_key * key, 
-			int version,
-			loff_t offset, int type, int length, int entry_count);
-struct inode * reiserfs_iget (struct super_block * s, 
-			      const struct cpu_key * key);
-
-
-int reiserfs_new_inode (struct reiserfs_transaction_handle *th, 
-				   struct inode * dir, int mode, 
-				   const char * symname, loff_t i_size,
-				   struct dentry *dentry, struct inode *inode);
-
-void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th,
-                              struct inode * inode, loff_t size);
+#define GET_BLOCK_NO_CREATE 0	/* don't create new blocks or convert tails */
+#define GET_BLOCK_CREATE 1	/* add anything you need to find block */
+#define GET_BLOCK_NO_HOLE 2	/* return -ENOENT for file holes */
+#define GET_BLOCK_READ_DIRECT 4	/* read the tail if indirect item not found */
+#define GET_BLOCK_NO_ISEM     8	/* i_sem is not held, don't preallocate */
+#define GET_BLOCK_NO_DANGLE   16	/* don't leave any transactions running */
+
+int restart_transaction(struct reiserfs_transaction_handle *th,
+			struct inode *inode, struct path *path);
+void reiserfs_read_locked_inode(struct inode *inode,
+				struct reiserfs_iget_args *args);
+int reiserfs_find_actor(struct inode *inode, void *p);
+int reiserfs_init_locked_inode(struct inode *inode, void *p);
+void reiserfs_delete_inode(struct inode *inode);
+int reiserfs_write_inode(struct inode *inode, int);
+int reiserfs_get_block(struct inode *inode, sector_t block,
+		       struct buffer_head *bh_result, int create);
+struct dentry *reiserfs_get_dentry(struct super_block *, void *);
+struct dentry *reiserfs_decode_fh(struct super_block *sb, __u32 * data,
+				  int len, int fhtype,
+				  int (*acceptable) (void *contect,
+						     struct dentry * de),
+				  void *context);
+int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
+		       int connectable);
+
+int reiserfs_truncate_file(struct inode *, int update_timestamps);
+void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset,
+		  int type, int key_length);
+void make_le_item_head(struct item_head *ih, const struct cpu_key *key,
+		       int version,
+		       loff_t offset, int type, int length, int entry_count);
+struct inode *reiserfs_iget(struct super_block *s, const struct cpu_key *key);
+
+int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
+		       struct inode *dir, int mode,
+		       const char *symname, loff_t i_size,
+		       struct dentry *dentry, struct inode *inode);
+
+void reiserfs_update_sd_size(struct reiserfs_transaction_handle *th,
+			     struct inode *inode, loff_t size);
 
 static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
-                                      struct inode *inode)
+				      struct inode *inode)
 {
-    reiserfs_update_sd_size(th, inode, inode->i_size) ;
+	reiserfs_update_sd_size(th, inode, inode->i_size);
 }
 
-void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode );
-void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs );
+void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode);
+void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs);
 int reiserfs_setattr(struct dentry *dentry, struct iattr *attr);
 
 /* namei.c */
-void set_de_name_and_namelen (struct reiserfs_dir_entry * de);
-int search_by_entry_key (struct super_block * sb, const struct cpu_key * key, 
-			 struct path * path, 
-			 struct reiserfs_dir_entry * de);
-struct dentry *reiserfs_get_parent(struct dentry *) ;
+void set_de_name_and_namelen(struct reiserfs_dir_entry *de);
+int search_by_entry_key(struct super_block *sb, const struct cpu_key *key,
+			struct path *path, struct reiserfs_dir_entry *de);
+struct dentry *reiserfs_get_parent(struct dentry *);
 /* procfs.c */
 
 #if defined( CONFIG_PROC_FS ) && defined( CONFIG_REISERFS_PROC_INFO )
@@ -1953,15 +1918,15 @@ struct dentry *reiserfs_get_parent(struct dentry *) ;
 #undef REISERFS_PROC_INFO
 #endif
 
-int reiserfs_proc_info_init( struct super_block *sb );
-int reiserfs_proc_info_done( struct super_block *sb );
-struct proc_dir_entry *reiserfs_proc_register_global( char *name, 
-													  read_proc_t *func );
-void reiserfs_proc_unregister_global( const char *name );
-int reiserfs_proc_info_global_init( void );
-int reiserfs_proc_info_global_done( void );
-int reiserfs_global_version_in_proc( char *buffer, char **start, off_t offset,
-									 int count, int *eof, void *data );
+int reiserfs_proc_info_init(struct super_block *sb);
+int reiserfs_proc_info_done(struct super_block *sb);
+struct proc_dir_entry *reiserfs_proc_register_global(char *name,
+						     read_proc_t * func);
+void reiserfs_proc_unregister_global(const char *name);
+int reiserfs_proc_info_global_init(void);
+int reiserfs_proc_info_global_done(void);
+int reiserfs_global_version_in_proc(char *buffer, char **start, off_t offset,
+				    int count, int *eof, void *data);
 
 #if defined( REISERFS_PROC_INFO )
 
@@ -1993,123 +1958,132 @@ extern struct inode_operations reiserfs_special_inode_operations;
 extern struct file_operations reiserfs_dir_operations;
 
 /* tail_conversion.c */
-int direct2indirect (struct reiserfs_transaction_handle *, struct inode *, struct path *, struct buffer_head *, loff_t);
-int indirect2direct (struct reiserfs_transaction_handle *, struct inode *, struct page *, struct path *, const struct cpu_key *, loff_t, char *);
-void reiserfs_unmap_buffer(struct buffer_head *) ;
-
+int direct2indirect(struct reiserfs_transaction_handle *, struct inode *,
+		    struct path *, struct buffer_head *, loff_t);
+int indirect2direct(struct reiserfs_transaction_handle *, struct inode *,
+		    struct page *, struct path *, const struct cpu_key *,
+		    loff_t, char *);
+void reiserfs_unmap_buffer(struct buffer_head *);
 
 /* file.c */
 extern struct inode_operations reiserfs_file_inode_operations;
 extern struct file_operations reiserfs_file_operations;
-extern struct address_space_operations reiserfs_address_space_operations ;
+extern struct address_space_operations reiserfs_address_space_operations;
 
 /* fix_nodes.c */
 #ifdef CONFIG_REISERFS_CHECK
-void * reiserfs_kmalloc (size_t size, int flags, struct super_block * s);
-void reiserfs_kfree (const void * vp, size_t size, struct super_block * s);
+void *reiserfs_kmalloc(size_t size, int flags, struct super_block *s);
+void reiserfs_kfree(const void *vp, size_t size, struct super_block *s);
 #else
 static inline void *reiserfs_kmalloc(size_t size, int flags,
-					struct super_block *s)
+				     struct super_block *s)
 {
 	return kmalloc(size, flags);
 }
 
 static inline void reiserfs_kfree(const void *vp, size_t size,
-					struct super_block *s)
+				  struct super_block *s)
 {
 	kfree(vp);
 }
 #endif
 
-int fix_nodes (int n_op_mode, struct tree_balance * p_s_tb, 
-	       struct item_head * p_s_ins_ih, const void *);
-void unfix_nodes (struct tree_balance *);
-
+int fix_nodes(int n_op_mode, struct tree_balance *p_s_tb,
+	      struct item_head *p_s_ins_ih, const void *);
+void unfix_nodes(struct tree_balance *);
 
 /* prints.c */
-void reiserfs_panic (struct super_block * s, const char * fmt, ...) __attribute__ ( ( noreturn ) );
-void reiserfs_info (struct super_block *s, const char * fmt, ...);
-void reiserfs_debug (struct super_block *s, int level, const char * fmt, ...);
-void print_indirect_item (struct buffer_head * bh, int item_num);
-void store_print_tb (struct tree_balance * tb);
-void print_cur_tb (char * mes);
-void print_de (struct reiserfs_dir_entry * de);
-void print_bi (struct buffer_info * bi, char * mes);
-#define PRINT_LEAF_ITEMS 1   /* print all items */
-#define PRINT_DIRECTORY_ITEMS 2 /* print directory items */
-#define PRINT_DIRECT_ITEMS 4 /* print contents of direct items */
-void print_block (struct buffer_head * bh, ...);
-void print_bmap (struct super_block * s, int silent);
-void print_bmap_block (int i, char * data, int size, int silent);
+void reiserfs_panic(struct super_block *s, const char *fmt, ...)
+    __attribute__ ((noreturn));
+void reiserfs_info(struct super_block *s, const char *fmt, ...);
+void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...);
+void print_indirect_item(struct buffer_head *bh, int item_num);
+void store_print_tb(struct tree_balance *tb);
+void print_cur_tb(char *mes);
+void print_de(struct reiserfs_dir_entry *de);
+void print_bi(struct buffer_info *bi, char *mes);
+#define PRINT_LEAF_ITEMS 1	/* print all items */
+#define PRINT_DIRECTORY_ITEMS 2	/* print directory items */
+#define PRINT_DIRECT_ITEMS 4	/* print contents of direct items */
+void print_block(struct buffer_head *bh, ...);
+void print_bmap(struct super_block *s, int silent);
+void print_bmap_block(int i, char *data, int size, int silent);
 /*void print_super_block (struct super_block * s, char * mes);*/
-void print_objectid_map (struct super_block * s);
-void print_block_head (struct buffer_head * bh, char * mes);
-void check_leaf (struct buffer_head * bh);
-void check_internal (struct buffer_head * bh);
-void print_statistics (struct super_block * s);
-char * reiserfs_hashname(int code);
+void print_objectid_map(struct super_block *s);
+void print_block_head(struct buffer_head *bh, char *mes);
+void check_leaf(struct buffer_head *bh);
+void check_internal(struct buffer_head *bh);
+void print_statistics(struct super_block *s);
+char *reiserfs_hashname(int code);
 
 /* lbalance.c */
-int leaf_move_items (int shift_mode, struct tree_balance * tb, int mov_num, int mov_bytes, struct buffer_head * Snew);
-int leaf_shift_left (struct tree_balance * tb, int shift_num, int shift_bytes);
-int leaf_shift_right (struct tree_balance * tb, int shift_num, int shift_bytes);
-void leaf_delete_items (struct buffer_info * cur_bi, int last_first, int first, int del_num, int del_bytes);
-void leaf_insert_into_buf (struct buffer_info * bi, int before, 
-                           struct item_head * inserted_item_ih, const char * inserted_item_body, int zeros_number);
-void leaf_paste_in_buffer (struct buffer_info * bi, int pasted_item_num, 
-                           int pos_in_item, int paste_size, const char * body, int zeros_number);
-void leaf_cut_from_buffer (struct buffer_info * bi, int cut_item_num, int pos_in_item, 
-                           int cut_size);
-void leaf_paste_entries (struct buffer_head * bh, int item_num, int before, 
-                         int new_entry_count, struct reiserfs_de_head * new_dehs, const char * records, int paste_size);
+int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num,
+		    int mov_bytes, struct buffer_head *Snew);
+int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes);
+int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes);
+void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first,
+		       int del_num, int del_bytes);
+void leaf_insert_into_buf(struct buffer_info *bi, int before,
+			  struct item_head *inserted_item_ih,
+			  const char *inserted_item_body, int zeros_number);
+void leaf_paste_in_buffer(struct buffer_info *bi, int pasted_item_num,
+			  int pos_in_item, int paste_size, const char *body,
+			  int zeros_number);
+void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num,
+			  int pos_in_item, int cut_size);
+void leaf_paste_entries(struct buffer_head *bh, int item_num, int before,
+			int new_entry_count, struct reiserfs_de_head *new_dehs,
+			const char *records, int paste_size);
 /* ibalance.c */
-int balance_internal (struct tree_balance * , int, int, struct item_head * , 
-                      struct buffer_head **);
+int balance_internal(struct tree_balance *, int, int, struct item_head *,
+		     struct buffer_head **);
 
 /* do_balance.c */
-void do_balance_mark_leaf_dirty (struct tree_balance * tb,
-					struct buffer_head * bh, int flag);
+void do_balance_mark_leaf_dirty(struct tree_balance *tb,
+				struct buffer_head *bh, int flag);
 #define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
 #define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty
 
-void do_balance (struct tree_balance * tb, struct item_head * ih, 
-                 const char * body, int flag);
-void reiserfs_invalidate_buffer (struct tree_balance * tb, struct buffer_head * bh);
+void do_balance(struct tree_balance *tb, struct item_head *ih,
+		const char *body, int flag);
+void reiserfs_invalidate_buffer(struct tree_balance *tb,
+				struct buffer_head *bh);
 
-int get_left_neighbor_position (struct tree_balance * tb, int h);
-int get_right_neighbor_position (struct tree_balance * tb, int h);
-void replace_key (struct tree_balance * tb, struct buffer_head *, int, struct buffer_head *, int);
-void make_empty_node (struct buffer_info *);
-struct buffer_head * get_FEB (struct tree_balance *);
+int get_left_neighbor_position(struct tree_balance *tb, int h);
+int get_right_neighbor_position(struct tree_balance *tb, int h);
+void replace_key(struct tree_balance *tb, struct buffer_head *, int,
+		 struct buffer_head *, int);
+void make_empty_node(struct buffer_info *);
+struct buffer_head *get_FEB(struct tree_balance *);
 
 /* bitmap.c */
 
 /* structure contains hints for block allocator, and it is a container for
  * arguments, such as node, search path, transaction_handle, etc. */
- struct __reiserfs_blocknr_hint {
-     struct inode * inode;		/* inode passed to allocator, if we allocate unf. nodes */
-     long block;			/* file offset, in blocks */
-     struct in_core_key key;
-     struct path * path;		/* search path, used by allocator to deternine search_start by
-					 * various ways */
-     struct reiserfs_transaction_handle * th; /* transaction handle is needed to log super blocks and
-					       * bitmap blocks changes  */
-     b_blocknr_t beg, end;
-     b_blocknr_t search_start;		/* a field used to transfer search start value (block number)
+struct __reiserfs_blocknr_hint {
+	struct inode *inode;	/* inode passed to allocator, if we allocate unf. nodes */
+	long block;		/* file offset, in blocks */
+	struct in_core_key key;
+	struct path *path;	/* search path, used by allocator to determine search_start by
+				 * various ways */
+	struct reiserfs_transaction_handle *th;	/* transaction handle is needed to log super blocks and
+						 * bitmap blocks changes  */
+	b_blocknr_t beg, end;
+	b_blocknr_t search_start;	/* a field used to transfer search start value (block number)
 					 * between different block allocator procedures
 					 * (determine_search_start() and others) */
-    int prealloc_size;			/* is set in determine_prealloc_size() function, used by underlayed
-					 * function that do actual allocation */
+	int prealloc_size;	/* is set in determine_prealloc_size() function, used by underlying
+				 * functions that do the actual allocation */
 
-    unsigned formatted_node:1;		/* the allocator uses different polices for getting disk space for
+	unsigned formatted_node:1;	/* the allocator uses different policies for getting disk space for
 					 * formatted/unformatted blocks with/without preallocation */
-    unsigned preallocate:1;
+	unsigned preallocate:1;
 };
 
 typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t;
 
-int reiserfs_parse_alloc_options (struct super_block *, char *);
-void reiserfs_init_alloc_options (struct super_block *s);
+int reiserfs_parse_alloc_options(struct super_block *, char *);
+void reiserfs_init_alloc_options(struct super_block *s);
 
 /*
  * given a directory, this will tell you what packing locality
@@ -2118,68 +2092,72 @@ void reiserfs_init_alloc_options (struct super_block *s);
  */
 __le32 reiserfs_choose_packing(struct inode *dir);
 
-int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value);
-void reiserfs_free_block (struct reiserfs_transaction_handle *th, struct inode *, b_blocknr_t, int for_unformatted);
-int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t * , int, int);
-extern inline int reiserfs_new_form_blocknrs (struct tree_balance * tb,
-					      b_blocknr_t *new_blocknrs, int amount_needed)
+int is_reusable(struct super_block *s, b_blocknr_t block, int bit_value);
+void reiserfs_free_block(struct reiserfs_transaction_handle *th, struct inode *,
+			 b_blocknr_t, int for_unformatted);
+int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t *, int,
+			       int);
+extern inline int reiserfs_new_form_blocknrs(struct tree_balance *tb,
+					     b_blocknr_t * new_blocknrs,
+					     int amount_needed)
 {
-    reiserfs_blocknr_hint_t hint = {
-	.th = tb->transaction_handle,
-	.path = tb->tb_path,
-	.inode = NULL,
-	.key = tb->key,
-	.block = 0,
-	.formatted_node = 1
-    };
-    return reiserfs_allocate_blocknrs(&hint, new_blocknrs, amount_needed, 0);
+	reiserfs_blocknr_hint_t hint = {
+		.th = tb->transaction_handle,
+		.path = tb->tb_path,
+		.inode = NULL,
+		.key = tb->key,
+		.block = 0,
+		.formatted_node = 1
+	};
+	return reiserfs_allocate_blocknrs(&hint, new_blocknrs, amount_needed,
+					  0);
 }
 
-extern inline int reiserfs_new_unf_blocknrs (struct reiserfs_transaction_handle *th,
-					     struct inode *inode,
-					     b_blocknr_t *new_blocknrs,
-					     struct path * path, long block)
+extern inline int reiserfs_new_unf_blocknrs(struct reiserfs_transaction_handle
+					    *th, struct inode *inode,
+					    b_blocknr_t * new_blocknrs,
+					    struct path *path, long block)
 {
-    reiserfs_blocknr_hint_t hint = {
-	.th = th,
-	.path = path,
-	.inode = inode,
-	.block = block,
-	.formatted_node = 0,
-	.preallocate = 0
-    };
-    return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
+	reiserfs_blocknr_hint_t hint = {
+		.th = th,
+		.path = path,
+		.inode = inode,
+		.block = block,
+		.formatted_node = 0,
+		.preallocate = 0
+	};
+	return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
 }
 
 #ifdef REISERFS_PREALLOCATE
-extern inline int reiserfs_new_unf_blocknrs2(struct reiserfs_transaction_handle *th,
-					     struct inode * inode,
-					     b_blocknr_t *new_blocknrs,
-					     struct path * path, long block)
+extern inline int reiserfs_new_unf_blocknrs2(struct reiserfs_transaction_handle
+					     *th, struct inode *inode,
+					     b_blocknr_t * new_blocknrs,
+					     struct path *path, long block)
 {
-    reiserfs_blocknr_hint_t hint = {
-	.th = th,
-	.path = path,
-	.inode = inode,
-	.block = block,
-	.formatted_node = 0,
-	.preallocate = 1
-    };
-    return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
+	reiserfs_blocknr_hint_t hint = {
+		.th = th,
+		.path = path,
+		.inode = inode,
+		.block = block,
+		.formatted_node = 0,
+		.preallocate = 1
+	};
+	return reiserfs_allocate_blocknrs(&hint, new_blocknrs, 1, 0);
 }
 
-void reiserfs_discard_prealloc (struct reiserfs_transaction_handle *th, 
-				struct inode * inode);
-void reiserfs_discard_all_prealloc (struct reiserfs_transaction_handle *th);
+void reiserfs_discard_prealloc(struct reiserfs_transaction_handle *th,
+			       struct inode *inode);
+void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th);
 #endif
-void reiserfs_claim_blocks_to_be_allocated( struct super_block *sb, int blocks);
-void reiserfs_release_claimed_blocks( struct super_block *sb, int blocks);
+void reiserfs_claim_blocks_to_be_allocated(struct super_block *sb, int blocks);
+void reiserfs_release_claimed_blocks(struct super_block *sb, int blocks);
 int reiserfs_can_fit_pages(struct super_block *sb);
 
 /* hashes.c */
-__u32 keyed_hash (const signed char *msg, int len);
-__u32 yura_hash (const signed char *msg, int len);
-__u32 r5_hash (const signed char *msg, int len);
+__u32 keyed_hash(const signed char *msg, int len);
+__u32 yura_hash(const signed char *msg, int len);
+__u32 r5_hash(const signed char *msg, int len);
 
 /* the ext2 bit routines adjust for big or little endian as
 ** appropriate for the arch, so in our laziness we use them rather
@@ -2199,11 +2177,10 @@ __u32 r5_hash (const signed char *msg, int len);
    absolutely safe */
 #define SPARE_SPACE 500
 
-
 /* prototypes from ioctl.c */
-int reiserfs_ioctl (struct inode * inode, struct file * filp, 
- 		    unsigned int cmd, unsigned long arg);
- 
+int reiserfs_ioctl(struct inode *inode, struct file *filp,
+		   unsigned int cmd, unsigned long arg);
+
 /* ioctl's command */
 #define REISERFS_IOC_UNPACK		_IOW(0xCD,1,long)
 /* define following flags to be the same as in ext2, so that chattr(1),
@@ -2218,10 +2195,8 @@ int reiserfs_ioctl (struct inode * inode, struct file * filp,
    would evolve into real per-fs locks */
 #define reiserfs_write_lock( sb ) lock_kernel()
 #define reiserfs_write_unlock( sb ) unlock_kernel()
- 			         
+
 /* xattr stuff */
 #define REISERFS_XATTR_DIR_SEM(s) (REISERFS_SB(s)->xattr_dir_sem)
 
-#endif /* _LINUX_REISER_FS_H */
-
-
+#endif				/* _LINUX_REISER_FS_H */
diff --git a/include/linux/reiserfs_fs_i.h b/include/linux/reiserfs_fs_i.h
index e321eb050d65..149be8d9a0c9 100644
--- a/include/linux/reiserfs_fs_i.h
+++ b/include/linux/reiserfs_fs_i.h
@@ -10,54 +10,53 @@ typedef enum {
     /** this says what format of key do all items (but stat data) of
       an object have.  If this is set, that format is 3.6 otherwise
       - 3.5 */
-    i_item_key_version_mask    =  0x0001,
+	i_item_key_version_mask = 0x0001,
     /** If this is unset, object has 3.5 stat data, otherwise, it has
       3.6 stat data with 64bit size, 32bit nlink etc. */
-    i_stat_data_version_mask   =  0x0002,
+	i_stat_data_version_mask = 0x0002,
     /** file might need tail packing on close */
-    i_pack_on_close_mask       =  0x0004,
+	i_pack_on_close_mask = 0x0004,
     /** don't pack tail of file */
-    i_nopack_mask              =  0x0008,
+	i_nopack_mask = 0x0008,
     /** If those is set, "safe link" was created for this file during
       truncate or unlink. Safe link is used to avoid leakage of disk
       space on crash with some files open, but unlinked. */
-    i_link_saved_unlink_mask   =  0x0010,
-    i_link_saved_truncate_mask =  0x0020,
-    i_has_xattr_dir            =  0x0040,
-    i_data_log	               =  0x0080,
+	i_link_saved_unlink_mask = 0x0010,
+	i_link_saved_truncate_mask = 0x0020,
+	i_has_xattr_dir = 0x0040,
+	i_data_log = 0x0080,
 } reiserfs_inode_flags;
 
-
 struct reiserfs_inode_info {
-    __u32 i_key [4];/* key is still 4 32 bit integers */
+	__u32 i_key[4];		/* key is still 4 32 bit integers */
     /** transient inode flags that are never stored on disk. Bitmasks
       for this field are defined above. */
-    __u32 i_flags;
+	__u32 i_flags;
 
-    __u32 i_first_direct_byte; // offset of first byte stored in direct item.
+	__u32 i_first_direct_byte;	// offset of first byte stored in direct item.
 
-    /* copy of persistent inode flags read from sd_attrs. */
-    __u32 i_attrs;
+	/* copy of persistent inode flags read from sd_attrs. */
+	__u32 i_attrs;
 
-    int i_prealloc_block; /* first unused block of a sequence of unused blocks */
-    int i_prealloc_count; /* length of that sequence */
-    struct list_head i_prealloc_list; /* per-transaction list of inodes which
-                                       * have preallocated blocks */
+	int i_prealloc_block;	/* first unused block of a sequence of unused blocks */
+	int i_prealloc_count;	/* length of that sequence */
+	struct list_head i_prealloc_list;	/* per-transaction list of inodes which
+						 * have preallocated blocks */
 
-    unsigned new_packing_locality:1;  /* new_packig_locality is created; new blocks
-				       * for the contents of this directory should be
-				       * displaced */
+	unsigned new_packing_locality:1;	/* new_packing_locality is created; new blocks
+						 * for the contents of this directory should be
+						 * displaced */
 
-    /* we use these for fsync or O_SYNC to decide which transaction
-    ** needs to be committed in order for this inode to be properly
-    ** flushed */
-    unsigned long i_trans_id ;
-    struct reiserfs_journal_list *i_jl;
+	/* we use these for fsync or O_SYNC to decide which transaction
+	 ** needs to be committed in order for this inode to be properly
+	 ** flushed */
+	unsigned long i_trans_id;
+	struct reiserfs_journal_list *i_jl;
 
-    struct posix_acl *i_acl_access;
-    struct posix_acl *i_acl_default;
-    struct rw_semaphore xattr_sem;
-    struct inode vfs_inode;
+	struct posix_acl *i_acl_access;
+	struct posix_acl *i_acl_default;
+	struct rw_semaphore xattr_sem;
+	struct inode vfs_inode;
 };
 
 #endif
diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h
index 31c709d0fe18..3e68592e52e9 100644
--- a/include/linux/reiserfs_fs_sb.h
+++ b/include/linux/reiserfs_fs_sb.h
@@ -10,7 +10,7 @@
 #endif
 
 typedef enum {
-  reiserfs_attrs_cleared	= 0x00000001,
+	reiserfs_attrs_cleared = 0x00000001,
 } reiserfs_super_block_flags;
 
 /* struct reiserfs_super_block accessors/mutators
@@ -61,7 +61,7 @@ typedef enum {
 #define sb_umount_state(sbp)       (le16_to_cpu((sbp)->s_v1.s_umount_state))
 #define set_sb_umount_state(sbp,v) ((sbp)->s_v1.s_umount_state = cpu_to_le16(v))
 #define sb_fs_state(sbp)           (le16_to_cpu((sbp)->s_v1.s_fs_state))
-#define set_sb_fs_state(sbp,v)     ((sbp)->s_v1.s_fs_state = cpu_to_le16(v)) 
+#define set_sb_fs_state(sbp,v)     ((sbp)->s_v1.s_fs_state = cpu_to_le16(v))
 #define sb_hash_function_code(sbp) \
               (le32_to_cpu((sbp)->s_v1.s_hash_function_code))
 #define set_sb_hash_function_code(sbp,v) \
@@ -103,10 +103,10 @@ typedef enum {
 
 /* don't mess with these for a while */
 				/* we have a node size define somewhere in reiserfs_fs.h. -Hans */
-#define JOURNAL_BLOCK_SIZE  4096 /* BUG gotta get rid of this */
-#define JOURNAL_MAX_CNODE   1500 /* max cnodes to allocate. */
-#define JOURNAL_HASH_SIZE 8192   
-#define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating.  Must be >= 2 */
+#define JOURNAL_BLOCK_SIZE  4096	/* BUG gotta get rid of this */
+#define JOURNAL_MAX_CNODE   1500	/* max cnodes to allocate. */
+#define JOURNAL_HASH_SIZE 8192
+#define JOURNAL_NUM_BITMAPS 5	/* number of copies of the bitmaps to have floating.  Must be >= 2 */
 
 /* One of these for every block in every transaction
 ** Each one is in two hash tables.  First, a hash of the current transaction, and after journal_end, a
@@ -117,27 +117,27 @@ typedef enum {
 ** to a given transaction.
 */
 struct reiserfs_journal_cnode {
-  struct buffer_head *bh ;		 /* real buffer head */
-  struct super_block *sb ;		 /* dev of real buffer head */
-  __u32 blocknr ;		 /* block number of real buffer head, == 0 when buffer on disk */
-  long state ;
-  struct reiserfs_journal_list *jlist ;  /* journal list this cnode lives in */
-  struct reiserfs_journal_cnode *next ;  /* next in transaction list */
-  struct reiserfs_journal_cnode *prev ;  /* prev in transaction list */
-  struct reiserfs_journal_cnode *hprev ; /* prev in hash list */
-  struct reiserfs_journal_cnode *hnext ; /* next in hash list */
+	struct buffer_head *bh;	/* real buffer head */
+	struct super_block *sb;	/* dev of real buffer head */
+	__u32 blocknr;		/* block number of real buffer head, == 0 when buffer on disk */
+	long state;
+	struct reiserfs_journal_list *jlist;	/* journal list this cnode lives in */
+	struct reiserfs_journal_cnode *next;	/* next in transaction list */
+	struct reiserfs_journal_cnode *prev;	/* prev in transaction list */
+	struct reiserfs_journal_cnode *hprev;	/* prev in hash list */
+	struct reiserfs_journal_cnode *hnext;	/* next in hash list */
 };
 
 struct reiserfs_bitmap_node {
-  int id ;
-  char *data ;
-  struct list_head list ;
-} ;
+	int id;
+	char *data;
+	struct list_head list;
+};
 
 struct reiserfs_list_bitmap {
-  struct reiserfs_journal_list *journal_list ;
-  struct reiserfs_bitmap_node **bitmaps ;
-} ;
+	struct reiserfs_journal_list *journal_list;
+	struct reiserfs_bitmap_node **bitmaps;
+};
 
 /*
 ** one of these for each transaction.  The most important part here is the j_realblock.
@@ -146,273 +146,269 @@ struct reiserfs_list_bitmap {
 ** and to make sure every real block in a transaction is on disk before allowing the log area
 ** to be overwritten */
 struct reiserfs_journal_list {
-  unsigned long j_start ;
-  unsigned long j_state;
-  unsigned long j_len ;
-  atomic_t j_nonzerolen ;
-  atomic_t j_commit_left ;
-  atomic_t j_older_commits_done ;      /* all commits older than this on disk*/
-  struct semaphore j_commit_lock;
-  unsigned long j_trans_id ;
-  time_t j_timestamp ;
-  struct reiserfs_list_bitmap *j_list_bitmap ;
-  struct buffer_head *j_commit_bh ; /* commit buffer head */
-  struct reiserfs_journal_cnode *j_realblock  ;
-  struct reiserfs_journal_cnode *j_freedlist ; /* list of buffers that were freed during this trans.  free each of these on flush */
-  /* time ordered list of all active transactions */
-  struct list_head j_list;
-
-  /* time ordered list of all transactions we haven't tried to flush yet */
-  struct list_head j_working_list;
-
-  /* list of tail conversion targets in need of flush before commit */
-  struct list_head j_tail_bh_list;
-  /* list of data=ordered buffers in need of flush before commit */
-  struct list_head j_bh_list;
-  int j_refcount;
-} ;
+	unsigned long j_start;
+	unsigned long j_state;
+	unsigned long j_len;
+	atomic_t j_nonzerolen;
+	atomic_t j_commit_left;
+	atomic_t j_older_commits_done;	/* all commits older than this on disk */
+	struct semaphore j_commit_lock;
+	unsigned long j_trans_id;
+	time_t j_timestamp;
+	struct reiserfs_list_bitmap *j_list_bitmap;
+	struct buffer_head *j_commit_bh;	/* commit buffer head */
+	struct reiserfs_journal_cnode *j_realblock;
+	struct reiserfs_journal_cnode *j_freedlist;	/* list of buffers that were freed during this trans.  free each of these on flush */
+	/* time ordered list of all active transactions */
+	struct list_head j_list;
+
+	/* time ordered list of all transactions we haven't tried to flush yet */
+	struct list_head j_working_list;
+
+	/* list of tail conversion targets in need of flush before commit */
+	struct list_head j_tail_bh_list;
+	/* list of data=ordered buffers in need of flush before commit */
+	struct list_head j_bh_list;
+	int j_refcount;
+};
 
 struct reiserfs_journal {
-  struct buffer_head ** j_ap_blocks ; /* journal blocks on disk */
-  struct reiserfs_journal_cnode *j_last ; /* newest journal block */
-  struct reiserfs_journal_cnode *j_first ; /*  oldest journal block.  start here for traverse */
-
-  struct file         *j_dev_file;
-  struct block_device *j_dev_bd;  
-  int j_1st_reserved_block;     /* first block on s_dev of reserved area journal */        
-	
-  long j_state ;			
-  unsigned long j_trans_id ;
-  unsigned long j_mount_id ;
-  unsigned long j_start ;             /* start of current waiting commit (index into j_ap_blocks) */
-  unsigned long j_len ;               /* lenght of current waiting commit */
-  unsigned long j_len_alloc ;         /* number of buffers requested by journal_begin() */
-  atomic_t j_wcount ;            /* count of writers for current commit */
-  unsigned long j_bcount ;            /* batch count. allows turning X transactions into 1 */
-  unsigned long j_first_unflushed_offset ;  /* first unflushed transactions offset */
-  unsigned long j_last_flush_trans_id ;    /* last fully flushed journal timestamp */
-  struct buffer_head *j_header_bh ;   
-
-  time_t j_trans_start_time ;         /* time this transaction started */
-  struct semaphore j_lock;
-  struct semaphore j_flush_sem;
-  wait_queue_head_t j_join_wait ;    /* wait for current transaction to finish before starting new one */
-  atomic_t j_jlock ;                       /* lock for j_join_wait */
-  int j_list_bitmap_index ;	      /* number of next list bitmap to use */
-  int j_must_wait ;		       /* no more journal begins allowed. MUST sleep on j_join_wait */
-  int j_next_full_flush ;             /* next journal_end will flush all journal list */
-  int j_next_async_flush ;             /* next journal_end will flush all async commits */
-
-  int j_cnode_used ;	      /* number of cnodes on the used list */
-  int j_cnode_free ;          /* number of cnodes on the free list */
-
-  unsigned int j_trans_max ;           /* max number of blocks in a transaction.  */
-  unsigned int j_max_batch ;           /* max number of blocks to batch into a trans */
-  unsigned int j_max_commit_age ;      /* in seconds, how old can an async commit be */
-  unsigned int j_max_trans_age ;       /* in seconds, how old can a transaction be */
-  unsigned int j_default_max_commit_age ; /* the default for the max commit age */
-
-  struct reiserfs_journal_cnode *j_cnode_free_list ;
-  struct reiserfs_journal_cnode *j_cnode_free_orig ; /* orig pointer returned from vmalloc */
-
-  struct reiserfs_journal_list *j_current_jl;
-  int j_free_bitmap_nodes ;
-  int j_used_bitmap_nodes ;
-
-  int j_num_lists;      /* total number of active transactions */
-  int j_num_work_lists; /* number that need attention from kreiserfsd */
-
-  /* debugging to make sure things are flushed in order */
-  int j_last_flush_id;
-
-  /* debugging to make sure things are committed in order */
-  int j_last_commit_id;
-
-  struct list_head j_bitmap_nodes ;
-  struct list_head j_dirty_buffers ;
-  spinlock_t j_dirty_buffers_lock ; /* protects j_dirty_buffers */
-
-  /* list of all active transactions */
-  struct list_head j_journal_list;
-  /* lists that haven't been touched by writeback attempts */
-  struct list_head j_working_list;
-
-  struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS] ;	/* array of bitmaps to record the deleted blocks */
-  struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE] ; 	    /* hash table for real buffer heads in current trans */ 
-  struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for all the real buffer heads in all 
-  										the transactions */
-  struct list_head j_prealloc_list;     /* list of inodes which have preallocated blocks */
-  int j_persistent_trans;
-  unsigned long j_max_trans_size ;
-  unsigned long j_max_batch_size ;
-
-  int j_errno;
-
-  /* when flushing ordered buffers, throttle new ordered writers */
-  struct work_struct j_work;
-  atomic_t j_async_throttle;
+	struct buffer_head **j_ap_blocks;	/* journal blocks on disk */
+	struct reiserfs_journal_cnode *j_last;	/* newest journal block */
+	struct reiserfs_journal_cnode *j_first;	/*  oldest journal block.  start here for traverse */
+
+	struct file *j_dev_file;
+	struct block_device *j_dev_bd;
+	int j_1st_reserved_block;	/* first block on s_dev of reserved area journal */
+
+	long j_state;
+	unsigned long j_trans_id;
+	unsigned long j_mount_id;
+	unsigned long j_start;	/* start of current waiting commit (index into j_ap_blocks) */
+	unsigned long j_len;	/* length of current waiting commit */
+	unsigned long j_len_alloc;	/* number of buffers requested by journal_begin() */
+	atomic_t j_wcount;	/* count of writers for current commit */
+	unsigned long j_bcount;	/* batch count. allows turning X transactions into 1 */
+	unsigned long j_first_unflushed_offset;	/* first unflushed transactions offset */
+	unsigned long j_last_flush_trans_id;	/* last fully flushed journal timestamp */
+	struct buffer_head *j_header_bh;
+
+	time_t j_trans_start_time;	/* time this transaction started */
+	struct semaphore j_lock;
+	struct semaphore j_flush_sem;
+	wait_queue_head_t j_join_wait;	/* wait for current transaction to finish before starting new one */
+	atomic_t j_jlock;	/* lock for j_join_wait */
+	int j_list_bitmap_index;	/* number of next list bitmap to use */
+	int j_must_wait;	/* no more journal begins allowed. MUST sleep on j_join_wait */
+	int j_next_full_flush;	/* next journal_end will flush all journal list */
+	int j_next_async_flush;	/* next journal_end will flush all async commits */
+
+	int j_cnode_used;	/* number of cnodes on the used list */
+	int j_cnode_free;	/* number of cnodes on the free list */
+
+	unsigned int j_trans_max;	/* max number of blocks in a transaction.  */
+	unsigned int j_max_batch;	/* max number of blocks to batch into a trans */
+	unsigned int j_max_commit_age;	/* in seconds, how old can an async commit be */
+	unsigned int j_max_trans_age;	/* in seconds, how old can a transaction be */
+	unsigned int j_default_max_commit_age;	/* the default for the max commit age */
+
+	struct reiserfs_journal_cnode *j_cnode_free_list;
+	struct reiserfs_journal_cnode *j_cnode_free_orig;	/* orig pointer returned from vmalloc */
+
+	struct reiserfs_journal_list *j_current_jl;
+	int j_free_bitmap_nodes;
+	int j_used_bitmap_nodes;
+
+	int j_num_lists;	/* total number of active transactions */
+	int j_num_work_lists;	/* number that need attention from kreiserfsd */
+
+	/* debugging to make sure things are flushed in order */
+	int j_last_flush_id;
+
+	/* debugging to make sure things are committed in order */
+	int j_last_commit_id;
+
+	struct list_head j_bitmap_nodes;
+	struct list_head j_dirty_buffers;
+	spinlock_t j_dirty_buffers_lock;	/* protects j_dirty_buffers */
+
+	/* list of all active transactions */
+	struct list_head j_journal_list;
+	/* lists that haven't been touched by writeback attempts */
+	struct list_head j_working_list;
+
+	struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS];	/* array of bitmaps to record the deleted blocks */
+	struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE];	/* hash table for real buffer heads in current trans */
+	struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE];	/* hash table for all the real buffer heads in all 
+										   the transactions */
+	struct list_head j_prealloc_list;	/* list of inodes which have preallocated blocks */
+	int j_persistent_trans;
+	unsigned long j_max_trans_size;
+	unsigned long j_max_batch_size;
+
+	int j_errno;
+
+	/* when flushing ordered buffers, throttle new ordered writers */
+	struct work_struct j_work;
+	atomic_t j_async_throttle;
 };
 
 enum journal_state_bits {
-    J_WRITERS_BLOCKED = 1,   /* set when new writers not allowed */
-    J_WRITERS_QUEUED,        /* set when log is full due to too many writers */
-    J_ABORTED,               /* set when log is aborted */
+	J_WRITERS_BLOCKED = 1,	/* set when new writers not allowed */
+	J_WRITERS_QUEUED,	/* set when log is full due to too many writers */
+	J_ABORTED,		/* set when log is aborted */
 };
 
+#define JOURNAL_DESC_MAGIC "ReIsErLB"	/* ick.  magic string to find desc blocks in the journal */
 
-#define JOURNAL_DESC_MAGIC "ReIsErLB" /* ick.  magic string to find desc blocks in the journal */
+typedef __u32(*hashf_t) (const signed char *, int);
 
-typedef __u32 (*hashf_t) (const signed char *, int);
-
-struct reiserfs_bitmap_info
-{
-    // FIXME: Won't work with block sizes > 8K
-    __u16  first_zero_hint;
-    __u16  free_count;
-    struct buffer_head *bh; /* the actual bitmap */
+struct reiserfs_bitmap_info {
+	// FIXME: Won't work with block sizes > 8K
+	__u16 first_zero_hint;
+	__u16 free_count;
+	struct buffer_head *bh;	/* the actual bitmap */
 };
 
 struct proc_dir_entry;
 
 #if defined( CONFIG_PROC_FS ) && defined( CONFIG_REISERFS_PROC_INFO )
 typedef unsigned long int stat_cnt_t;
-typedef struct reiserfs_proc_info_data
-{
-  spinlock_t lock;
-  int exiting;
-  int max_hash_collisions;
-
-  stat_cnt_t breads;
-  stat_cnt_t bread_miss;
-  stat_cnt_t search_by_key;
-  stat_cnt_t search_by_key_fs_changed;
-  stat_cnt_t search_by_key_restarted;
-
-  stat_cnt_t insert_item_restarted;
-  stat_cnt_t paste_into_item_restarted;
-  stat_cnt_t cut_from_item_restarted;
-  stat_cnt_t delete_solid_item_restarted;
-  stat_cnt_t delete_item_restarted;
-
-  stat_cnt_t leaked_oid;
-  stat_cnt_t leaves_removable;
-
-  /* balances per level. Use explicit 5 as MAX_HEIGHT is not visible yet. */
-  stat_cnt_t balance_at[ 5 ]; /* XXX */
-  /* sbk == search_by_key */
-  stat_cnt_t sbk_read_at[ 5 ]; /* XXX */
-  stat_cnt_t sbk_fs_changed[ 5 ];
-  stat_cnt_t sbk_restarted[ 5 ];
-  stat_cnt_t items_at[ 5 ]; /* XXX */
-  stat_cnt_t free_at[ 5 ]; /* XXX */
-  stat_cnt_t can_node_be_removed[ 5 ]; /* XXX */
-  long int lnum[ 5 ]; /* XXX */
-  long int rnum[ 5 ]; /* XXX */
-  long int lbytes[ 5 ]; /* XXX */
-  long int rbytes[ 5 ]; /* XXX */
-  stat_cnt_t get_neighbors[ 5 ];
-  stat_cnt_t get_neighbors_restart[ 5 ];
-  stat_cnt_t need_l_neighbor[ 5 ];
-  stat_cnt_t need_r_neighbor[ 5 ];
-
-  stat_cnt_t free_block;
-  struct __scan_bitmap_stats {
-	stat_cnt_t call;
-	stat_cnt_t wait;
-	stat_cnt_t bmap;
-	stat_cnt_t retry;
-	stat_cnt_t in_journal_hint;
-	stat_cnt_t in_journal_nohint;
-	stat_cnt_t stolen;
-  } scan_bitmap;
-  struct __journal_stats {
-	stat_cnt_t in_journal;
-	stat_cnt_t in_journal_bitmap;
-	stat_cnt_t in_journal_reusable;
-	stat_cnt_t lock_journal;
-	stat_cnt_t lock_journal_wait;
-	stat_cnt_t journal_being;
-	stat_cnt_t journal_relock_writers;
-	stat_cnt_t journal_relock_wcount;
-	stat_cnt_t mark_dirty;
-	stat_cnt_t mark_dirty_already;
-	stat_cnt_t mark_dirty_notjournal;
-	stat_cnt_t restore_prepared;
-	stat_cnt_t prepare;
-	stat_cnt_t prepare_retry;
-  } journal;
+typedef struct reiserfs_proc_info_data {
+	spinlock_t lock;
+	int exiting;
+	int max_hash_collisions;
+
+	stat_cnt_t breads;
+	stat_cnt_t bread_miss;
+	stat_cnt_t search_by_key;
+	stat_cnt_t search_by_key_fs_changed;
+	stat_cnt_t search_by_key_restarted;
+
+	stat_cnt_t insert_item_restarted;
+	stat_cnt_t paste_into_item_restarted;
+	stat_cnt_t cut_from_item_restarted;
+	stat_cnt_t delete_solid_item_restarted;
+	stat_cnt_t delete_item_restarted;
+
+	stat_cnt_t leaked_oid;
+	stat_cnt_t leaves_removable;
+
+	/* balances per level. Use explicit 5 as MAX_HEIGHT is not visible yet. */
+	stat_cnt_t balance_at[5];	/* XXX */
+	/* sbk == search_by_key */
+	stat_cnt_t sbk_read_at[5];	/* XXX */
+	stat_cnt_t sbk_fs_changed[5];
+	stat_cnt_t sbk_restarted[5];
+	stat_cnt_t items_at[5];	/* XXX */
+	stat_cnt_t free_at[5];	/* XXX */
+	stat_cnt_t can_node_be_removed[5];	/* XXX */
+	long int lnum[5];	/* XXX */
+	long int rnum[5];	/* XXX */
+	long int lbytes[5];	/* XXX */
+	long int rbytes[5];	/* XXX */
+	stat_cnt_t get_neighbors[5];
+	stat_cnt_t get_neighbors_restart[5];
+	stat_cnt_t need_l_neighbor[5];
+	stat_cnt_t need_r_neighbor[5];
+
+	stat_cnt_t free_block;
+	struct __scan_bitmap_stats {
+		stat_cnt_t call;
+		stat_cnt_t wait;
+		stat_cnt_t bmap;
+		stat_cnt_t retry;
+		stat_cnt_t in_journal_hint;
+		stat_cnt_t in_journal_nohint;
+		stat_cnt_t stolen;
+	} scan_bitmap;
+	struct __journal_stats {
+		stat_cnt_t in_journal;
+		stat_cnt_t in_journal_bitmap;
+		stat_cnt_t in_journal_reusable;
+		stat_cnt_t lock_journal;
+		stat_cnt_t lock_journal_wait;
+		stat_cnt_t journal_being;
+		stat_cnt_t journal_relock_writers;
+		stat_cnt_t journal_relock_wcount;
+		stat_cnt_t mark_dirty;
+		stat_cnt_t mark_dirty_already;
+		stat_cnt_t mark_dirty_notjournal;
+		stat_cnt_t restore_prepared;
+		stat_cnt_t prepare;
+		stat_cnt_t prepare_retry;
+	} journal;
 } reiserfs_proc_info_data_t;
 #else
-typedef struct reiserfs_proc_info_data
-{} reiserfs_proc_info_data_t;
+typedef struct reiserfs_proc_info_data {
+} reiserfs_proc_info_data_t;
 #endif
 
 /* reiserfs union of in-core super block data */
-struct reiserfs_sb_info
-{
-    struct buffer_head * s_sbh;                   /* Buffer containing the super block */
-				/* both the comment and the choice of
-                                   name are unclear for s_rs -Hans */
-    struct reiserfs_super_block * s_rs;           /* Pointer to the super block in the buffer */
-    struct reiserfs_bitmap_info * s_ap_bitmap;
-    struct reiserfs_journal *s_journal ;		/* pointer to journal information */
-    unsigned short s_mount_state;                 /* reiserfs state (valid, invalid) */
-  
-				/* Comment? -Hans */
-    void (*end_io_handler)(struct buffer_head *, int);
-    hashf_t s_hash_function;	/* pointer to function which is used
-                                   to sort names in directory. Set on
-                                   mount */
-    unsigned long s_mount_opt;	/* reiserfs's mount options are set
-                                   here (currently - NOTAIL, NOLOG,
-                                   REPLAYONLY) */
-
-    struct {			/* This is a structure that describes block allocator options */
-	unsigned long bits;	/* Bitfield for enable/disable kind of options */
-	unsigned long large_file_size; /* size started from which we consider file to be a large one(in blocks) */
-	int border;		/* percentage of disk, border takes */
-	int preallocmin;	/* Minimal file size (in blocks) starting from which we do preallocations */
-	int preallocsize;	/* Number of blocks we try to prealloc when file
-				   reaches preallocmin size (in blocks) or
-				   prealloc_list is empty. */
-    } s_alloc_options;
-
-				/* Comment? -Hans */
-    wait_queue_head_t s_wait;
-				/* To be obsoleted soon by per buffer seals.. -Hans */
-    atomic_t s_generation_counter; // increased by one every time the
-    // tree gets re-balanced
-    unsigned long s_properties;    /* File system properties. Currently holds
-				     on-disk FS format */
-    
-    /* session statistics */
-    int s_kmallocs;
-    int s_disk_reads;
-    int s_disk_writes;
-    int s_fix_nodes;
-    int s_do_balance;
-    int s_unneeded_left_neighbor;
-    int s_good_search_by_key_reada;
-    int s_bmaps;
-    int s_bmaps_without_search;
-    int s_direct2indirect;
-    int s_indirect2direct;
+struct reiserfs_sb_info {
+	struct buffer_head *s_sbh;	/* Buffer containing the super block */
+	/* both the comment and the choice of
+	   name are unclear for s_rs -Hans */
+	struct reiserfs_super_block *s_rs;	/* Pointer to the super block in the buffer */
+	struct reiserfs_bitmap_info *s_ap_bitmap;
+	struct reiserfs_journal *s_journal;	/* pointer to journal information */
+	unsigned short s_mount_state;	/* reiserfs state (valid, invalid) */
+
+	/* Comment? -Hans */
+	void (*end_io_handler) (struct buffer_head *, int);
+	hashf_t s_hash_function;	/* pointer to function which is used
+					   to sort names in directory. Set on
+					   mount */
+	unsigned long s_mount_opt;	/* reiserfs's mount options are set
+					   here (currently - NOTAIL, NOLOG,
+					   REPLAYONLY) */
+
+	struct {		/* This is a structure that describes block allocator options */
+		unsigned long bits;	/* Bitfield for enable/disable kind of options */
+		unsigned long large_file_size;	/* size (in blocks) starting from which we consider a file to be a large one */
+		int border;	/* percentage of disk, border takes */
+		int preallocmin;	/* Minimal file size (in blocks) starting from which we do preallocations */
+		int preallocsize;	/* Number of blocks we try to prealloc when file
+					   reaches preallocmin size (in blocks) or
+					   prealloc_list is empty. */
+	} s_alloc_options;
+
+	/* Comment? -Hans */
+	wait_queue_head_t s_wait;
+	/* To be obsoleted soon by per buffer seals.. -Hans */
+	atomic_t s_generation_counter;	// increased by one every time the
+	// tree gets re-balanced
+	unsigned long s_properties;	/* File system properties. Currently holds
+					   on-disk FS format */
+
+	/* session statistics */
+	int s_kmallocs;
+	int s_disk_reads;
+	int s_disk_writes;
+	int s_fix_nodes;
+	int s_do_balance;
+	int s_unneeded_left_neighbor;
+	int s_good_search_by_key_reada;
+	int s_bmaps;
+	int s_bmaps_without_search;
+	int s_direct2indirect;
+	int s_indirect2direct;
 	/* set up when it's ok for reiserfs_read_inode2() to read from
 	   disk inode with nlink==0. Currently this is only used during
 	   finish_unfinished() processing at mount time */
-    int s_is_unlinked_ok;
-    reiserfs_proc_info_data_t s_proc_info_data;
-    struct proc_dir_entry *procdir;
-    int reserved_blocks; /* amount of blocks reserved for further allocations */
-    spinlock_t bitmap_lock; /* this lock on now only used to protect reserved_blocks variable */
-    struct dentry *priv_root; /* root of /.reiserfs_priv */
-    struct dentry *xattr_root; /* root of /.reiserfs_priv/.xa */
-    struct rw_semaphore xattr_dir_sem;
-
-    int j_errno;
+	int s_is_unlinked_ok;
+	reiserfs_proc_info_data_t s_proc_info_data;
+	struct proc_dir_entry *procdir;
+	int reserved_blocks;	/* amount of blocks reserved for further allocations */
+	spinlock_t bitmap_lock;	/* this lock is now only used to protect the reserved_blocks variable */
+	struct dentry *priv_root;	/* root of /.reiserfs_priv */
+	struct dentry *xattr_root;	/* root of /.reiserfs_priv/.xa */
+	struct rw_semaphore xattr_dir_sem;
+
+	int j_errno;
 #ifdef CONFIG_QUOTA
-    char *s_qf_names[MAXQUOTAS];
-    int s_jquota_fmt;
+	char *s_qf_names[MAXQUOTAS];
+	int s_jquota_fmt;
 #endif
 };
 
@@ -422,14 +418,14 @@ struct reiserfs_sb_info
 
 enum reiserfs_mount_options {
 /* Mount options */
-    REISERFS_LARGETAIL,  /* large tails will be created in a session */
-    REISERFS_SMALLTAIL,  /* small (for files less than block size) tails will be created in a session */
-    REPLAYONLY, /* replay journal and return 0. Use by fsck */
-    REISERFS_CONVERT,    /* -o conv: causes conversion of old
-                                 format super block to the new
-                                 format. If not specified - old
-                                 partition will be dealt with in a
-                                 manner of 3.5.x */
+	REISERFS_LARGETAIL,	/* large tails will be created in a session */
+	REISERFS_SMALLTAIL,	/* small (for files less than block size) tails will be created in a session */
+	REPLAYONLY,		/* replay journal and return 0. Use by fsck */
+	REISERFS_CONVERT,	/* -o conv: causes conversion of old
+				   format super block to the new
+				   format. If not specified - old
+				   partition will be dealt with in a
+				   manner of 3.5.x */
 
 /* -o hash={tea, rupasov, r5, detect} is meant for properly mounting 
 ** reiserfs disks from 3.5.19 or earlier.  99% of the time, this option
@@ -439,41 +435,41 @@ enum reiserfs_mount_options {
 ** the existing hash on the FS, so if you have a tea hash disk, and mount
 ** with -o hash=rupasov, the mount will fail.
 */
-    FORCE_TEA_HASH,      /* try to force tea hash on mount */
-    FORCE_RUPASOV_HASH,  /* try to force rupasov hash on mount */
-    FORCE_R5_HASH,       /* try to force rupasov hash on mount */
-    FORCE_HASH_DETECT,   /* try to detect hash function on mount */
+	FORCE_TEA_HASH,		/* try to force tea hash on mount */
+	FORCE_RUPASOV_HASH,	/* try to force rupasov hash on mount */
+	FORCE_R5_HASH,		/* try to force r5 hash on mount */
+	FORCE_HASH_DETECT,	/* try to detect hash function on mount */
 
-    REISERFS_DATA_LOG,
-    REISERFS_DATA_ORDERED,
-    REISERFS_DATA_WRITEBACK,
+	REISERFS_DATA_LOG,
+	REISERFS_DATA_ORDERED,
+	REISERFS_DATA_WRITEBACK,
 
 /* used for testing experimental features, makes benchmarking new
    features with and without more convenient, should never be used by
    users in any code shipped to users (ideally) */
 
-    REISERFS_NO_BORDER,
-    REISERFS_NO_UNHASHED_RELOCATION,
-    REISERFS_HASHED_RELOCATION,
-    REISERFS_ATTRS,
-    REISERFS_XATTRS,
-    REISERFS_XATTRS_USER,
-    REISERFS_POSIXACL,
-    REISERFS_BARRIER_NONE,
-    REISERFS_BARRIER_FLUSH,
-
-    /* Actions on error */
-    REISERFS_ERROR_PANIC,
-    REISERFS_ERROR_RO,
-    REISERFS_ERROR_CONTINUE,
-
-    REISERFS_QUOTA,		/* Some quota option specified */
-
-    REISERFS_TEST1,
-    REISERFS_TEST2,
-    REISERFS_TEST3,
-    REISERFS_TEST4,
-    REISERFS_UNSUPPORTED_OPT,
+	REISERFS_NO_BORDER,
+	REISERFS_NO_UNHASHED_RELOCATION,
+	REISERFS_HASHED_RELOCATION,
+	REISERFS_ATTRS,
+	REISERFS_XATTRS,
+	REISERFS_XATTRS_USER,
+	REISERFS_POSIXACL,
+	REISERFS_BARRIER_NONE,
+	REISERFS_BARRIER_FLUSH,
+
+	/* Actions on error */
+	REISERFS_ERROR_PANIC,
+	REISERFS_ERROR_RO,
+	REISERFS_ERROR_CONTINUE,
+
+	REISERFS_QUOTA,		/* Some quota option specified */
+
+	REISERFS_TEST1,
+	REISERFS_TEST2,
+	REISERFS_TEST3,
+	REISERFS_TEST4,
+	REISERFS_UNSUPPORTED_OPT,
 };
 
 #define reiserfs_r5_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_R5_HASH))
@@ -504,18 +500,17 @@ enum reiserfs_mount_options {
 #define reiserfs_error_panic(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_PANIC))
 #define reiserfs_error_ro(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ERROR_RO))
 
-void reiserfs_file_buffer (struct buffer_head * bh, int list);
+void reiserfs_file_buffer(struct buffer_head *bh, int list);
 extern struct file_system_type reiserfs_fs_type;
-int reiserfs_resize(struct super_block *, unsigned long) ;
+int reiserfs_resize(struct super_block *, unsigned long);
 
 #define CARRY_ON                0
 #define SCHEDULE_OCCURRED       1
 
-
 #define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh)
 #define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal)
 #define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block)
-#define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free) 
+#define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free)
 #define SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap)
 
 #define SB_DISK_JOURNAL_HEAD(s) (SB_JOURNAL(s)->j_header_bh->)
@@ -525,13 +520,14 @@ int reiserfs_resize(struct super_block *, unsigned long) ;
  */
 static inline char *reiserfs_bdevname(struct super_block *s)
 {
-        return (s == NULL) ? "Null superblock" : s -> s_id;
+	return (s == NULL) ? "Null superblock" : s->s_id;
 }
 
 #define reiserfs_is_journal_aborted(journal) (unlikely (__reiserfs_is_journal_aborted (journal)))
-static inline int __reiserfs_is_journal_aborted (struct reiserfs_journal *journal)
+static inline int __reiserfs_is_journal_aborted(struct reiserfs_journal
+						*journal)
 {
-    return test_bit (J_ABORTED, &journal->j_state);
+	return test_bit(J_ABORTED, &journal->j_state);
 }
 
-#endif	/* _LINUX_REISER_FS_SB */
+#endif				/* _LINUX_REISER_FS_SB */
diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h
index 9244c5748820..c84354e8374c 100644
--- a/include/linux/reiserfs_xattr.h
+++ b/include/linux/reiserfs_xattr.h
@@ -7,48 +7,48 @@
 #include <linux/xattr.h>
 
 /* Magic value in header */
-#define REISERFS_XATTR_MAGIC 0x52465841 /* "RFXA" */
+#define REISERFS_XATTR_MAGIC 0x52465841	/* "RFXA" */
 
 struct reiserfs_xattr_header {
-    __le32 h_magic;              /* magic number for identification */
-    __le32 h_hash;               /* hash of the value */
+	__le32 h_magic;		/* magic number for identification */
+	__le32 h_hash;		/* hash of the value */
 };
 
 #ifdef __KERNEL__
 
 struct reiserfs_xattr_handler {
 	char *prefix;
-        int (*init)(void);
-        void (*exit)(void);
-	int (*get)(struct inode *inode, const char *name, void *buffer,
-		   size_t size);
-	int (*set)(struct inode *inode, const char *name, const void *buffer,
-		   size_t size, int flags);
-	int (*del)(struct inode *inode, const char *name);
-        int (*list)(struct inode *inode, const char *name, int namelen, char *out);
-        struct list_head handlers;
+	int (*init) (void);
+	void (*exit) (void);
+	int (*get) (struct inode * inode, const char *name, void *buffer,
+		    size_t size);
+	int (*set) (struct inode * inode, const char *name, const void *buffer,
+		    size_t size, int flags);
+	int (*del) (struct inode * inode, const char *name);
+	int (*list) (struct inode * inode, const char *name, int namelen,
+		     char *out);
+	struct list_head handlers;
 };
 
-
 #ifdef CONFIG_REISERFS_FS_XATTR
 #define is_reiserfs_priv_object(inode) IS_PRIVATE(inode)
 #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir)
-ssize_t reiserfs_getxattr (struct dentry *dentry, const char *name,
-			   void *buffer, size_t size);
-int reiserfs_setxattr (struct dentry *dentry, const char *name,
-                       const void *value, size_t size, int flags);
-ssize_t reiserfs_listxattr (struct dentry *dentry, char *buffer, size_t size);
-int reiserfs_removexattr (struct dentry *dentry, const char *name);
-int reiserfs_delete_xattrs (struct inode *inode);
-int reiserfs_chown_xattrs (struct inode *inode, struct iattr *attrs);
-int reiserfs_xattr_init (struct super_block *sb, int mount_flags);
-int reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd);
-int reiserfs_permission_locked (struct inode *inode, int mask, struct nameidata *nd);
-
-int reiserfs_xattr_del (struct inode *, const char *);
-int reiserfs_xattr_get (const struct inode *, const char *, void *, size_t);
-int reiserfs_xattr_set (struct inode *, const char *, const void *,
-                               size_t, int);
+ssize_t reiserfs_getxattr(struct dentry *dentry, const char *name,
+			  void *buffer, size_t size);
+int reiserfs_setxattr(struct dentry *dentry, const char *name,
+		      const void *value, size_t size, int flags);
+ssize_t reiserfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+int reiserfs_removexattr(struct dentry *dentry, const char *name);
+int reiserfs_delete_xattrs(struct inode *inode);
+int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs);
+int reiserfs_xattr_init(struct super_block *sb, int mount_flags);
+int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd);
+int reiserfs_permission_locked(struct inode *inode, int mask,
+			       struct nameidata *nd);
+
+int reiserfs_xattr_del(struct inode *, const char *);
+int reiserfs_xattr_get(const struct inode *, const char *, void *, size_t);
+int reiserfs_xattr_set(struct inode *, const char *, const void *, size_t, int);
 
 extern struct reiserfs_xattr_handler user_handler;
 extern struct reiserfs_xattr_handler trusted_handler;
@@ -56,57 +56,48 @@ extern struct reiserfs_xattr_handler trusted_handler;
 extern struct reiserfs_xattr_handler security_handler;
 #endif
 
-int reiserfs_xattr_register_handlers (void) __init;
-void reiserfs_xattr_unregister_handlers (void);
+int reiserfs_xattr_register_handlers(void) __init;
+void reiserfs_xattr_unregister_handlers(void);
 
-static inline void
-reiserfs_write_lock_xattrs(struct super_block *sb)
+static inline void reiserfs_write_lock_xattrs(struct super_block *sb)
 {
-    down_write (&REISERFS_XATTR_DIR_SEM(sb));
+	down_write(&REISERFS_XATTR_DIR_SEM(sb));
 }
-static inline void
-reiserfs_write_unlock_xattrs(struct super_block *sb)
+static inline void reiserfs_write_unlock_xattrs(struct super_block *sb)
 {
-    up_write (&REISERFS_XATTR_DIR_SEM(sb));
+	up_write(&REISERFS_XATTR_DIR_SEM(sb));
 }
-static inline void
-reiserfs_read_lock_xattrs(struct super_block *sb)
+static inline void reiserfs_read_lock_xattrs(struct super_block *sb)
 {
-    down_read (&REISERFS_XATTR_DIR_SEM(sb));
+	down_read(&REISERFS_XATTR_DIR_SEM(sb));
 }
 
-static inline void
-reiserfs_read_unlock_xattrs(struct super_block *sb)
+static inline void reiserfs_read_unlock_xattrs(struct super_block *sb)
 {
-    up_read (&REISERFS_XATTR_DIR_SEM(sb));
+	up_read(&REISERFS_XATTR_DIR_SEM(sb));
 }
 
-static inline void
-reiserfs_write_lock_xattr_i(struct inode *inode)
+static inline void reiserfs_write_lock_xattr_i(struct inode *inode)
 {
-    down_write (&REISERFS_I(inode)->xattr_sem);
+	down_write(&REISERFS_I(inode)->xattr_sem);
 }
-static inline void
-reiserfs_write_unlock_xattr_i(struct inode *inode)
+static inline void reiserfs_write_unlock_xattr_i(struct inode *inode)
 {
-    up_write (&REISERFS_I(inode)->xattr_sem);
+	up_write(&REISERFS_I(inode)->xattr_sem);
 }
-static inline void
-reiserfs_read_lock_xattr_i(struct inode *inode)
+static inline void reiserfs_read_lock_xattr_i(struct inode *inode)
 {
-    down_read (&REISERFS_I(inode)->xattr_sem);
+	down_read(&REISERFS_I(inode)->xattr_sem);
 }
 
-static inline void
-reiserfs_read_unlock_xattr_i(struct inode *inode)
+static inline void reiserfs_read_unlock_xattr_i(struct inode *inode)
 {
-    up_read (&REISERFS_I(inode)->xattr_sem);
+	up_read(&REISERFS_I(inode)->xattr_sem);
 }
 
-static inline void
-reiserfs_mark_inode_private(struct inode *inode)
+static inline void reiserfs_mark_inode_private(struct inode *inode)
 {
-    inode->i_flags |= S_PRIVATE;
+	inode->i_flags |= S_PRIVATE;
 }
 
 #else
@@ -127,13 +118,20 @@ reiserfs_mark_inode_private(struct inode *inode)
 #define reiserfs_xattr_register_handlers() 0
 #define reiserfs_xattr_unregister_handlers()
 
-static inline int reiserfs_delete_xattrs (struct inode *inode) { return 0; };
-static inline int reiserfs_chown_xattrs (struct inode *inode, struct iattr *attrs) { return 0; };
-static inline int reiserfs_xattr_init (struct super_block *sb, int mount_flags)
+static inline int reiserfs_delete_xattrs(struct inode *inode)
+{
+	return 0;
+};
+static inline int reiserfs_chown_xattrs(struct inode *inode,
+					struct iattr *attrs)
+{
+	return 0;
+};
+static inline int reiserfs_xattr_init(struct super_block *sb, int mount_flags)
 {
-    sb->s_flags = (sb->s_flags & ~MS_POSIXACL); /* to be sure */
-    return 0;
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL);	/* to be sure */
+	return 0;
 };
 #endif
 
-#endif  /* __KERNEL__ */
+#endif				/* __KERNEL__ */
-- 
cgit v1.2.3-59-g8ed1b


From 0eeca28300df110bd6ed54b31193c83b87921443 Mon Sep 17 00:00:00 2001
From: Robert Love <rml@novell.com>
Date: Tue, 12 Jul 2005 17:06:03 -0400
Subject: [PATCH] inotify

inotify is intended to correct the deficiencies of dnotify, particularly
its inability to scale and its terrible user interface:

        * dnotify requires the opening of one fd per each directory
          that you intend to watch. This quickly results in too many
          open files and pins removable media, preventing unmount.
        * dnotify is directory-based. You only learn about changes to
          directories. Sure, a change to a file in a directory affects
          the directory, but you are then forced to keep a cache of
          stat structures.
        * dnotify's interface to user-space is awful.  Signals?

inotify provides a more usable, simple, powerful solution to file change
notification:

        * inotify's interface is a system call that returns a fd, not SIGIO.
	  You get a single fd, which is select()-able.
        * inotify has an event that says "the filesystem that the item
          you were watching is on was unmounted."
        * inotify can watch directories or files.

Inotify is currently used by Beagle (a desktop search infrastructure),
Gamin (a FAM replacement), and other projects.

See Documentation/filesystems/inotify.txt.

Signed-off-by: Robert Love <rml@novell.com>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/filesystems/inotify.txt | 138 +++++
 arch/i386/kernel/syscall_table.S      |   3 +
 fs/Kconfig                            |  13 +
 fs/Makefile                           |   1 +
 fs/attr.c                             |  33 +-
 fs/compat.c                           |  12 +-
 fs/file_table.c                       |   3 +
 fs/inode.c                            |   6 +
 fs/inotify.c                          | 999 ++++++++++++++++++++++++++++++++++
 fs/namei.c                            |  30 +-
 fs/nfsd/vfs.c                         |   6 +-
 fs/open.c                             |   3 +-
 fs/read_write.c                       |  15 +-
 fs/sysfs/file.c                       |   7 +-
 fs/xattr.c                            |   5 +-
 include/asm-i386/unistd.h             |   5 +-
 include/linux/fs.h                    |   6 +-
 include/linux/fsnotify.h              | 248 +++++++++
 include/linux/inotify.h               | 108 ++++
 include/linux/sched.h                 |   4 +
 include/linux/sysctl.h                |  11 +-
 kernel/sys_ni.c                       |   3 +
 kernel/sysctl.c                       |  43 +-
 kernel/user.c                         |   4 +
 24 files changed, 1639 insertions(+), 67 deletions(-)
 create mode 100644 Documentation/filesystems/inotify.txt
 create mode 100644 fs/inotify.c
 create mode 100644 include/linux/fsnotify.h
 create mode 100644 include/linux/inotify.h

(limited to 'include/linux')

diff --git a/Documentation/filesystems/inotify.txt b/Documentation/filesystems/inotify.txt
new file mode 100644
index 000000000000..2c716041f578
--- /dev/null
+++ b/Documentation/filesystems/inotify.txt
@@ -0,0 +1,138 @@
+				    inotify
+	     a powerful yet simple file change notification system
+
+
+
+Document started 15 Mar 2005 by Robert Love <rml@novell.com>
+
+(i) User Interface
+
+Inotify is controlled by a set of three system calls.
+
+The first step in using inotify is to initialise an inotify instance:
+
+	int fd = inotify_init ();
+
+Change events are managed by "watches".  A watch is an (object,mask) pair where
+the object is a file or directory and the mask is a bit mask of one or more
+inotify events that the application wishes to receive.  See <linux/inotify.h>
+for valid events.  A watch is referenced by a watch descriptor, or wd.
+
+Watches are added via a path to the file.
+
+Watches on a directory will return events on any files inside of the directory.
+
+Adding a watch is simple,
+
+	int wd = inotify_add_watch (fd, path, mask);
+
+You can add a large number of files via something like
+
+	for each file to watch {
+		int wd = inotify_add_watch (fd, file, mask);
+	}
+
+You can update an existing watch in the same manner, by passing in a new mask.
+
+An existing watch is removed via the inotify_rm_watch system call, for example
+
+	inotify_rm_watch (fd, wd);
+
+Events are provided in the form of an inotify_event structure that is read(2)
+from an inotify instance fd.  The filename is of dynamic length and follows the
+struct.  It is of size len.  The filename is padded with null bytes to ensure
+proper alignment.  This padding is reflected in len.
+
+You can slurp multiple events by passing a large buffer, for example
+
+	size_t len = read (fd, buf, BUF_LEN);
+
+This will return as many events as are available and fit in BUF_LEN.
+
+Each inotify instance fd is also select()- and poll()-able.
+
+You can find the size of the current event queue via the FIONREAD ioctl.
+
+All watches are destroyed and cleaned up on close.
+
+
+(ii) Internal Kernel Implementation
+
+Each open inotify instance is associated with an inotify_device structure.
+
+Each watch is associated with an inotify_watch structure.  Watches are chained
+off of each associated device and each associated inode.
+
+See fs/inotify.c for the locking and lifetime rules.
+
+
+(iii) Rationale
+
+Q: What is the design decision behind not tying the watch to the open fd of
+   the watched object?
+
+A: Watches are associated with an open inotify device, not an open file.
+   This solves the primary problem with dnotify: keeping the file open pins
+   the file and thus, worse, pins the mount.  Dnotify is therefore infeasible
+   for use on a desktop system with removable media as the media cannot be
+   unmounted.
+
+Q: What is the design decision behind using an-fd-per-device as opposed to
+   an fd-per-watch?
+
+A: An fd-per-watch quickly consumes more file descriptors than are allowed,
+   more fd's than are feasible to manage, and more fd's than are optimally
+   select()-able.  Yes, root can bump the per-process fd limit and yes, users
+   can use epoll, but requiring both is a silly and extraneous requirement.
+   A watch consumes less memory than an open file, separating the number
+   spaces is thus sensible.  The current design is what user-space developers
+   want: Users initialize inotify, once, and add n watches, requiring but one fd
+   and no twiddling with fd limits.  Initializing an inotify instance two
+   thousand times is silly.  If we can implement user-space's preferences 
+   cleanly--and we can, the idr layer makes stuff like this trivial--then we 
+   should.
+
+   There are other good arguments.  With a single fd, there is a single
+   item to block on, which is mapped to a single queue of events.  The single
+   fd returns all watch events and also any potential out-of-band data.  If
+   every fd was a separate watch,
+
+   - There would be no way to get event ordering.  Events on file foo and
+     file bar would pop poll() on both fd's, but there would be no way to tell
+     which happened first.  A single queue trivially gives you ordering.  Such
+     ordering is crucial to existing applications such as Beagle.  Imagine
+     "mv a b ; mv b a" events without ordering.
+
+   - We'd have to maintain n fd's and n internal queues with state,
+     versus just one.  It is a lot messier in the kernel.  A single, linear
+     queue is the data structure that makes sense.
+
+   - User-space developers prefer the current API.  The Beagle guys, for
+     example, love it.  Trust me, I asked.  It is not a surprise: Who'd want
+     to manage and block on 1000 fd's via select?
+
+   - You'd have to manage the fd's, as an example: Call close() when you
+     received a delete event.
+
+   - No way to get out of band data.
+
+   - 1024 is still too low.  ;-)
+
+   When you talk about designing a file change notification system that
+   scales to 1000s of directories, juggling 1000s of fd's just does not seem
+   the right interface.  It is too heavy.
+
+Q: Why the system call approach?
+
+A: The poor user-space interface is the second biggest problem with dnotify.
+   Signals are a terrible, terrible interface for file notification.  Or for
+   anything, for that matter.  The ideal solution, from all perspectives, is a
+   file descriptor-based one that allows basic file I/O and poll/select.
+   Obtaining the fd and managing the watches could have been done either via a
+   device file or a family of new system calls.  We decided to implement a
+   family of system calls because that is the preferred approach for new kernel
+   features and it meets our user interface requirements.
+
+   Additionally, it _is_ possible to initialise more than one instance and
+   juggle more than one queue and thus more than one associated fd.
+
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 3db9a04aec6e..468500a7e894 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -291,3 +291,6 @@ ENTRY(sys_call_table)
 	.long sys_keyctl
 	.long sys_ioprio_set
 	.long sys_ioprio_get		/* 290 */
+	.long sys_inotify_init
+	.long sys_inotify_add_watch
+	.long sys_inotify_rm_watch
diff --git a/fs/Kconfig b/fs/Kconfig
index f93fd41b025d..5d0c4be43dba 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -359,6 +359,19 @@ config ROMFS_FS
 	  If you don't know whether you need it, then you don't need it:
 	  answer N.
 
+config INOTIFY
+	bool "Inotify file change notification support"
+	default y
+	---help---
+	  Say Y here to enable inotify support and the associated system
+	  calls.  Inotify is a file change notification system and a
+	  replacement for dnotify.  Inotify fixes numerous shortcomings in
+	  dnotify and introduces several new features.  It allows monitoring
+	  of both files and directories via a single open fd.  Multiple file
+	  events are supported.
+
+	  If unsure, say Y.
+
 config QUOTA
 	bool "Quota support"
 	help
diff --git a/fs/Makefile b/fs/Makefile
index 20edcf28bfd2..cf95eb894fd5 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -12,6 +12,7 @@ obj-y :=	open.o read_write.o file_table.o buffer.o  bio.o super.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
 		ioprio.o
 
+obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_COMPAT)		+= compat.o
 
diff --git a/fs/attr.c b/fs/attr.c
index c3c76fe78346..b1796fb9e524 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -10,7 +10,7 @@
 #include <linux/mm.h>
 #include <linux/string.h>
 #include <linux/smp_lock.h>
-#include <linux/dnotify.h>
+#include <linux/fsnotify.h>
 #include <linux/fcntl.h>
 #include <linux/quotaops.h>
 #include <linux/security.h>
@@ -107,31 +107,8 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
 out:
 	return error;
 }
-
 EXPORT_SYMBOL(inode_setattr);
 
-int setattr_mask(unsigned int ia_valid)
-{
-	unsigned long dn_mask = 0;
-
-	if (ia_valid & ATTR_UID)
-		dn_mask |= DN_ATTRIB;
-	if (ia_valid & ATTR_GID)
-		dn_mask |= DN_ATTRIB;
-	if (ia_valid & ATTR_SIZE)
-		dn_mask |= DN_MODIFY;
-	/* both times implies a utime(s) call */
-	if ((ia_valid & (ATTR_ATIME|ATTR_MTIME)) == (ATTR_ATIME|ATTR_MTIME))
-		dn_mask |= DN_ATTRIB;
-	else if (ia_valid & ATTR_ATIME)
-		dn_mask |= DN_ACCESS;
-	else if (ia_valid & ATTR_MTIME)
-		dn_mask |= DN_MODIFY;
-	if (ia_valid & ATTR_MODE)
-		dn_mask |= DN_ATTRIB;
-	return dn_mask;
-}
-
 int notify_change(struct dentry * dentry, struct iattr * attr)
 {
 	struct inode *inode = dentry->d_inode;
@@ -197,11 +174,9 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
 	if (ia_valid & ATTR_SIZE)
 		up_write(&dentry->d_inode->i_alloc_sem);
 
-	if (!error) {
-		unsigned long dn_mask = setattr_mask(ia_valid);
-		if (dn_mask)
-			dnotify_parent(dentry, dn_mask);
-	}
+	if (!error)
+		fsnotify_change(dentry, ia_valid);
+
 	return error;
 }
 
diff --git a/fs/compat.c b/fs/compat.c
index 728cd8365384..6b06b6bae35e 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -37,7 +37,7 @@
 #include <linux/ctype.h>
 #include <linux/module.h>
 #include <linux/dirent.h>
-#include <linux/dnotify.h>
+#include <linux/fsnotify.h>
 #include <linux/highuid.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
@@ -1307,9 +1307,13 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 out:
 	if (iov != iovstack)
 		kfree(iov);
-	if ((ret + (type == READ)) > 0)
-		dnotify_parent(file->f_dentry,
-				(type == READ) ? DN_ACCESS : DN_MODIFY);
+	if ((ret + (type == READ)) > 0) {
+		struct dentry *dentry = file->f_dentry;
+		if (type == READ)
+			fsnotify_access(dentry);
+		else
+			fsnotify_modify(dentry);
+	}
 	return ret;
 }
 
diff --git a/fs/file_table.c b/fs/file_table.c
index fa7849fae134..1d3de78e6bc9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -16,6 +16,7 @@
 #include <linux/eventpoll.h>
 #include <linux/mount.h>
 #include <linux/cdev.h>
+#include <linux/fsnotify.h>
 
 /* sysctl tunables... */
 struct files_stat_struct files_stat = {
@@ -126,6 +127,8 @@ void fastcall __fput(struct file *file)
 	struct inode *inode = dentry->d_inode;
 
 	might_sleep();
+
+	fsnotify_close(file);
 	/*
 	 * The function eventpoll_release() should be the first called
 	 * in the file cleanup chain.
diff --git a/fs/inode.c b/fs/inode.c
index 5bc97507eeaa..96364fae0844 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -21,6 +21,7 @@
 #include <linux/pagemap.h>
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
+#include <linux/inotify.h>
 
 /*
  * This is needed for the following functions:
@@ -202,6 +203,10 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
 	spin_lock_init(&inode->i_lock);
 	i_size_ordered_init(inode);
+#ifdef CONFIG_INOTIFY
+	INIT_LIST_HEAD(&inode->inotify_watches);
+	sema_init(&inode->inotify_sem, 1);
+#endif
 }
 
 EXPORT_SYMBOL(inode_init_once);
@@ -351,6 +356,7 @@ int invalidate_inodes(struct super_block * sb)
 
 	down(&iprune_sem);
 	spin_lock(&inode_lock);
+	inotify_unmount_inodes(&sb->s_inodes);
 	busy = invalidate_list(&sb->s_inodes, &throw_away);
 	spin_unlock(&inode_lock);
 
diff --git a/fs/inotify.c b/fs/inotify.c
new file mode 100644
index 000000000000..e423bfe0c86f
--- /dev/null
+++ b/fs/inotify.c
@@ -0,0 +1,999 @@
+/*
+ * fs/inotify.c - inode-based file event notifications
+ *
+ * Authors:
+ *	John McCutchan	<ttb@tentacle.dhs.org>
+ *	Robert Love	<rml@novell.com>
+ *
+ * Copyright (C) 2005 John McCutchan
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/writeback.h>
+#include <linux/inotify.h>
+
+#include <asm/ioctls.h>
+
+static atomic_t inotify_cookie;		/* see inotify_get_cookie() */
+
+static kmem_cache_t *watch_cachep;	/* struct inotify_watch */
+static kmem_cache_t *event_cachep;	/* struct inotify_kernel_event */
+
+static struct vfsmount *inotify_mnt;	/* inotifyfs, see inotify_init() */
+
+/* These are configurable via /proc/sys/inotify */
+int inotify_max_user_devices;	/* checked in sys_inotify_init() */
+int inotify_max_user_watches;	/* checked in create_watch() */
+int inotify_max_queued_events;	/* copied to dev->max_events at init */
+
+/*
+ * Lock ordering:
+ *
+ * dentry->d_lock (used to keep d_move() away from dentry->d_parent)
+ * iprune_sem (synchronize shrink_icache_memory())
+ * 	inode_lock (protects the super_block->s_inodes list)
+ * 	inode->inotify_sem (protects inode->inotify_watches and watches->i_list)
+ * 		inotify_dev->sem (protects inotify_device and watches->d_list)
+ */
+
+/*
+ * Lifetimes of the three main data structures--inotify_device, inode, and
+ * inotify_watch--are managed by reference count.
+ *
+ * inotify_device: Lifetime is from open until release.  Additional references
+ * can bump the count via get_inotify_dev() and drop the count via
+ * put_inotify_dev().
+ *
+ * inotify_watch: Lifetime is from create_watch() to remove_watch().
+ * Additional references can bump the count via get_inotify_watch() and drop
+ * the count via put_inotify_watch().
+ *
+ * inode: Pinned so long as the inode is associated with a watch, from
+ * create_watch() to put_inotify_watch().
+ */
+
+/*
+ * struct inotify_device - represents an open instance of an inotify device
+ *
+ * This structure is protected by the semaphore 'sem'.
+ */
+struct inotify_device {
+	wait_queue_head_t 	wq;		/* read()/poll() waiters */
+	struct idr		idr;		/* idr mapping wd -> watch */
+	struct semaphore	sem;		/* protects this structure */
+	struct list_head 	events;		/* queued events, oldest first */
+	struct list_head	watches;	/* list of watches (d_list) */
+	atomic_t		count;		/* reference count */
+	struct user_struct	*user;		/* user who opened this dev */
+	unsigned int		queue_size;	/* size of the queue (bytes) */
+	unsigned int		event_count;	/* number of pending events */
+	unsigned int		max_events;	/* maximum number of events */
+};
+
+/*
+ * struct inotify_kernel_event - An inotify event, originating from a watch and
+ * queued for user-space.  A list of these is attached to each instance of the
+ * device.  In read(), this list is walked from its head and events are
+ * returned until the next one no longer fits in the buffer.
+ *
+ * Protected by dev->sem of the device in which we are queued.
+ */
+struct inotify_kernel_event {
+	struct inotify_event	event;	/* the user-space event */
+	struct list_head        list;	/* entry in inotify_device's list */
+	char			*name;	/* padded filename, or NULL */
+};
+
+/*
+ * struct inotify_watch - represents a watch request on a specific inode
+ *
+ * d_list is protected by dev->sem of the associated watch->dev.
+ * i_list and mask are protected by inode->inotify_sem of the associated inode.
+ * dev, inode, and wd are never written to once the watch is created.
+ */
+struct inotify_watch {
+	struct list_head	d_list;	/* entry in inotify_device's list */
+	struct list_head	i_list;	/* entry in inode's list */
+	atomic_t		count;	/* reference count */
+	struct inotify_device	*dev;	/* associated device (referenced) */
+	struct inode		*inode;	/* associated inode (referenced) */
+	s32 			wd;	/* watch descriptor, the idr id */
+	u32			mask;	/* event mask for this watch */
+};
+
+static inline void get_inotify_dev(struct inotify_device *dev)
+{
+	atomic_inc(&dev->count);	/* undone by put_inotify_dev() */
+}
+
+static inline void put_inotify_dev(struct inotify_device *dev)
+{
+	if (!atomic_dec_and_test(&dev->count))
+		return;		/* still referenced */
+	atomic_dec(&dev->user->inotify_devs);
+	free_uid(dev->user);
+	kfree(dev);
+}
+
+static inline void get_inotify_watch(struct inotify_watch *watch)
+{
+	atomic_inc(&watch->count);	/* undone by put_inotify_watch() */
+}
+
+/*
+ * put_inotify_watch - drop one reference on a watch.  The final put releases
+ * the watch's references on its device and inode and frees the watch itself.
+ */
+static inline void put_inotify_watch(struct inotify_watch *watch)
+{
+	if (!atomic_dec_and_test(&watch->count))
+		return;		/* other holders remain */
+	put_inotify_dev(watch->dev);
+	iput(watch->inode);
+	kmem_cache_free(watch_cachep, watch);
+}
+
+/*
+ * kernel_event - allocate a kernel event and fill it in from the arguments
+ *
+ * Returns NULL if either allocation fails.  This function can sleep.
+ */
+static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
+						  const char *name)
+{
+	const size_t event_size = sizeof(struct inotify_event);
+	struct inotify_kernel_event *kevent;
+	size_t len, rem;
+
+	kevent = kmem_cache_alloc(event_cachep, GFP_KERNEL);
+	if (unlikely(!kevent))
+		return NULL;
+
+	/* the event is copied to user-space verbatim, so start from zeroes */
+	memset(&kevent->event, 0, event_size);
+	kevent->event.wd = wd;
+	kevent->event.mask = mask;
+	kevent->event.cookie = cookie;
+
+	INIT_LIST_HEAD(&kevent->list);
+
+	/* nameless event: nothing follows the fixed-size header */
+	if (!name) {
+		kevent->event.len = 0;
+		kevent->name = NULL;
+		return kevent;
+	}
+
+	/*
+	 * We need to pad the filename so as to properly align an array of
+	 * inotify_event structures.  The structure is small and the common
+	 * case is a short filename, so the name (including its terminating
+	 * NUL) is simply rounded up to the next multiple of the structure's
+	 * sizeof.  This is simple and safe for all architectures.
+	 */
+	len = strlen(name) + 1;
+	rem = len % event_size;
+	if (rem)
+		rem = event_size - rem;
+
+	kevent->name = kmalloc(len + rem, GFP_KERNEL);
+	if (unlikely(!kevent->name)) {
+		kmem_cache_free(event_cachep, kevent);
+		return NULL;
+	}
+
+	/* copy the name and its NUL, then zero the padding */
+	memcpy(kevent->name, name, len);
+	if (rem)
+		memset(kevent->name + len, 0, rem);
+	kevent->event.len = len + rem;
+
+	return kevent;
+}
+
+/*
+ * inotify_dev_get_event - return the oldest event in the given dev's queue
+ *
+ * Caller must hold dev->sem and make sure that the queue is not empty.
+ */
+static inline struct inotify_kernel_event *
+inotify_dev_get_event(struct inotify_device *dev)
+{
+	return list_entry(dev->events.next, struct inotify_kernel_event, list);
+}
+
+/*
+ * inotify_dev_queue_event - add a new event to the given device, unless it is
+ * a dupe of the newest queued event or the queue has already overflowed.
+ * Caller must hold dev->sem.  Can sleep (calls kernel_event()).
+ */
+static void inotify_dev_queue_event(struct inotify_device *dev,
+				    struct inotify_watch *watch, u32 mask,
+				    u32 cookie, const char *name)
+{
+	struct inotify_kernel_event *kevent, *last = NULL;
+
+	/* coalescing: drop this event if it is a dupe of the newest queued */
+	if (!list_empty(&dev->events))
+		last = list_entry(dev->events.prev,
+				  struct inotify_kernel_event, list);
+	if (last && last->event.mask == mask && last->event.wd == watch->wd &&
+			last->event.cookie == cookie) {
+		/* same watch, mask and cookie: a dupe if the names agree too */
+		if ((!name && !last->name) ||
+		    (name && last->name && !strcmp(last->name, name)))
+			return;
+	}
+
+	/* the queue overflowed and we already sent the Q_OVERFLOW event */
+	if (unlikely(dev->event_count > dev->max_events))
+		return;
+
+	/* if the queue overflows, we need to notify user space */
+	if (unlikely(dev->event_count == dev->max_events))
+		kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
+	else
+		kevent = kernel_event(watch->wd, mask, cookie, name);
+
+	if (unlikely(!kevent))
+		return;
+
+	/* queue the event and wake up anyone waiting */
+	dev->event_count++;
+	dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
+	list_add_tail(&kevent->list, &dev->events);
+	wake_up_interruptible(&dev->wq);
+}
+
+/*
+ * remove_kevent - unlink the given kevent from the device's queue, update the
+ * queue accounting and free both the event and its name.
+ * Caller must hold dev->sem.
+ */
+static void remove_kevent(struct inotify_device *dev,
+			  struct inotify_kernel_event *kevent)
+{
+	size_t bytes = sizeof(struct inotify_event) + kevent->event.len;
+
+	dev->queue_size -= bytes;
+	dev->event_count--;
+	list_del(&kevent->list);
+	kfree(kevent->name);
+	kmem_cache_free(event_cachep, kevent);
+}
+
+/*
+ * inotify_dev_event_dequeue - destroy the oldest event on the given device;
+ * does nothing if no event is queued.
+ * Caller must hold dev->sem.
+ */
+static void inotify_dev_event_dequeue(struct inotify_device *dev)
+{
+	if (list_empty(&dev->events))
+		return;
+
+	/* the head of the list is the event that was queued first */
+	remove_kevent(dev, inotify_dev_get_event(dev));
+}
+
+/*
+ * inotify_dev_get_wd - store the next WD for the given dev in watch->wd.
+ * Returns the result of idr_get_new(), or -ENOSPC if idr_pre_get() fails.
+ * Callers must hold dev->sem.  This function can sleep.
+ */
+static int inotify_dev_get_wd(struct inotify_device *dev,
+			      struct inotify_watch *watch)
+{
+	int err;
+
+	for (;;) {
+		if (unlikely(!idr_pre_get(&dev->idr, GFP_KERNEL)))
+			return -ENOSPC;
+		err = idr_get_new(&dev->idr, watch, &watch->wd);
+		if (err != -EAGAIN)
+			return err;
+	}
+}
+
+/*
+ * find_inode - resolve a user path to nd; the caller needs MAY_READ on it
+ */
+static int find_inode(const char __user *dirname, struct nameidata *nd)
+{
+	int err = __user_walk(dirname, LOOKUP_FOLLOW, nd);
+
+	if (err)
+		return err;
+
+	/* you can only watch an inode if you have read permissions on it */
+	err = permission(nd->dentry->d_inode, MAY_READ, NULL);
+	if (err)
+		path_release(nd);	/* undo the successful walk */
+	return err;
+}
+
+/*
+ * create_watch - creates a watch on the given device and returns it holding
+ * one reference, or returns an ERR_PTR().  The caller links it into the lists.
+ * Callers must hold dev->sem.  Calls inotify_dev_get_wd() so may sleep.
+ * Both 'dev' and 'inode' (by way of nameidata) need to be pinned.
+ */
+static struct inotify_watch *create_watch(struct inotify_device *dev,
+					  u32 mask, struct inode *inode)
+{
+	struct inotify_watch *watch;
+	int ret;
+
+	if (atomic_read(&dev->user->inotify_watches) >= inotify_max_user_watches)
+		return ERR_PTR(-ENOSPC);
+
+	watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
+	if (unlikely(!watch))
+		return ERR_PTR(-ENOMEM);
+
+	ret = inotify_dev_get_wd(dev, watch);
+	if (unlikely(ret)) {
+		kmem_cache_free(watch_cachep, watch);
+		return ERR_PTR(ret);
+	}
+
+	watch->mask = mask;
+	atomic_set(&watch->count, 0);	/* raised to one below */
+	INIT_LIST_HEAD(&watch->d_list);
+	INIT_LIST_HEAD(&watch->i_list);
+
+	/* save a reference to device and bump the count to make it official */
+	get_inotify_dev(dev);
+	watch->dev = dev;
+
+	/*
+	 * Save a reference to the inode and bump the ref count to make it
+	 * official.  We hold a reference to nameidata, which makes this safe.
+	 */
+	watch->inode = igrab(inode);
+
+	/* bump our own count, corresponding to our entry in dev->watches */
+	get_inotify_watch(watch);
+
+	atomic_inc(&dev->user->inotify_watches);
+
+	return watch;
+}
+
+/*
+ * inode_find_dev - find the watch associated with the given inode and dev;
+ * returns NULL if the device has no watch on the inode.
+ * Callers must hold inode->inotify_sem.
+ */
+static struct inotify_watch *inode_find_dev(struct inode *inode,
+					    struct inotify_device *dev)
+{
+	struct inotify_watch *pos, *found = NULL;
+
+	list_for_each_entry(pos, &inode->inotify_watches, i_list)
+		if (pos->dev == dev) {
+			found = pos;
+			break;
+		}
+	return found;
+}
+
+/*
+ * remove_watch_no_event - remove_watch() without the IN_IGNORED event.  Drops
+ * one watch reference; inode->inotify_sem and dev->sem guard the two lists.
+ */
+static void remove_watch_no_event(struct inotify_watch *watch,
+				  struct inotify_device *dev)
+{
+	list_del(&watch->i_list);
+	list_del(&watch->d_list);
+
+	atomic_dec(&dev->user->inotify_watches);
+	idr_remove(&dev->idr, watch->wd);
+	put_inotify_watch(watch);	/* may free the watch */
+}
+
+/*
+ * remove_watch - Remove a watch from both the device and the inode.  Sends
+ * the IN_IGNORED event to the given device signifying that the inode is no
+ * longer watched.
+ *
+ * Callers must hold both inode->inotify_sem and dev->sem.
+ *
+ * One reference on the watch is dropped before returning.  If that was the
+ * last one, put_inotify_watch() also releases the inode (iput()) and the
+ * device reference and frees the watch.  Nothing is returned.
+ */
+static void remove_watch(struct inotify_watch *watch,struct inotify_device *dev)
+{
+	inotify_dev_queue_event(dev, watch, IN_IGNORED, 0, NULL);
+	remove_watch_no_event(watch, dev);
+}
+
+/*
+ * inotify_inode_watched - nonzero if this inode has watches, zero otherwise.
+ * Used as a cheap lockless pre-check: we do not care if we race.
+ */
+static inline int inotify_inode_watched(struct inode *inode)
+{
+	return !list_empty(&inode->inotify_watches);
+}
+
+/* Kernel API */
+
+/**
+ * inotify_inode_queue_event - queue an event to all watches on this inode
+ * @inode: inode event is originating from
+ * @mask: event mask; watches whose own mask shares no bit with it are skipped
+ * @cookie: cookie for synchronization, or zero
+ * @name: filename, if any
+ */
+void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie,
+			       const char *name)
+{
+	struct inotify_watch *watch, *next;
+
+	if (!inotify_inode_watched(inode))
+		return;		/* lockless fast path */
+
+	down(&inode->inotify_sem);
+	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
+		u32 watch_mask = watch->mask;
+		if (watch_mask & mask) {
+			struct inotify_device *dev = watch->dev;
+			get_inotify_watch(watch);	/* held until the put */
+			down(&dev->sem);
+			inotify_dev_queue_event(dev, watch, mask, cookie, name);
+			if (watch_mask & IN_ONESHOT)
+				remove_watch_no_event(watch, dev);
+			up(&dev->sem);
+			put_inotify_watch(watch);
+		}
+	}
+	up(&inode->inotify_sem);
+}
+EXPORT_SYMBOL_GPL(inotify_inode_queue_event);
+
+/**
+ * inotify_dentry_parent_queue_event - queue an event to a dentry's parent
+ * @dentry: the dentry whose parent directory inode receives the event
+ * @mask: event mask describing this event
+ * @cookie: cookie for synchronization, or zero
+ * @name: filename, if any
+ */
+void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
+				       u32 cookie, const char *name)
+{
+	struct dentry *parent;
+	struct inode *inode;
+
+	spin_lock(&dentry->d_lock);
+	parent = dentry->d_parent;
+	inode = parent->d_inode;
+	if (!inotify_inode_watched(inode)) {
+		spin_unlock(&dentry->d_lock);
+		return;
+	}
+	dget(parent);		/* pinned across the unlock */
+	spin_unlock(&dentry->d_lock);
+	inotify_inode_queue_event(inode, mask, cookie, name);
+	dput(parent);
+}
+EXPORT_SYMBOL_GPL(inotify_dentry_parent_queue_event);
+
+/**
+ * inotify_get_cookie - return a unique cookie for use in synchronizing events.
+ */
+u32 inotify_get_cookie(void)
+{
+	return atomic_inc_return(&inotify_cookie);	/* first one is 1 */
+}
+EXPORT_SYMBOL_GPL(inotify_get_cookie);
+
+/**
+ * inotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
+ * @list: list of inodes being unmounted (sb->s_inodes)
+ *
+ * Called with inode_lock held, protecting the unmounting super block's list
+ * of inodes, and with iprune_sem held, keeping shrink_icache_memory() at bay.
+ * We temporarily drop inode_lock, however, and CAN block.
+ */
+void inotify_unmount_inodes(struct list_head *list)
+{
+	struct inode *inode, *next_i, *need_iput = NULL;
+
+	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
+		struct inotify_watch *watch, *next_w;
+		struct inode *need_iput_tmp;
+		struct list_head *watches;
+
+		/*
+		 * If i_count is zero, the inode cannot have any watches and
+		 * doing an __iget/iput with MS_ACTIVE clear would actually
+		 * evict all inodes with zero i_count from icache which is
+		 * unnecessarily violent and may in fact be illegal to do.
+		 */
+		if (!atomic_read(&inode->i_count))
+			continue;
+
+		/*
+		 * We cannot __iget() an inode in state I_CLEAR, I_FREEING, or
+		 * I_WILL_FREE which is fine because by that point the inode
+		 * cannot have any associated watches.
+		 */
+		if (inode->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))
+			continue;
+
+		need_iput_tmp = need_iput;
+		need_iput = NULL;
+		/* Pin inode, unless the previous pass already did as next_i. */
+		if (inode != need_iput_tmp)
+			__iget(inode);
+		else
+			need_iput_tmp = NULL;
+		/* In case the dropping of a reference would nuke next_i. */
+		if ((&next_i->i_sb_list != list) &&
+				atomic_read(&next_i->i_count) &&
+				!(next_i->i_state & (I_CLEAR | I_FREEING |
+					I_WILL_FREE))) {
+			__iget(next_i);
+			need_iput = next_i;
+		}
+
+		/*
+		 * We can safely drop inode_lock here because we hold
+		 * references on both inode and next_i.  Also no new inodes
+		 * will be added since the umount has begun.  Finally,
+		 * iprune_sem keeps shrink_icache_memory() away.
+		 */
+		spin_unlock(&inode_lock);
+
+		if (need_iput_tmp)
+			iput(need_iput_tmp);
+
+		/* send IN_UNMOUNT, then remove (which sends IN_IGNORED) */
+		down(&inode->inotify_sem);
+		watches = &inode->inotify_watches;
+		list_for_each_entry_safe(watch, next_w, watches, i_list) {
+			struct inotify_device *dev = watch->dev;
+			down(&dev->sem);
+			inotify_dev_queue_event(dev, watch, IN_UNMOUNT,0,NULL);
+			remove_watch(watch, dev);
+			up(&dev->sem);
+		}
+		up(&inode->inotify_sem);
+		iput(inode);		/* drop the pin on this inode */
+
+		spin_lock(&inode_lock);
+	}
+}
+EXPORT_SYMBOL_GPL(inotify_unmount_inodes);
+
+/**
+ * inotify_inode_is_dead - an inode has been deleted, cleanup any watches
+ * @inode: inode that is about to be removed
+ */
+void inotify_inode_is_dead(struct inode *inode)
+{
+	struct inotify_watch *watch, *next;
+
+	down(&inode->inotify_sem);
+	list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) {
+		struct inotify_device *dev = watch->dev;
+		down(&dev->sem);
+		remove_watch(watch, dev);	/* queues IN_IGNORED */
+		up(&dev->sem);
+	}
+	up(&inode->inotify_sem);
+}
+EXPORT_SYMBOL_GPL(inotify_inode_is_dead);
+
+/* Device Interface */
+
+static unsigned int inotify_poll(struct file *file, poll_table *wait)
+{
+	struct inotify_device *dev = file->private_data;
+	int readable;
+
+	poll_wait(file, &dev->wq, wait);
+
+	/* the instance is readable whenever at least one event is queued */
+	down(&dev->sem);
+	readable = !list_empty(&dev->events);
+	up(&dev->sem);
+	return readable ? POLLIN | POLLRDNORM : 0;
+}
+
+static ssize_t inotify_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *pos)
+{
+	size_t event_size = sizeof (struct inotify_event);
+	struct inotify_device *dev;
+	char __user *start;
+	int ret;
+	DEFINE_WAIT(wait);
+
+	start = buf;
+	dev = file->private_data;
+
+	while (1) {	/* wait until at least one event is queued */
+		int events;
+
+		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
+
+		down(&dev->sem);
+		events = !list_empty(&dev->events);
+		up(&dev->sem);
+		if (events) {
+			ret = 0;
+			break;
+		}
+
+		if (file->f_flags & O_NONBLOCK) {
+			ret = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			ret = -EINTR;
+			break;
+		}
+
+		schedule();
+	}
+
+	finish_wait(&dev->wq, &wait);
+	if (ret)
+		return ret;
+
+	down(&dev->sem);
+	while (1) {	/* copy out whole events, oldest first */
+		struct inotify_kernel_event *kevent;
+
+		ret = buf - start;	/* bytes copied so far */
+		if (list_empty(&dev->events))
+			break;
+
+		kevent = inotify_dev_get_event(dev);
+		if (event_size + kevent->event.len > count)
+			break;	/* no room left for this event */
+
+		if (copy_to_user(buf, &kevent->event, event_size)) {
+			ret = -EFAULT;
+			break;
+		}
+		buf += event_size;
+		count -= event_size;
+
+		if (kevent->name) {
+			if (copy_to_user(buf, kevent->name, kevent->event.len)){
+				ret = -EFAULT;
+				break;
+			}
+			buf += kevent->event.len;
+			count -= kevent->event.len;
+		}
+
+		remove_kevent(dev, kevent);	/* delivered: free it */
+	}
+	up(&dev->sem);
+
+	return ret;
+}
+
+static int inotify_release(struct inode *ignored, struct file *file)
+{
+	struct inotify_device *dev = file->private_data;
+
+	/*
+	 * Destroy all of the watches on this device.  Unfortunately, not very
+	 * pretty.  We cannot do a simple iteration over the list, because we
+	 * do not know the inode until we iterate to the watch.  But we need to
+	 * hold inode->inotify_sem before dev->sem.  The following works.
+	 */
+	while (1) {
+		struct inotify_watch *watch;
+		struct list_head *watches;
+		struct inode *inode;
+
+		down(&dev->sem);
+		watches = &dev->watches;
+		if (list_empty(watches)) {
+			up(&dev->sem);
+			break;
+		}
+		watch = list_entry(watches->next, struct inotify_watch, d_list);
+		get_inotify_watch(watch);	/* pin while dev->sem is dropped */
+		up(&dev->sem);
+
+		inode = watch->inode;
+		down(&inode->inotify_sem);
+		down(&dev->sem);
+		remove_watch_no_event(watch, dev);
+		up(&dev->sem);
+		up(&inode->inotify_sem);
+		put_inotify_watch(watch);	/* drop the pin taken above */
+	}
+
+	/* destroy all of the events on this device */
+	down(&dev->sem);
+	while (!list_empty(&dev->events))
+		inotify_dev_event_dequeue(dev);
+	up(&dev->sem);
+
+	/* free this device: the put matching the get in inotify_open() */
+	put_inotify_dev(dev);
+
+	return 0;
+}
+
+/*
+ * inotify_ignore - remove the watch named by the given wd from the device.
+ * Returns zero, or -EINVAL if the wd does not name a watch on the device.
+ * dev->sem is dropped and retaken because inode->inotify_sem must be taken
+ * first; the wd is then looked up again in case we raced.  Can sleep.
+ */
+static int inotify_ignore(struct inotify_device *dev, s32 wd)
+{
+	struct inotify_watch *watch;
+	struct inode *inode;
+
+	down(&dev->sem);
+	watch = idr_find(&dev->idr, wd);
+	if (unlikely(!watch)) {
+		up(&dev->sem);
+		return -EINVAL;
+	}
+	/* pin the watch, and through it the inode, while dev->sem is dropped */
+	get_inotify_watch(watch);
+	inode = watch->inode;
+	up(&dev->sem);
+
+	down(&inode->inotify_sem);
+	down(&dev->sem);
+
+	/* make sure that we did not race: the wd must still map to our watch */
+	if (likely(idr_find(&dev->idr, wd) == watch))
+		remove_watch(watch, dev);
+
+	up(&dev->sem);
+	up(&inode->inotify_sem);
+	put_inotify_watch(watch);
+
+	return 0;
+}
+
+/*
+ * inotify_ioctl - handle an ioctl on an inotify instance
+ *
+ * FIONREAD is the only supported command: it stores the number of bytes
+ * queued for reading through the int pointer in 'arg' and returns the result
+ * of put_user().  Any other command yields -ENOTTY.
+ */
+static long inotify_ioctl(struct file *file, unsigned int cmd,
+			  unsigned long arg)
+{
+	struct inotify_device *dev = file->private_data;
+	void __user *p = (void __user *) arg;
+
+	if (cmd != FIONREAD)
+		return -ENOTTY;
+
+	return put_user(dev->queue_size, (int __user *) p);
+}
+
+static struct file_operations inotify_fops = {
+	.poll           = inotify_poll,
+	.read           = inotify_read,
+	.release        = inotify_release,	/* drops watches and events */
+	.unlocked_ioctl = inotify_ioctl,
+	.compat_ioctl	= inotify_ioctl,	/* same handler as native */
+};
+
+/* sys_inotify_init - create an inotify instance; returns its fd or -errno */
+asmlinkage long sys_inotify_init(void)
+{
+	struct inotify_device *dev;
+	struct user_struct *user;
+	struct file *filp;
+	int ret, fd;
+
+	fd = get_unused_fd();
+	if (fd < 0) {
+		ret = fd;
+		goto out;
+	}
+
+	filp = get_empty_filp();
+	if (!filp) {
+		put_unused_fd(fd);
+		ret = -ENFILE;
+		goto out;
+	}
+
+	user = get_uid(current->user);
+	if (unlikely(atomic_read(&user->inotify_devs) >= inotify_max_user_devices)) {
+		ret = -EMFILE;
+		goto out_err;
+	}
+
+	dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
+	if (unlikely(!dev)) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	idr_init(&dev->idr);
+	INIT_LIST_HEAD(&dev->events);
+	INIT_LIST_HEAD(&dev->watches);
+	init_waitqueue_head(&dev->wq);
+	sema_init(&dev->sem, 1);
+	dev->event_count = 0;
+	dev->queue_size = 0;
+	dev->max_events = inotify_max_queued_events;
+	dev->user = user;
+	atomic_set(&dev->count, 0);
+
+	get_inotify_dev(dev);
+	atomic_inc(&user->inotify_devs);
+
+	/* take mnt/dentry refs last: put_filp() on error drops neither */
+	filp->f_op = &inotify_fops;
+	filp->f_vfsmnt = mntget(inotify_mnt);
+	filp->f_dentry = dget(inotify_mnt->mnt_root);
+	filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+	filp->f_mode = FMODE_READ;
+	filp->f_flags = O_RDONLY;
+	filp->private_data = dev;
+	fd_install(fd, filp);
+	return fd;
+out_err:
+	put_unused_fd(fd);
+	put_filp(filp);
+	free_uid(user);
+out:
+	return ret;
+}
+
+asmlinkage long sys_inotify_add_watch(int fd, const char *path, u32 mask)
+{
+	struct inotify_watch *watch, *old;
+	struct inode *inode;
+	struct inotify_device *dev;
+	struct nameidata nd;
+	struct file *filp;
+	int ret = -EINVAL;
+
+	filp = fget(fd);
+	if (!filp)
+		return -EBADF;
+
+	/* private_data is only an inotify_device on an inotify instance */
+	if (unlikely(filp->f_op != &inotify_fops))
+		goto fput_and_out;
+	dev = filp->private_data;
+
+	ret = find_inode((const char __user *)path, &nd);
+	if (ret)
+		goto fput_and_out;
+
+	inode = nd.dentry->d_inode;	/* held in place by reference in nd */
+
+	down(&inode->inotify_sem);
+	down(&dev->sem);
+
+	/* don't let user-space set invalid bits: we don't want flags set */
+	mask &= IN_ALL_EVENTS;
+	if (!mask) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* already watched by this dev: just update the mask, return the wd */
+	old = inode_find_dev(inode, dev);
+	if (unlikely(old)) {
+		old->mask = mask;
+		ret = old->wd;
+		goto out;
+	}
+
+	watch = create_watch(dev, mask, inode);
+	if (unlikely(IS_ERR(watch))) {
+		ret = PTR_ERR(watch);
+		goto out;
+	}
+
+	/* Add the watch to the device's and the inode's list */
+	list_add(&watch->d_list, &dev->watches);
+	list_add(&watch->i_list, &inode->inotify_watches);
+	ret = watch->wd;
+out:
+	up(&dev->sem);
+	up(&inode->inotify_sem);
+	/* drop nd last: it may be what keeps the inode, and its sem, alive */
+	path_release(&nd);
+fput_and_out:
+	fput(filp);
+	return ret;
+}
+
+asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+{
+	struct file *filp;
+	int ret = -EINVAL;
+
+	filp = fget(fd);
+	if (!filp)
+		return -EBADF;
+	/* private_data is only an inotify_device on an inotify instance */
+	if (likely(filp->f_op == &inotify_fops))
+		ret = inotify_ignore(filp->private_data, wd);
+	fput(filp);
+	return ret;
+}
+
+static struct super_block *inotify_get_sb(struct file_system_type *fs_type,
+		int flags, const char *dev_name, void *data)
+{
+	/* 'flags', 'dev_name' and 'data' are not used by this pseudo fs */
+	return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA);
+}
+
+static struct file_system_type inotify_fs_type = {
+    .name           = "inotifyfs",	/* mounted by inotify_init() */
+    .get_sb         = inotify_get_sb,
+    .kill_sb        = kill_anon_super,
+};
+
+/*
+ * inotify_init - Our initialization function.  Note that we cannot return
+ * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
+ * results in panic(), either explicitly below or by way of SLAB_PANIC.
+ */
+static int __init inotify_init(void)
+{
+	if (unlikely(register_filesystem(&inotify_fs_type)))
+		panic("inotify: register_filesystem failed!\n");
+	inotify_mnt = kern_mount(&inotify_fs_type);
+	if (IS_ERR(inotify_mnt))
+		panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
+
+	inotify_max_queued_events = 8192;
+	inotify_max_user_devices = 128;
+	inotify_max_user_watches = 8192;
+	atomic_set(&inotify_cookie, 0);
+
+	watch_cachep = kmem_cache_create("inotify_watch_cache",
+					 sizeof(struct inotify_watch),
+					 0, SLAB_PANIC, NULL, NULL);
+	event_cachep = kmem_cache_create("inotify_event_cache",
+					 sizeof(struct inotify_kernel_event),
+					 0, SLAB_PANIC, NULL, NULL);
+	printk(KERN_INFO "inotify syscall\n");
+	return 0;
+}
+
+module_init(inotify_init);
diff --git a/fs/namei.c b/fs/namei.c
index 1d93cb4f7c5f..02a824cd3c5c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -21,7 +21,7 @@
 #include <linux/namei.h>
 #include <linux/quotaops.h>
 #include <linux/pagemap.h>
-#include <linux/dnotify.h>
+#include <linux/fsnotify.h>
 #include <linux/smp_lock.h>
 #include <linux/personality.h>
 #include <linux/security.h>
@@ -1312,7 +1312,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	DQUOT_INIT(dir);
 	error = dir->i_op->create(dir, dentry, mode, nd);
 	if (!error) {
-		inode_dir_notify(dir, DN_CREATE);
+		fsnotify_create(dir, dentry->d_name.name);
 		security_inode_post_create(dir, dentry, mode);
 	}
 	return error;
@@ -1637,7 +1637,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 	DQUOT_INIT(dir);
 	error = dir->i_op->mknod(dir, dentry, mode, dev);
 	if (!error) {
-		inode_dir_notify(dir, DN_CREATE);
+		fsnotify_create(dir, dentry->d_name.name);
 		security_inode_post_mknod(dir, dentry, mode, dev);
 	}
 	return error;
@@ -1710,7 +1710,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	DQUOT_INIT(dir);
 	error = dir->i_op->mkdir(dir, dentry, mode);
 	if (!error) {
-		inode_dir_notify(dir, DN_CREATE);
+		fsnotify_mkdir(dir, dentry->d_name.name);
 		security_inode_post_mkdir(dir,dentry, mode);
 	}
 	return error;
@@ -1801,7 +1801,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 	}
 	up(&dentry->d_inode->i_sem);
 	if (!error) {
-		inode_dir_notify(dir, DN_DELETE);
+		fsnotify_rmdir(dentry, dentry->d_inode, dir);
 		d_delete(dentry);
 	}
 	dput(dentry);
@@ -1874,9 +1874,10 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 
 	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
 	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
+		fsnotify_unlink(dentry, dir);
 		d_delete(dentry);
-		inode_dir_notify(dir, DN_DELETE);
 	}
+
 	return error;
 }
 
@@ -1950,7 +1951,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname, i
 	DQUOT_INIT(dir);
 	error = dir->i_op->symlink(dir, dentry, oldname);
 	if (!error) {
-		inode_dir_notify(dir, DN_CREATE);
+		fsnotify_create(dir, dentry->d_name.name);
 		security_inode_post_symlink(dir, dentry, oldname);
 	}
 	return error;
@@ -2023,7 +2024,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	error = dir->i_op->link(old_dentry, dir, new_dentry);
 	up(&old_dentry->d_inode->i_sem);
 	if (!error) {
-		inode_dir_notify(dir, DN_CREATE);
+		fsnotify_create(dir, new_dentry->d_name.name);
 		security_inode_post_link(old_dentry, dir, new_dentry);
 	}
 	return error;
@@ -2187,6 +2188,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 {
 	int error;
 	int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
+	const char *old_name;
 
 	if (old_dentry->d_inode == new_dentry->d_inode)
  		return 0;
@@ -2208,18 +2210,18 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	DQUOT_INIT(old_dir);
 	DQUOT_INIT(new_dir);
 
+	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
+
 	if (is_dir)
 		error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
 	else
 		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
 	if (!error) {
-		if (old_dir == new_dir)
-			inode_dir_notify(old_dir, DN_RENAME);
-		else {
-			inode_dir_notify(old_dir, DN_DELETE);
-			inode_dir_notify(new_dir, DN_CREATE);
-		}
+		const char *new_name = old_dentry->d_name.name;
+		fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir);
 	}
+	fsnotify_oldname_free(old_name);
+
 	return error;
 }
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 5e0bf3917607..4f2cd3d27566 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -45,7 +45,7 @@
 #endif /* CONFIG_NFSD_V3 */
 #include <linux/nfsd/nfsfh.h>
 #include <linux/quotaops.h>
-#include <linux/dnotify.h>
+#include <linux/fsnotify.h>
 #include <linux/posix_acl.h>
 #include <linux/posix_acl_xattr.h>
 #ifdef CONFIG_NFSD_V4
@@ -860,7 +860,7 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 		nfsdstats.io_read += err;
 		*count = err;
 		err = 0;
-		dnotify_parent(file->f_dentry, DN_ACCESS);
+		fsnotify_access(file->f_dentry);
 	} else 
 		err = nfserrno(err);
 out:
@@ -916,7 +916,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 	set_fs(oldfs);
 	if (err >= 0) {
 		nfsdstats.io_write += cnt;
-		dnotify_parent(file->f_dentry, DN_MODIFY);
+		fsnotify_modify(file->f_dentry);
 	}
 
 	/* clear setuid/setgid flag after write */
diff --git a/fs/open.c b/fs/open.c
index 3f4a4286fdc4..32bf05e2996d 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -10,7 +10,7 @@
 #include <linux/file.h>
 #include <linux/smp_lock.h>
 #include <linux/quotaops.h>
-#include <linux/dnotify.h>
+#include <linux/fsnotify.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/tty.h>
@@ -951,6 +951,7 @@ asmlinkage long sys_open(const char __user * filename, int flags, int mode)
 				put_unused_fd(fd);
 				fd = PTR_ERR(f);
 			} else {
+				fsnotify_open(f->f_dentry);
 				fd_install(fd, f);
 			}
 		}
diff --git a/fs/read_write.c b/fs/read_write.c
index 9292f5fa4d62..563abd09b5c8 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -10,7 +10,7 @@
 #include <linux/file.h>
 #include <linux/uio.h>
 #include <linux/smp_lock.h>
-#include <linux/dnotify.h>
+#include <linux/fsnotify.h>
 #include <linux/security.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
@@ -252,7 +252,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
 			else
 				ret = do_sync_read(file, buf, count, pos);
 			if (ret > 0) {
-				dnotify_parent(file->f_dentry, DN_ACCESS);
+				fsnotify_access(file->f_dentry);
 				current->rchar += ret;
 			}
 			current->syscr++;
@@ -303,7 +303,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
 			else
 				ret = do_sync_write(file, buf, count, pos);
 			if (ret > 0) {
-				dnotify_parent(file->f_dentry, DN_MODIFY);
+				fsnotify_modify(file->f_dentry);
 				current->wchar += ret;
 			}
 			current->syscw++;
@@ -539,9 +539,12 @@ static ssize_t do_readv_writev(int type, struct file *file,
 out:
 	if (iov != iovstack)
 		kfree(iov);
-	if ((ret + (type == READ)) > 0)
-		dnotify_parent(file->f_dentry,
-				(type == READ) ? DN_ACCESS : DN_MODIFY);
+	if ((ret + (type == READ)) > 0) {
+		if (type == READ)
+			fsnotify_access(file->f_dentry);
+		else
+			fsnotify_modify(file->f_dentry);
+	}
 	return ret;
 Efault:
 	ret = -EFAULT;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index d72c1ce48559..335288b9be0f 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -3,7 +3,7 @@
  */
 
 #include <linux/module.h>
-#include <linux/dnotify.h>
+#include <linux/fsnotify.h>
 #include <linux/kobject.h>
 #include <linux/namei.h>
 #include <asm/uaccess.h>
@@ -391,9 +391,6 @@ int sysfs_create_file(struct kobject * kobj, const struct attribute * attr)
  * sysfs_update_file - update the modified timestamp on an object attribute.
  * @kobj: object we're acting for.
  * @attr: attribute descriptor.
- *
- * Also call dnotify for the dentry, which lots of userspace programs
- * use.
  */
 int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
 {
@@ -408,7 +405,7 @@ int sysfs_update_file(struct kobject * kobj, const struct attribute * attr)
 		if (victim->d_inode && 
 		    (victim->d_parent->d_inode == dir->d_inode)) {
 			victim->d_inode->i_mtime = CURRENT_TIME;
-			dnotify_parent(victim, DN_MODIFY);
+			fsnotify_modify(victim);
 
 			/**
 			 * Drop reference from initial sysfs_get_dentry().
diff --git a/fs/xattr.c b/fs/xattr.c
index 93dee70a1dbe..6acd5c63da91 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -16,6 +16,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/module.h>
+#include <linux/fsnotify.h>
 #include <asm/uaccess.h>
 
 /*
@@ -57,8 +58,10 @@ setxattr(struct dentry *d, char __user *name, void __user *value,
 		if (error)
 			goto out;
 		error = d->d_inode->i_op->setxattr(d, kname, kvalue, size, flags);
-		if (!error)
+		if (!error) {
+			fsnotify_xattr(d);
 			security_inode_post_setxattr(d, kname, kvalue, size, flags);
+		}
 out:
 		up(&d->d_inode->i_sem);
 	}
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index e25e4c71a879..a7cb377745bf 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -296,8 +296,11 @@
 #define __NR_keyctl		288
 #define __NR_ioprio_set		289
 #define __NR_ioprio_get		290
+#define __NR_inotify_init	291
+#define __NR_inotify_add_watch	292
+#define __NR_inotify_rm_watch	293
 
-#define NR_syscalls 291
+#define NR_syscalls 294
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 302ec20838ca..c9bf3746a9fb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -474,6 +474,11 @@ struct inode {
 	struct dnotify_struct	*i_dnotify; /* for directory notifications */
 #endif
 
+#ifdef CONFIG_INOTIFY
+	struct list_head	inotify_watches; /* watches on this inode */
+	struct semaphore	inotify_sem;	/* protects the watches list */
+#endif
+
 	unsigned long		i_state;
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
 
@@ -1393,7 +1398,6 @@ extern void emergency_remount(void);
 extern int do_remount_sb(struct super_block *sb, int flags,
 			 void *data, int force);
 extern sector_t bmap(struct inode *, sector_t);
-extern int setattr_mask(unsigned int);
 extern int notify_change(struct dentry *, struct iattr *);
 extern int permission(struct inode *, int, struct nameidata *);
 extern int generic_permission(struct inode *, int,
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
new file mode 100644
index 000000000000..eb581b6cfca9
--- /dev/null
+++ b/include/linux/fsnotify.h
@@ -0,0 +1,248 @@
+#ifndef _LINUX_FS_NOTIFY_H
+#define _LINUX_FS_NOTIFY_H
+
+/*
+ * include/linux/fsnotify.h - generic hooks for filesystem notification, to
+ * reduce in-source duplication from both dnotify and inotify.
+ *
+ * We don't compile any of this away in some complicated menagerie of ifdefs.
+ * Instead, we rely on the code inside to optimize away as needed.
+ *
+ * (C) Copyright 2005 Robert Love
+ */
+
+#ifdef __KERNEL__
+
+#include <linux/dnotify.h>
+#include <linux/inotify.h>
+
+/*
+ * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
+ */
+static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
+				 const char *old_name, const char *new_name,
+				 int isdir)
+{
+	u32 cookie = inotify_get_cookie();
+
+	if (old_dir == new_dir)
+		inode_dir_notify(old_dir, DN_RENAME);
+	else {
+		inode_dir_notify(old_dir, DN_DELETE);
+		inode_dir_notify(new_dir, DN_CREATE);
+	}
+
+	if (isdir)
+		isdir = IN_ISDIR;
+	inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir,cookie,old_name);
+	inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, cookie, new_name);
+}
+
+/*
+ * fsnotify_unlink - file was unlinked
+ */
+static inline void fsnotify_unlink(struct dentry *dentry, struct inode *dir)
+{
+	struct inode *inode = dentry->d_inode;
+
+	inode_dir_notify(dir, DN_DELETE);
+	inotify_inode_queue_event(dir, IN_DELETE, 0, dentry->d_name.name);
+	inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL);
+
+	inotify_inode_is_dead(inode);
+}
+
+/*
+ * fsnotify_rmdir - directory was removed
+ */
+static inline void fsnotify_rmdir(struct dentry *dentry, struct inode *inode,
+				  struct inode *dir)
+{
+	inode_dir_notify(dir, DN_DELETE);
+	inotify_inode_queue_event(dir,IN_DELETE|IN_ISDIR,0,dentry->d_name.name);
+	inotify_inode_queue_event(inode, IN_DELETE_SELF | IN_ISDIR, 0, NULL);
+	inotify_inode_is_dead(inode);
+}
+
+/*
+ * fsnotify_create - 'name' was linked in
+ */
+static inline void fsnotify_create(struct inode *inode, const char *name)
+{
+	inode_dir_notify(inode, DN_CREATE);
+	inotify_inode_queue_event(inode, IN_CREATE, 0, name);
+}
+
+/*
+ * fsnotify_mkdir - directory 'name' was created
+ */
+static inline void fsnotify_mkdir(struct inode *inode, const char *name)
+{
+	inode_dir_notify(inode, DN_CREATE);
+	inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, name);
+}
+
+/*
+ * fsnotify_access - file was read
+ */
+static inline void fsnotify_access(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	u32 mask = IN_ACCESS;
+
+	if (S_ISDIR(inode->i_mode))
+		mask |= IN_ISDIR;
+
+	dnotify_parent(dentry, DN_ACCESS);
+	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
+	inotify_inode_queue_event(inode, mask, 0, NULL);
+}
+
+/*
+ * fsnotify_modify - file was modified
+ */
+static inline void fsnotify_modify(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	u32 mask = IN_MODIFY;
+
+	if (S_ISDIR(inode->i_mode))
+		mask |= IN_ISDIR;
+
+	dnotify_parent(dentry, DN_MODIFY);
+	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
+	inotify_inode_queue_event(inode, mask, 0, NULL);
+}
+
+/*
+ * fsnotify_open - file was opened
+ */
+static inline void fsnotify_open(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	u32 mask = IN_OPEN;
+
+	if (S_ISDIR(inode->i_mode))
+		mask |= IN_ISDIR;
+
+	inotify_inode_queue_event(inode, mask, 0, NULL);
+	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
+}
+
+/*
+ * fsnotify_close - file was closed
+ */
+static inline void fsnotify_close(struct file *file)
+{
+	struct dentry *dentry = file->f_dentry;
+	struct inode *inode = dentry->d_inode;
+	const char *name = dentry->d_name.name;
+	mode_t mode = file->f_mode;
+	u32 mask = (mode & FMODE_WRITE) ? IN_CLOSE_WRITE : IN_CLOSE_NOWRITE;
+
+	if (S_ISDIR(inode->i_mode))
+		mask |= IN_ISDIR;
+
+	inotify_dentry_parent_queue_event(dentry, mask, 0, name);
+	inotify_inode_queue_event(inode, mask, 0, NULL);
+}
+
+/*
+ * fsnotify_xattr - extended attributes were changed
+ */
+static inline void fsnotify_xattr(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	u32 mask = IN_ATTRIB;
+
+	if (S_ISDIR(inode->i_mode))
+		mask |= IN_ISDIR;
+
+	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
+	inotify_inode_queue_event(inode, mask, 0, NULL);
+}
+
+/*
+ * fsnotify_change - notify_change event.  file was modified and/or metadata
+ * was changed.
+ */
+static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
+{
+	struct inode *inode = dentry->d_inode;
+	int dn_mask = 0;
+	u32 in_mask = 0;
+
+	if (ia_valid & ATTR_UID) {
+		in_mask |= IN_ATTRIB;
+		dn_mask |= DN_ATTRIB;
+	}
+	if (ia_valid & ATTR_GID) {
+		in_mask |= IN_ATTRIB;
+		dn_mask |= DN_ATTRIB;
+	}
+	if (ia_valid & ATTR_SIZE) {
+		in_mask |= IN_MODIFY;
+		dn_mask |= DN_MODIFY;
+	}
+	/* both times implies a utime(s) call */
+	if ((ia_valid & (ATTR_ATIME | ATTR_MTIME)) == (ATTR_ATIME | ATTR_MTIME))
+	{
+		in_mask |= IN_ATTRIB;
+		dn_mask |= DN_ATTRIB;
+	} else if (ia_valid & ATTR_ATIME) {
+		in_mask |= IN_ACCESS;
+		dn_mask |= DN_ACCESS;
+	} else if (ia_valid & ATTR_MTIME) {
+		in_mask |= IN_MODIFY;
+		dn_mask |= DN_MODIFY;
+	}
+	if (ia_valid & ATTR_MODE) {
+		in_mask |= IN_ATTRIB;
+		dn_mask |= DN_ATTRIB;
+	}
+
+	if (dn_mask)
+		dnotify_parent(dentry, dn_mask);
+	if (in_mask) {
+		if (S_ISDIR(inode->i_mode))
+			in_mask |= IN_ISDIR;
+		inotify_inode_queue_event(inode, in_mask, 0, NULL);
+		inotify_dentry_parent_queue_event(dentry, in_mask, 0,
+						  dentry->d_name.name);
+	}
+}
+
+#ifdef CONFIG_INOTIFY	/* inotify helpers */
+
+/*
+ * fsnotify_oldname_init - save off the old filename before we change it
+ */
+static inline const char *fsnotify_oldname_init(const char *name)
+{
+	return kstrdup(name, GFP_KERNEL);
+}
+
+/*
+ * fsnotify_oldname_free - free the name we got from fsnotify_oldname_init
+ */
+static inline void fsnotify_oldname_free(const char *old_name)
+{
+	kfree(old_name);
+}
+
+#else	/* CONFIG_INOTIFY */
+
+static inline const char *fsnotify_oldname_init(const char *name)
+{
+	return NULL;
+}
+
+static inline void fsnotify_oldname_free(const char *old_name)
+{
+}
+
+#endif	/* ! CONFIG_INOTIFY */
+
+#endif	/* __KERNEL__ */
+
+#endif	/* _LINUX_FS_NOTIFY_H */
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
new file mode 100644
index 000000000000..a40c2bf0408e
--- /dev/null
+++ b/include/linux/inotify.h
@@ -0,0 +1,108 @@
+/*
+ * Inode based directory notification for Linux
+ *
+ * Copyright (C) 2005 John McCutchan
+ */
+
+#ifndef _LINUX_INOTIFY_H
+#define _LINUX_INOTIFY_H
+
+#include <linux/types.h>
+
+/*
+ * struct inotify_event - structure read from the inotify device for each event
+ *
+ * When you are watching a directory, you will receive the filename for events
+ * such as IN_CREATE, IN_DELETE, IN_OPEN, IN_CLOSE, ..., relative to the wd.
+ */
+struct inotify_event {
+	__s32		wd;		/* watch descriptor */
+	__u32		mask;		/* watch mask */
+	__u32		cookie;		/* cookie to synchronize two events */
+	__u32		len;		/* length (including nulls) of name */
+	char		name[0];	/* stub for possible name */
+};
+
+/* the following are legal, implemented events that user-space can watch for */
+#define IN_ACCESS		0x00000001	/* File was accessed */
+#define IN_MODIFY		0x00000002	/* File was modified */
+#define IN_ATTRIB		0x00000004	/* Metadata changed */
+#define IN_CLOSE_WRITE		0x00000008	/* Writtable file was closed */
+#define IN_CLOSE_NOWRITE	0x00000010	/* Unwrittable file closed */
+#define IN_OPEN			0x00000020	/* File was opened */
+#define IN_MOVED_FROM		0x00000040	/* File was moved from X */
+#define IN_MOVED_TO		0x00000080	/* File was moved to Y */
+#define IN_CREATE		0x00000100	/* Subfile was created */
+#define IN_DELETE		0x00000200	/* Subfile was deleted */
+#define IN_DELETE_SELF		0x00000400	/* Self was deleted */
+
+/* the following are legal events.  they are sent as needed to any watch */
+#define IN_UNMOUNT		0x00002000	/* Backing fs was unmounted */
+#define IN_Q_OVERFLOW		0x00004000	/* Event queued overflowed */
+#define IN_IGNORED		0x00008000	/* File was ignored */
+
+/* helper events */
+#define IN_CLOSE		(IN_CLOSE_WRITE | IN_CLOSE_NOWRITE) /* close */
+#define IN_MOVE			(IN_MOVED_FROM | IN_MOVED_TO) /* moves */
+
+/* special flags */
+#define IN_ISDIR		0x40000000	/* event occurred against dir */
+#define IN_ONESHOT		0x80000000	/* only send event once */
+
+/*
+ * All of the events - we build the list by hand so that we can add flags in
+ * the future and not break backward compatibility.  Apps will get only the
+ * events that they originally wanted.  Be sure to add new events here!
+ */
+#define IN_ALL_EVENTS	(IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \
+			 IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | \
+			 IN_MOVED_TO | IN_DELETE | IN_CREATE | IN_DELETE_SELF)
+
+#ifdef __KERNEL__
+
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/config.h>
+
+#ifdef CONFIG_INOTIFY
+
+extern void inotify_inode_queue_event(struct inode *, __u32, __u32,
+				      const char *);
+extern void inotify_dentry_parent_queue_event(struct dentry *, __u32, __u32,
+					      const char *);
+extern void inotify_unmount_inodes(struct list_head *);
+extern void inotify_inode_is_dead(struct inode *);
+extern u32 inotify_get_cookie(void);
+
+#else
+
+static inline void inotify_inode_queue_event(struct inode *inode,
+					     __u32 mask, __u32 cookie,
+					     const char *filename)
+{
+}
+
+static inline void inotify_dentry_parent_queue_event(struct dentry *dentry,
+						     __u32 mask, __u32 cookie,
+						     const char *filename)
+{
+}
+
+static inline void inotify_unmount_inodes(struct list_head *list)
+{
+}
+
+static inline void inotify_inode_is_dead(struct inode *inode)
+{
+}
+
+static inline u32 inotify_get_cookie(void)
+{
+	return 0;
+}
+
+#endif	/* CONFIG_INOTIFY */
+
+#endif	/* __KERNEL __ */
+
+#endif	/* _LINUX_INOTIFY_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ff48815bd3a2..dec5827c7742 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -410,6 +410,10 @@ struct user_struct {
 	atomic_t processes;	/* How many processes does this user have? */
 	atomic_t files;		/* How many open files does this user have? */
 	atomic_t sigpending;	/* How many pending signals does this user have? */
+#ifdef CONFIG_INOTIFY
+	atomic_t inotify_watches; /* How many inotify watches does this user have? */
+	atomic_t inotify_devs;	/* How many inotify devs does this user have opened? */
+#endif
 	/* protected by mq_lock	*/
 	unsigned long mq_bytes;	/* How many bytes can be allocated to mqueue? */
 	unsigned long locked_shm; /* How many pages of mlocked shm ? */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 5b5f434ac9a0..ce19a2aa0b21 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -61,7 +61,8 @@ enum
 	CTL_DEV=7,		/* Devices */
 	CTL_BUS=8,		/* Busses */
 	CTL_ABI=9,		/* Binary emulation */
-	CTL_CPU=10		/* CPU stuff (speed scaling, etc) */
+	CTL_CPU=10,		/* CPU stuff (speed scaling, etc) */
+	CTL_INOTIFY=11		/* Inotify */
 };
 
 /* CTL_BUS names: */
@@ -70,6 +71,14 @@ enum
 	CTL_BUS_ISA=1		/* ISA */
 };
 
+/* CTL_INOTIFY names: */
+enum
+{
+	INOTIFY_MAX_USER_DEVICES=1,	/* max number of inotify device instances per user */
+	INOTIFY_MAX_USER_WATCHES=2,	/* max number of inotify watches per user */
+	INOTIFY_MAX_QUEUED_EVENTS=3	/* Max number of queued events per inotify device instance */
+};
+
 /* CTL_KERN names: */
 enum
 {
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 29196ce9b40f..42b40ae5eada 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -80,6 +80,9 @@ cond_syscall(sys_keyctl);
 cond_syscall(compat_sys_keyctl);
 cond_syscall(compat_sys_socketcall);
 cond_syscall(sys_set_zone_reclaim);
+cond_syscall(sys_inotify_init);
+cond_syscall(sys_inotify_add_watch);
+cond_syscall(sys_inotify_rm_watch);
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 270ee7fadbd8..b240e2cb86fc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -67,6 +67,12 @@ extern int printk_ratelimit_jiffies;
 extern int printk_ratelimit_burst;
 extern int pid_max_min, pid_max_max;
 
+#ifdef CONFIG_INOTIFY
+extern int inotify_max_user_devices;
+extern int inotify_max_user_watches;
+extern int inotify_max_queued_events;
+#endif
+
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 int unknown_nmi_panic;
 extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
@@ -218,6 +224,7 @@ static ctl_table root_table[] = {
 		.mode		= 0555,
 		.child		= dev_table,
 	},
+
 	{ .ctl_name = 0 }
 };
 
@@ -959,6 +966,40 @@ static ctl_table fs_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#ifdef CONFIG_INOTIFY
+	{
+		.ctl_name	= INOTIFY_MAX_USER_DEVICES,
+		.procname	= "max_user_devices",
+		.data		= &inotify_max_user_devices,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+
+	{
+		.ctl_name	= INOTIFY_MAX_USER_WATCHES,
+		.procname	= "max_user_watches",
+		.data		= &inotify_max_user_watches,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero, 
+	},
+
+	{
+		.ctl_name	= INOTIFY_MAX_QUEUED_EVENTS,
+		.procname	= "max_queued_events",
+		.data		= &inotify_max_queued_events,
+		.maxlen		= sizeof(int),
+		.mode		= 0644, 
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec, 
+		.extra1		= &zero
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 
@@ -968,7 +1009,7 @@ static ctl_table debug_table[] = {
 
 static ctl_table dev_table[] = {
 	{ .ctl_name = 0 }
-};  
+};
 
 extern void init_irq_proc (void);
 
diff --git a/kernel/user.c b/kernel/user.c
index 734575d55769..89e562feb1b1 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -120,6 +120,10 @@ struct user_struct * alloc_uid(uid_t uid)
 		atomic_set(&new->processes, 0);
 		atomic_set(&new->files, 0);
 		atomic_set(&new->sigpending, 0);
+#ifdef CONFIG_INOTIFY
+		atomic_set(&new->inotify_watches, 0);
+		atomic_set(&new->inotify_devs, 0);
+#endif
 
 		new->mq_bytes = 0;
 		new->locked_shm = 0;
-- 
cgit v1.2.3-59-g8ed1b


From 0399cb08c54708db231d616f106f64d920e0b723 Mon Sep 17 00:00:00 2001
From: Robert Love <rml@novell.com>
Date: Wed, 13 Jul 2005 12:38:18 -0400
Subject: [PATCH] inotify: move sysctl

This moves the inotify sysctl knobs to "/proc/sys/fs/inotify" from
"/proc/sys/fs".  Also some related cleanup.

Signed-off-by: Robert Love <rml@novell.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inotify.c           | 49 ++++++++++++++++++++++++++++++++++++++++++++----
 include/linux/sysctl.h | 12 ++++++------
 kernel/sysctl.c        | 51 +++++++++++---------------------------------------
 3 files changed, 62 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inotify.c b/fs/inotify.c
index e423bfe0c86f..fb4803131423 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -45,8 +45,8 @@ static kmem_cache_t *event_cachep;
 
 static struct vfsmount *inotify_mnt;
 
-/* These are configurable via /proc/sys/inotify */
-int inotify_max_user_devices;
+/* these are configurable via /proc/sys/fs/inotify/ */
+int inotify_max_user_instances;
 int inotify_max_user_watches;
 int inotify_max_queued_events;
 
@@ -125,6 +125,47 @@ struct inotify_watch {
 	u32			mask;	/* event mask for this watch */
 };
 
+#ifdef CONFIG_SYSCTL
+
+#include <linux/sysctl.h>
+
+static int zero;
+
+ctl_table inotify_table[] = {
+	{
+		.ctl_name	= INOTIFY_MAX_USER_INSTANCES,
+		.procname	= "max_user_instances",
+		.data		= &inotify_max_user_instances,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= INOTIFY_MAX_USER_WATCHES,
+		.procname	= "max_user_watches",
+		.data		= &inotify_max_user_watches,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero, 
+	},
+	{
+		.ctl_name	= INOTIFY_MAX_QUEUED_EVENTS,
+		.procname	= "max_queued_events",
+		.data		= &inotify_max_queued_events,
+		.maxlen		= sizeof(int),
+		.mode		= 0644, 
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec, 
+		.extra1		= &zero
+	},
+	{ .ctl_name = 0 }
+};
+#endif /* CONFIG_SYSCTL */
+
 static inline void get_inotify_dev(struct inotify_device *dev)
 {
 	atomic_inc(&dev->count);
@@ -842,7 +883,7 @@ asmlinkage long sys_inotify_init(void)
 
 	user = get_uid(current->user);
 
-	if (unlikely(atomic_read(&user->inotify_devs) >= inotify_max_user_devices)) {
+	if (unlikely(atomic_read(&user->inotify_devs) >= inotify_max_user_instances)) {
 		ret = -EMFILE;
 		goto out_err;
 	}
@@ -979,7 +1020,7 @@ static int __init inotify_init(void)
 	inotify_mnt = kern_mount(&inotify_fs_type);
 
 	inotify_max_queued_events = 8192;
-	inotify_max_user_devices = 128;
+	inotify_max_user_instances = 8;
 	inotify_max_user_watches = 8192;
 
 	atomic_set(&inotify_cookie, 0);
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index ce19a2aa0b21..bfbbe94b297d 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -61,8 +61,7 @@ enum
 	CTL_DEV=7,		/* Devices */
 	CTL_BUS=8,		/* Busses */
 	CTL_ABI=9,		/* Binary emulation */
-	CTL_CPU=10,		/* CPU stuff (speed scaling, etc) */
-	CTL_INOTIFY=11		/* Inotify */
+	CTL_CPU=10		/* CPU stuff (speed scaling, etc) */
 };
 
 /* CTL_BUS names: */
@@ -71,12 +70,12 @@ enum
 	CTL_BUS_ISA=1		/* ISA */
 };
 
-/* CTL_INOTIFY names: */
+/* /proc/sys/fs/inotify/ */
 enum
 {
-	INOTIFY_MAX_USER_DEVICES=1,	/* max number of inotify device instances per user */
-	INOTIFY_MAX_USER_WATCHES=2,	/* max number of inotify watches per user */
-	INOTIFY_MAX_QUEUED_EVENTS=3	/* Max number of queued events per inotify device instance */
+	INOTIFY_MAX_USER_INSTANCES=1,	/* max instances per user */
+	INOTIFY_MAX_USER_WATCHES=2,	/* max watches per user */
+	INOTIFY_MAX_QUEUED_EVENTS=3	/* max queued events per instance */
 };
 
 /* CTL_KERN names: */
@@ -685,6 +684,7 @@ enum
 	FS_XFS=17,	/* struct: control xfs parameters */
 	FS_AIO_NR=18,	/* current system-wide number of aio requests */
 	FS_AIO_MAX_NR=19,	/* system-wide maximum number of aio requests */
+	FS_INOTIFY=20,	/* inotify submenu */
 };
 
 /* /proc/sys/fs/quota/ */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b240e2cb86fc..e60b9c36f1f0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -67,12 +67,6 @@ extern int printk_ratelimit_jiffies;
 extern int printk_ratelimit_burst;
 extern int pid_max_min, pid_max_max;
 
-#ifdef CONFIG_INOTIFY
-extern int inotify_max_user_devices;
-extern int inotify_max_user_watches;
-extern int inotify_max_queued_events;
-#endif
-
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 int unknown_nmi_panic;
 extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
@@ -152,6 +146,9 @@ extern ctl_table random_table[];
 #ifdef CONFIG_UNIX98_PTYS
 extern ctl_table pty_table[];
 #endif
+#ifdef CONFIG_INOTIFY
+extern ctl_table inotify_table[];
+#endif
 
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
 int sysctl_legacy_va_layout;
@@ -957,6 +954,14 @@ static ctl_table fs_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#ifdef CONFIG_INOTIFY
+	{
+		.ctl_name	= FS_INOTIFY,
+		.procname	= "inotify",
+		.mode		= 0555,
+		.child		= inotify_table,
+	},
+#endif	
 #endif
 	{
 		.ctl_name	= KERN_SETUID_DUMPABLE,
@@ -966,40 +971,6 @@ static ctl_table fs_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-#ifdef CONFIG_INOTIFY
-	{
-		.ctl_name	= INOTIFY_MAX_USER_DEVICES,
-		.procname	= "max_user_devices",
-		.data		= &inotify_max_user_devices,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
-	},
-
-	{
-		.ctl_name	= INOTIFY_MAX_USER_WATCHES,
-		.procname	= "max_user_watches",
-		.data		= &inotify_max_user_watches,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero, 
-	},
-
-	{
-		.ctl_name	= INOTIFY_MAX_QUEUED_EVENTS,
-		.procname	= "max_queued_events",
-		.data		= &inotify_max_queued_events,
-		.maxlen		= sizeof(int),
-		.mode		= 0644, 
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec, 
-		.extra1		= &zero
-	},
-#endif
 	{ .ctl_name = 0 }
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 5995f16b4a464c8a57de7c9d5ddf4758dbacad41 Mon Sep 17 00:00:00 2001
From: Robert Love <rml@novell.com>
Date: Wed, 13 Jul 2005 12:38:39 -0400
Subject: [PATCH] inotify: event ordering

This rearranges the event ordering for "open" to be consistent with the
ordering of the other events.

Signed-off-by: Robert Love <rml@novell.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/fsnotify.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index eb581b6cfca9..d07a92c94776 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -125,8 +125,8 @@ static inline void fsnotify_open(struct dentry *dentry)
 	if (S_ISDIR(inode->i_mode))
 		mask |= IN_ISDIR;
 
-	inotify_inode_queue_event(inode, mask, 0, NULL);
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
+	inotify_inode_queue_event(inode, mask, 0, NULL);	
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 88bd5121d635136e01369141367f315665534b3c Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cam.ac.uk>
Date: Wed, 13 Jul 2005 01:10:44 -0700
Subject: [PATCH] Fix soft lockup due to NTFS: VFS part and explanation

Something has changed in the core kernel such that we now get concurrent
inode write outs, one e.g via pdflush and one via sys_sync or whatever.
This causes a nasty deadlock in ntfs.  The only clean solution
unfortunately requires a minor vfs api extension.

First the deadlock analysis:

Prerequisite knowledge: NTFS has a file $MFT (inode 0) loaded at mount
time.  The NTFS driver uses the page cache for storing the file contents as
usual.  More interestingly this file contains the table of on-disk inodes
as a sequence of MFT_RECORDs.  Thus NTFS driver accesses the on-disk inodes
by accessing the MFT_RECORDs in the page cache pages of the loaded inode
$MFT.

The situation: VFS inode X on a mounted ntfs volume is dirty.  For the same
inode X, the ntfs_inode is dirty and thus so is the corresponding on-disk
inode, which, as explained above, is in a dirty PAGE_CACHE_PAGE belonging to
the table of inodes ($MFT, inode 0).

What happens:

Process 1: sys_sync()/umount()/whatever...  calls __sync_single_inode() for
$MFT -> do_writepages() -> write_page for the dirty page containing the
on-disk inode X, the page is now locked -> ntfs_write_mst_block() which
clears PageUptodate() on the page to prevent anyone else getting hold of it
whilst it does the write out (this is necessary as the on-disk inode needs
"fixups" applied before the write to disk which are removed again after the
write and PageUptodate is then set again).  It then analyses the page
looking for dirty on-disk inodes and when it finds one it calls
ntfs_may_write_mft_record() to see if it is safe to write this on-disk
inode.  This then calls ilookup5() to check if the corresponding VFS inode
is in the icache.  This in turn calls ifind() which waits on the inode lock
via wait_on_inode whilst holding the global inode_lock.

Process 2: pdflush results in a call to __sync_single_inode for the same
VFS inode X on the ntfs volume.  This locks the inode (I_LOCK) then calls
write-inode -> ntfs_write_inode -> map_mft_record() -> read_cache_page() of
the page (in page cache of table of inodes $MFT, inode 0) containing the
on-disk inode.  This page has PageUptodate() clear because of Process 1
(see above) so read_cache_page() blocks when it tries to take the page lock
for the page so it can call ntfs_readpage().

Thus Process 1 is holding the page lock on the page containing the on-disk
inode X and it is waiting on the inode X to be unlocked in ifind() so it
can write the page out and then unlock the page.

And Process 2 is holding the inode lock on inode X and is waiting for the
page to be unlocked so it can call ntfs_readpage() or discover that
Process 1 set PageUptodate() again and use the page.

Thus we have a deadlock due to ifind() waiting on the inode lock.

The only sensible solution: NTFS does not care whether the VFS inode is
locked or not when it calls ilookup5() (it doesn't use the VFS inode at
all, it just uses it to find the corresponding ntfs_inode which is of
course attached to the VFS inode (both are one single struct); and it uses
the ntfs_inode which is subject to its own locking so I_LOCK is irrelevant)
hence we want a modified ilookup5_nowait() which is the same as ilookup5()
but it does not wait on the inode lock.

Without such functionality I would have to keep my own ntfs_inode cache in
the NTFS driver just so I can find ntfs_inodes independent of their VFS
inodes which would be slow, memory and cpu cycle wasting, and incredibly
stupid given the icache already exists in the VFS.

Below is a patch that does the ilookup5_nowait() implementation in
fs/inode.c and exports it.

ilookup5_nowait.diff:

Introduce ilookup5_nowait() which is basically the same as ilookup5() but
it does not wait on the inode's lock (i.e. it omits the wait_on_inode()
done in ifind()).

This is needed to avoid a nasty deadlock in NTFS.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/inode.c         | 45 +++++++++++++++++++++++++++++++++++++++------
 include/linux/fs.h |  3 +++
 2 files changed, 42 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 96364fae0844..e57f1724db3e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -757,6 +757,7 @@ EXPORT_SYMBOL(igrab);
  * @head:       the head of the list to search
  * @test:	callback used for comparisons between inodes
  * @data:	opaque data pointer to pass to @test
+ * @wait:	if true wait for the inode to be unlocked, if false do not
  *
  * ifind() searches for the inode specified by @data in the inode
  * cache. This is a generalized version of ifind_fast() for file systems where
@@ -771,7 +772,7 @@ EXPORT_SYMBOL(igrab);
  */
 static inline struct inode *ifind(struct super_block *sb,
 		struct hlist_head *head, int (*test)(struct inode *, void *),
-		void *data)
+		void *data, const int wait)
 {
 	struct inode *inode;
 
@@ -780,7 +781,8 @@ static inline struct inode *ifind(struct super_block *sb,
 	if (inode) {
 		__iget(inode);
 		spin_unlock(&inode_lock);
-		wait_on_inode(inode);
+		if (likely(wait))
+			wait_on_inode(inode);
 		return inode;
 	}
 	spin_unlock(&inode_lock);
@@ -820,7 +822,7 @@ static inline struct inode *ifind_fast(struct super_block *sb,
 }
 
 /**
- * ilookup5 - search for an inode in the inode cache
+ * ilookup5_nowait - search for an inode in the inode cache
  * @sb:		super block of file system to search
  * @hashval:	hash value (usually inode number) to search for
  * @test:	callback used for comparisons between inodes
@@ -832,7 +834,38 @@ static inline struct inode *ifind_fast(struct super_block *sb,
  * identification of an inode.
  *
  * If the inode is in the cache, the inode is returned with an incremented
- * reference count.
+ * reference count.  Note, the inode lock is not waited upon so you have to be
+ * very careful what you do with the returned inode.  You probably should be
+ * using ilookup5() instead.
+ *
+ * Otherwise NULL is returned.
+ *
+ * Note, @test is called with the inode_lock held, so can't sleep.
+ */
+struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
+		int (*test)(struct inode *, void *), void *data)
+{
+	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+
+	return ifind(sb, head, test, data, 0);
+}
+
+EXPORT_SYMBOL(ilookup5_nowait);
+
+/**
+ * ilookup5 - search for an inode in the inode cache
+ * @sb:		super block of file system to search
+ * @hashval:	hash value (usually inode number) to search for
+ * @test:	callback used for comparisons between inodes
+ * @data:	opaque data pointer to pass to @test
+ *
+ * ilookup5() uses ifind() to search for the inode specified by @hashval and
+ * @data in the inode cache. This is a generalized version of ilookup() for
+ * file systems where the inode number is not sufficient for unique
+ * identification of an inode.
+ *
+ * If the inode is in the cache, the inode lock is waited upon and the inode is
+ * returned with an incremented reference count.
  *
  * Otherwise NULL is returned.
  *
@@ -843,7 +876,7 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
 {
 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
 
-	return ifind(sb, head, test, data);
+	return ifind(sb, head, test, data, 1);
 }
 
 EXPORT_SYMBOL(ilookup5);
@@ -900,7 +933,7 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
 	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
 	struct inode *inode;
 
-	inode = ifind(sb, head, test, data);
+	inode = ifind(sb, head, test, data, 1);
 	if (inode)
 		return inode;
 	/*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c9bf3746a9fb..0f53e0124941 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1441,6 +1441,9 @@ extern int inode_needs_sync(struct inode *inode);
 extern void generic_delete_inode(struct inode *inode);
 extern void generic_drop_inode(struct inode *inode);
 
+extern struct inode *ilookup5_nowait(struct super_block *sb,
+		unsigned long hashval, int (*test)(struct inode *, void *),
+		void *data);
 extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
 		int (*test)(struct inode *, void *), void *data);
 extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
-- 
cgit v1.2.3-59-g8ed1b


From 068e1b94bbd268f375349f68531829c8b7c210bc Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Wed, 13 Jul 2005 01:10:46 -0700
Subject: [PATCH] s390: fadvise hint values.

Add special case for the POSIX_FADV_DONTNEED and POSIX_FADV_NOREUSE hint
values for s390-64.  The user space values in the s390-64 glibc headers for
these two defines have always been 6 and 7 instead of 4 and 5.  All 64 bit
applications therefore use the "wrong" values.  To get these applications
working without recompiling, the kernel needs to accept the "wrong" values.
Since the values for s390-31 are 4 and 5 the compat wrapper for fadvise64
and fadvise64_64 need to rewrite the values for 31 bit system calls.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/kernel/compat_linux.c   | 38 ++++++++++++++++++++++++++++++++++++++
 arch/s390/kernel/compat_wrapper.S |  4 ++--
 include/linux/fadvise.h           | 10 ++++++++++
 3 files changed, 50 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index 614056222875..18610cea03a2 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -58,6 +58,7 @@
 #include <linux/compat.h>
 #include <linux/vfs.h>
 #include <linux/ptrace.h>
+#include <linux/fadvise.h>
 
 #include <asm/types.h>
 #include <asm/ipc.h>
@@ -1043,3 +1044,40 @@ sys32_timer_create(clockid_t which_clock, struct compat_sigevent *se32,
 
 	return ret;
 }
+
+/*
+ * 31 bit emulation wrapper functions for sys_fadvise64/fadvise64_64.
+ * These need to rewrite the advise values for POSIX_FADV_{DONTNEED,NOREUSE}
+ * because the 31 bit values differ from the 64 bit values.
+ */
+
+asmlinkage long
+sys32_fadvise64(int fd, loff_t offset, size_t len, int advise)
+{
+	if (advise == 4)
+		advise = POSIX_FADV_DONTNEED;
+	else if (advise == 5)
+		advise = POSIX_FADV_NOREUSE;
+	return sys_fadvise64(fd, offset, len, advise);
+}
+
+struct fadvise64_64_args {
+	int fd;
+	long long offset;
+	long long len;
+	int advice;
+};
+
+asmlinkage long
+sys32_fadvise64_64(struct fadvise64_64_args __user *args)
+{
+	struct fadvise64_64_args a;
+
+	if ( copy_from_user(&a, args, sizeof(a)) )
+		return -EFAULT;
+	if (a.advice == 4)
+		a.advice = POSIX_FADV_DONTNEED;
+	else if (a.advice == 5)
+		a.advice = POSIX_FADV_NOREUSE;
+	return sys_fadvise64_64(a.fd, a.offset, a.len, a.advice);
+}
diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S
index bf529739c8ab..799a98eac92d 100644
--- a/arch/s390/kernel/compat_wrapper.S
+++ b/arch/s390/kernel/compat_wrapper.S
@@ -1251,12 +1251,12 @@ sys32_fadvise64_wrapper:
 	or	%r3,%r4			# get low word of 64bit loff_t
 	llgfr	%r4,%r5			# size_t (unsigned long)
 	lgfr	%r5,%r6			# int
-	jg	sys_fadvise64
+	jg	sys32_fadvise64
 
 	.globl	sys32_fadvise64_64_wrapper
 sys32_fadvise64_64_wrapper:
 	llgtr	%r2,%r2			# struct fadvise64_64_args *
-	jg	s390_fadvise64_64
+	jg	sys32_fadvise64_64
 
 	.globl	sys32_clock_settime_wrapper
 sys32_clock_settime_wrapper:
diff --git a/include/linux/fadvise.h b/include/linux/fadvise.h
index 6fc656dfb93d..e8e747139b9a 100644
--- a/include/linux/fadvise.h
+++ b/include/linux/fadvise.h
@@ -5,7 +5,17 @@
 #define POSIX_FADV_RANDOM	1 /* Expect random page references.  */
 #define POSIX_FADV_SEQUENTIAL	2 /* Expect sequential page references.  */
 #define POSIX_FADV_WILLNEED	3 /* Will need these pages.  */
+
+/*
+ * The advise values for POSIX_FADV_DONTNEED and POSIX_ADV_NOREUSE
+ * for s390-64 differ from the values for the rest of the world.
+ */
+#if defined(__s390x__)
+#define POSIX_FADV_DONTNEED	6 /* Don't need these pages.  */
+#define POSIX_FADV_NOREUSE	7 /* Data will be accessed once.  */
+#else
 #define POSIX_FADV_DONTNEED	4 /* Don't need these pages.  */
 #define POSIX_FADV_NOREUSE	5 /* Data will be accessed once.  */
+#endif
 
 #endif	/* FADVISE_H_INCLUDED */
-- 
cgit v1.2.3-59-g8ed1b


From ac96202ba096f8fc1ccaf45a2f159a52639ece29 Mon Sep 17 00:00:00 2001
From: Andrew Vasquez <andrew.vasquez@qlogic.com>
Date: Wed, 6 Jul 2005 10:30:16 -0700
Subject: [SCSI] qla2xxx: Add pci ids for new ISP types.

Add pci ids for new ISP types.

Move old definitions in local qla_def.h file to pci_ids.h as
well.

Signed-off-by: Andrew Vasquez <andrew.vasquez@qlogic.com>
Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
---
 drivers/scsi/qla2xxx/qla_def.h | 21 ---------------------
 include/linux/pci_ids.h        |  9 +++++++++
 2 files changed, 9 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h
index 24e22dc2fe57..13901c24b9a0 100644
--- a/drivers/scsi/qla2xxx/qla_def.h
+++ b/drivers/scsi/qla2xxx/qla_def.h
@@ -41,27 +41,6 @@
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_cmnd.h>
 
-/* XXX(hch): move to pci_ids.h */
-#ifndef PCI_DEVICE_ID_QLOGIC_ISP2300
-#define PCI_DEVICE_ID_QLOGIC_ISP2300	0x2300
-#endif
-
-#ifndef PCI_DEVICE_ID_QLOGIC_ISP2312
-#define PCI_DEVICE_ID_QLOGIC_ISP2312	0x2312
-#endif
-
-#ifndef PCI_DEVICE_ID_QLOGIC_ISP2322
-#define PCI_DEVICE_ID_QLOGIC_ISP2322	0x2322
-#endif
-
-#ifndef PCI_DEVICE_ID_QLOGIC_ISP6312
-#define PCI_DEVICE_ID_QLOGIC_ISP6312	0x6312
-#endif
-
-#ifndef PCI_DEVICE_ID_QLOGIC_ISP6322
-#define PCI_DEVICE_ID_QLOGIC_ISP6322	0x6322
-#endif
-
 #if defined(CONFIG_SCSI_QLA21XX) || defined(CONFIG_SCSI_QLA21XX_MODULE)
 #define IS_QLA2100(ha)	((ha)->pdev->device == PCI_DEVICE_ID_QLOGIC_ISP2100)
 #else
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index bf608808a60c..f4c2c393797d 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -908,6 +908,15 @@
 #define PCI_DEVICE_ID_QLOGIC_ISP1022	0x1022
 #define PCI_DEVICE_ID_QLOGIC_ISP2100	0x2100
 #define PCI_DEVICE_ID_QLOGIC_ISP2200	0x2200
+#define PCI_DEVICE_ID_QLOGIC_ISP2300	0x2300
+#define PCI_DEVICE_ID_QLOGIC_ISP2312	0x2312
+#define PCI_DEVICE_ID_QLOGIC_ISP2322	0x2322
+#define PCI_DEVICE_ID_QLOGIC_ISP6312	0x6312
+#define PCI_DEVICE_ID_QLOGIC_ISP6322	0x6322
+#define PCI_DEVICE_ID_QLOGIC_ISP2422	0x2422
+#define PCI_DEVICE_ID_QLOGIC_ISP2432	0x2432
+#define PCI_DEVICE_ID_QLOGIC_ISP2512	0x2512
+#define PCI_DEVICE_ID_QLOGIC_ISP2522	0x2522
 
 #define PCI_VENDOR_ID_CYRIX		0x1078
 #define PCI_DEVICE_ID_CYRIX_5510	0x0000
-- 
cgit v1.2.3-59-g8ed1b


From 6a806c510de490318846b53bbfec463d02ca274b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Fri, 15 Jul 2005 03:56:35 -0700
Subject: [PATCH] md/raid1: clear bitmap when fullsync completes

We need to be careful differentiating between a resync of a complete array,
in which we can clear the bitmap, and a resync of a degraded array, in
which we cannot.

This patch cleans all that up.

Cc: Paul Clements <paul.clements@steeleye.com>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/bitmap.c         |  9 ++++++---
 drivers/md/raid1.c          | 37 ++++++++++++++++++-------------------
 include/linux/raid/bitmap.h |  2 +-
 3 files changed, 25 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 95980ad6b27b..0c2ed99a3832 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1345,7 +1345,8 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
 	}
 }
 
-int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks)
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks,
+			int degraded)
 {
 	bitmap_counter_t *bmc;
 	int rv;
@@ -1362,8 +1363,10 @@ int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks)
 			rv = 1;
 		else if (NEEDED(*bmc)) {
 			rv = 1;
-			*bmc |= RESYNC_MASK;
-			*bmc &= ~NEEDED_MASK;
+			if (!degraded) { /* don't set/clear bits if degraded */
+				*bmc |= RESYNC_MASK;
+				*bmc &= ~NEEDED_MASK;
+			}
 		}
 	}
 	spin_unlock_irq(&bitmap->lock);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index ff1dbec864af..5f253ee536bb 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1126,21 +1126,19 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 		 * only be one in raid1 resync.
 		 * We can find the current addess in mddev->curr_resync
 		 */
-		if (!conf->fullsync) {
-			if (mddev->curr_resync < max_sector)
-				bitmap_end_sync(mddev->bitmap,
-						mddev->curr_resync,
+		if (mddev->curr_resync < max_sector) /* aborted */
+			bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
 						&sync_blocks, 1);
-			bitmap_close_sync(mddev->bitmap);
-		}
-		if (mddev->curr_resync >= max_sector)
+		else /* completed sync */
 			conf->fullsync = 0;
+
+		bitmap_close_sync(mddev->bitmap);
 		close_sync(conf);
 		return 0;
 	}
 
-	if (!conf->fullsync &&
-	    !bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks)) {
+	if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, mddev->degraded) &&
+	    !conf->fullsync) {
 		/* We can skip this block, and probably several more */
 		*skipped = 1;
 		return sync_blocks;
@@ -1243,15 +1241,15 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 			len = (max_sector - sector_nr) << 9;
 		if (len == 0)
 			break;
-		if (!conf->fullsync) {
-			if (sync_blocks == 0) {
-				if (!bitmap_start_sync(mddev->bitmap,
-						       sector_nr, &sync_blocks))
-					break;
-				if (sync_blocks < (PAGE_SIZE>>9))
-					BUG();
-				if (len > (sync_blocks<<9)) len = sync_blocks<<9;
-			}
+		if (sync_blocks == 0) {
+			if (!bitmap_start_sync(mddev->bitmap, sector_nr,
+					&sync_blocks, mddev->degraded) &&
+					!conf->fullsync)
+				break;
+			if (sync_blocks < (PAGE_SIZE>>9))
+				BUG();
+			if (len > (sync_blocks<<9))
+				len = sync_blocks<<9;
 		}
 
 		for (i=0 ; i < conf->raid_disks; i++) {
@@ -1264,7 +1262,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
 					while (i > 0) {
 						i--;
 						bio = r1_bio->bios[i];
-						if (bio->bi_end_io==NULL) continue;
+						if (bio->bi_end_io==NULL)
+							continue;
 						/* remove last page from this bio */
 						bio->bi_vcnt--;
 						bio->bi_size -= len;
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index e24b74b11150..6213e976eade 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -262,7 +262,7 @@ void bitmap_write_all(struct bitmap *bitmap);
 int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors);
 void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors,
 		     int success);
-int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks);
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int degraded);
 void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted);
 void bitmap_close_sync(struct bitmap *bitmap);
 
-- 
cgit v1.2.3-59-g8ed1b


From 661f83a67c2e360d5a4d2406cc28379c909f94bf Mon Sep 17 00:00:00 2001
From: Russell King <rmk@dyn-67.arm.linux.org.uk>
Date: Sat, 16 Jul 2005 09:30:53 +0100
Subject: [PATCH] Serial: Move deprecation of register_serial forward to
 September

I think it's about time to make the build a little more vocal about the
expiry of these functions.  Due to recent discussions with problems in
the console initialisation vs power manglement, I'd like to move the
date forward to September.

Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 Documentation/feature-removal-schedule.txt | 4 ++--
 include/linux/serial.h                     | 6 ++++--
 include/linux/serial_core.h                | 5 +++--
 3 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 12dde43fe657..8b1430b46655 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -103,11 +103,11 @@ Who:	Jody McIntyre <scjody@steamballoon.com>
 ---------------------------
 
 What:	register_serial/unregister_serial
-When:	December 2005
+When:	September 2005
 Why:	This interface does not allow serial ports to be registered against
 	a struct device, and as such does not allow correct power management
 	of such ports.  8250-based ports should use serial8250_register_port
-	and serial8250_unregister_port instead.
+	and serial8250_unregister_port, or platform devices instead.
 Who:	Russell King <rmk@arm.linux.org.uk>
 
 ---------------------------
diff --git a/include/linux/serial.h b/include/linux/serial.h
index 00145822fb74..9f2d85284d0b 100644
--- a/include/linux/serial.h
+++ b/include/linux/serial.h
@@ -174,9 +174,11 @@ struct serial_icounter_struct {
 
 
 #ifdef __KERNEL__
+#include <linux/compiler.h>
+
 /* Export to allow PCMCIA to use this - Dave Hinds */
-extern int register_serial(struct serial_struct *req);
-extern void unregister_serial(int line);
+extern int __deprecated register_serial(struct serial_struct *req);
+extern void __deprecated unregister_serial(int line);
 
 /* Allow architectures to override entries in serial8250_ports[] at run time: */
 struct uart_port;	/* forward declaration */
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index d6025af7efac..30b64f3534f4 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -122,6 +122,7 @@
 #ifdef __KERNEL__
 
 #include <linux/config.h>
+#include <linux/compiler.h>
 #include <linux/interrupt.h>
 #include <linux/circ_buf.h>
 #include <linux/spinlock.h>
@@ -359,8 +360,8 @@ struct tty_driver *uart_console_device(struct console *co, int *index);
  */
 int uart_register_driver(struct uart_driver *uart);
 void uart_unregister_driver(struct uart_driver *uart);
-void uart_unregister_port(struct uart_driver *reg, int line);
-int uart_register_port(struct uart_driver *reg, struct uart_port *port);
+void __deprecated uart_unregister_port(struct uart_driver *reg, int line);
+int __deprecated uart_register_port(struct uart_driver *reg, struct uart_port *port);
 int uart_add_one_port(struct uart_driver *reg, struct uart_port *port);
 int uart_remove_one_port(struct uart_driver *reg, struct uart_port *port);
 int uart_match_port(struct uart_port *port1, struct uart_port *port2);
-- 
cgit v1.2.3-59-g8ed1b


From 6d283d271674b1127881ebf082266a2c3fe6e0e4 Mon Sep 17 00:00:00 2001
From: Olaf Hering <olh@suse.de>
Date: Sat, 16 Jul 2005 09:59:00 +0100
Subject: [PATCH] Serial: Remove linux/version.h

changing CONFIG_LOCALVERSION rebuilds too much, for no apparent reason.

Signed-off-by: Olaf Hering <olh@suse.de>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/serial/crisv10.c | 2 --
 drivers/serial/icom.c    | 1 -
 drivers/serial/jsm/jsm.h | 1 -
 include/linux/serialP.h  | 1 -
 4 files changed, 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/serial/crisv10.c b/drivers/serial/crisv10.c
index 3da5494953af..23b8871e74cc 100644
--- a/drivers/serial/crisv10.c
+++ b/drivers/serial/crisv10.c
@@ -426,8 +426,6 @@
 static char *serial_version = "$Revision: 1.25 $";
 
 #include <linux/config.h>
-#include <linux/version.h>
-
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/signal.h>
diff --git a/drivers/serial/icom.c b/drivers/serial/icom.c
index 546a0bc77e1e..c112b32764e8 100644
--- a/drivers/serial/icom.c
+++ b/drivers/serial/icom.c
@@ -25,7 +25,6 @@
 #define SERIAL_DO_RESTART
 #include <linux/module.h>
 #include <linux/config.h>
-#include <linux/version.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/signal.h>
diff --git a/drivers/serial/jsm/jsm.h b/drivers/serial/jsm/jsm.h
index 777829fa3300..5bf3c45521f4 100644
--- a/drivers/serial/jsm/jsm.h
+++ b/drivers/serial/jsm/jsm.h
@@ -28,7 +28,6 @@
 #define __JSM_DRIVER_H
 
 #include <linux/kernel.h>
-#include <linux/version.h>
 #include <linux/types.h>	/* To pick up the varions Linux types */
 #include <linux/tty.h>
 #include <linux/serial_core.h>
diff --git a/include/linux/serialP.h b/include/linux/serialP.h
index 2307f11d8a6b..2b2f35a64d75 100644
--- a/include/linux/serialP.h
+++ b/include/linux/serialP.h
@@ -19,7 +19,6 @@
  * For definitions of the flags field, see tty.h
  */
 
-#include <linux/version.h>
 #include <linux/config.h>
 #include <linux/termios.h>
 #include <linux/workqueue.h>
-- 
cgit v1.2.3-59-g8ed1b


From fbc0dc0df54be06586d712ebf6958816e3b1b2b7 Mon Sep 17 00:00:00 2001
From: Andrey Panin <pazke@donpac.ru>
Date: Mon, 18 Jul 2005 11:38:09 +0100
Subject: [PATCH] Serial: Add support for SIIG Quartet serial card

Add support for SIIG Quartet Serial card.  This card has Oxford
Semiconductor 16954 quad UART which is clocked by a 10x faster
(18.432 MHz) quartz.

Signed-off-by: Andrey Panin <pazke@donpac.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/serial/8250_pci.c | 14 ++++++++++++++
 include/linux/pci_ids.h   |  2 ++
 2 files changed, 16 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c
index de54bdc5398b..c3f55f5a38de 100644
--- a/drivers/serial/8250_pci.c
+++ b/drivers/serial/8250_pci.c
@@ -389,6 +389,9 @@ static void __devexit sbs_exit(struct pci_dev *dev)
  *     - 10x cards have control registers in IO and/or memory space;
  *     - 20x cards have control registers in standard PCI configuration space.
  *
+ * There are also Quartet Serial cards which use Oxford Semiconductor
+ * 16954 quad UART PCI chip clocked by 18.432 MHz quartz.
+ *
  * Note: some SIIG cards are probed by the parport_serial object.
  */
 
@@ -1026,6 +1029,8 @@ enum pci_board_num_t {
 	pbn_b0_2_921600,
 	pbn_b0_4_921600,
 
+	pbn_b0_4_1152000,
+
 	pbn_b0_bt_1_115200,
 	pbn_b0_bt_2_115200,
 	pbn_b0_bt_8_115200,
@@ -1158,6 +1163,12 @@ static struct pci_board pci_boards[] __devinitdata = {
 		.base_baud	= 921600,
 		.uart_offset	= 8,
 	},
+	[pbn_b0_4_1152000] = {
+		.flags		= FL_BASE0,
+		.num_ports	= 4,
+		.base_baud	= 1152000,
+		.uart_offset	= 8,
+	},
 
 	[pbn_b0_bt_1_115200] = {
 		.flags		= FL_BASE0|FL_BASE_BARS,
@@ -1977,6 +1988,9 @@ static struct pci_device_id serial_pci_tbl[] = {
 	{	PCI_VENDOR_ID_SPECIALIX, PCI_DEVICE_ID_OXSEMI_16PCI954,
 		PCI_VENDOR_ID_SPECIALIX, PCI_SUBDEVICE_ID_SPECIALIX_SPEED4, 0, 0,
 		pbn_b0_4_921600 },
+	{	PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_16PCI954,
+		PCI_SUBVENDOR_ID_SIIG, PCI_SUBDEVICE_ID_SIIG_QUARTET_SERIAL, 0, 0,
+		pbn_b0_4_1152000 },
 	{	PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_16PCI954,
 		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
 		pbn_b0_4_115200 },
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 27348c22dacb..9a28b312eeb4 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1872,6 +1872,7 @@
 #define PCI_DEVICE_ID_CBOARDS_DAS1602_16 0x0001
 
 #define PCI_VENDOR_ID_SIIG		0x131f
+#define PCI_SUBVENDOR_ID_SIIG		0x131f
 #define PCI_DEVICE_ID_SIIG_1S_10x_550	0x1000
 #define PCI_DEVICE_ID_SIIG_1S_10x_650	0x1001
 #define PCI_DEVICE_ID_SIIG_1S_10x_850	0x1002
@@ -1909,6 +1910,7 @@
 #define PCI_DEVICE_ID_SIIG_2S1P_20x_550	0x2060
 #define PCI_DEVICE_ID_SIIG_2S1P_20x_650	0x2061
 #define PCI_DEVICE_ID_SIIG_2S1P_20x_850	0x2062
+#define PCI_SUBDEVICE_ID_SIIG_QUARTET_SERIAL	0x2050
 
 #define PCI_VENDOR_ID_RADISYS		0x1331
 #define PCI_DEVICE_ID_RADISYS_ENP2611	0x0030
-- 
cgit v1.2.3-59-g8ed1b


From 23af27eb8fa9ea8614138c4cded7a16cb4197a55 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Mon, 18 Jul 2005 13:34:35 -0700
Subject: [PKT_SCHED]: Kill TCF_META_ID_TCCLASSID.

Thomas Graf states:

> I used to mark such ids as obsolete in the header but since
> skb is on diet anyway and there has been no official
> iproute2 release with the ematch bits included it might be
> a better idea to remove the ids from the header completely.
> Those that have picked up my patch on netdev shouldn't care
> about a ABI breakage, actually I doubt that someone is using
> it already.

So here's the patch to remove it.

Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tc_ematch/tc_em_meta.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/tc_ematch/tc_em_meta.h b/include/linux/tc_ematch/tc_em_meta.h
index bcb762d93123..90ab9f95d43d 100644
--- a/include/linux/tc_ematch/tc_em_meta.h
+++ b/include/linux/tc_ematch/tc_em_meta.h
@@ -53,7 +53,6 @@ enum
 	TCF_META_ID_NFMARK,
 	TCF_META_ID_TCINDEX,
 	TCF_META_ID_TCVERDICT,
-	TCF_META_ID_TCCLASSID,
 	TCF_META_ID_RTCLASSID,
 	TCF_META_ID_RTIIF,
 	TCF_META_ID_SK_FAMILY,
-- 
cgit v1.2.3-59-g8ed1b


From e2bf521d9728bfae9b6c3d484614e5962d0b5afd Mon Sep 17 00:00:00 2001
From: Victor Fusco <victor@cetuc.puc-rio.br>
Date: Mon, 18 Jul 2005 13:36:38 -0700
Subject: [NET]: Fix "nocast type" warnings in skbuff.h

From: Victor Fusco <victor@cetuc.puc-rio.br>

Fix the sparse warning "implicit cast to nocast type"

Signed-off-by: Victor Fusco <victor@cetuc.puc-rio.br>
Signed-off-by: Domen Puncer <domen@coderock.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5d4a990d5577..0061c9470482 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -502,7 +502,8 @@ static inline struct sk_buff *skb_share_check(struct sk_buff *skb,
  *
  *	%NULL is returned on a memory allocation failure.
  */
-static inline struct sk_buff *skb_unshare(struct sk_buff *skb, int pri)
+static inline struct sk_buff *skb_unshare(struct sk_buff *skb,
+					  unsigned int __nocast pri)
 {
 	might_sleep_if(pri & __GFP_WAIT);
 	if (skb_cloned(skb)) {
-- 
cgit v1.2.3-59-g8ed1b


From 4acdbdbe5089c06d5e0c7e96783fcc4414ded00a Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 21 Jul 2005 13:14:46 -0700
Subject: [NETFILTER]: ip_conntrack_expect_related must not free expectation

If a connection tracking helper tells us to expect a connection, and
we're already expecting that connection, we simply free the one they
gave us and return success.

The problem is that NAT helpers (eg. FTP) have to allocate the
expectation first (to see what port is available) then rewrite the
packet.  If that rewrite fails, they try to remove the expectation,
but it was freed in ip_conntrack_expect_related.

This is one example of a larger problem: having registered the
expectation, the pointer is no longer ours to use.  Reference counting
is needed for ctnetlink anyway, so introduce it now.

To have a single "put" path, we need to grab the reference to the
connection on creation, rather than open-coding it in the caller.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack.h        |  3 ++
 include/linux/netfilter_ipv4/ip_conntrack_helper.h |  7 ++--
 net/ipv4/netfilter/ip_conntrack_amanda.c           |  8 ++---
 net/ipv4/netfilter/ip_conntrack_core.c             | 40 ++++++++++------------
 net/ipv4/netfilter/ip_conntrack_ftp.c              | 14 ++++----
 net/ipv4/netfilter/ip_conntrack_irc.c              |  8 ++---
 net/ipv4/netfilter/ip_conntrack_standalone.c       |  2 +-
 net/ipv4/netfilter/ip_conntrack_tftp.c             |  8 ++---
 net/ipv4/netfilter/ip_nat_amanda.c                 |  4 +--
 net/ipv4/netfilter/ip_nat_ftp.c                    |  4 +--
 net/ipv4/netfilter/ip_nat_irc.c                    |  4 +--
 net/ipv4/netfilter/ip_nat_tftp.c                   |  4 +--
 12 files changed, 46 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index 3781192ce159..f8da7ddeff3a 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -197,6 +197,9 @@ struct ip_conntrack_expect
 	/* Timer function; deletes the expectation. */
 	struct timer_list timeout;
 
+	/* Usage count. */
+	atomic_t use;
+
 #ifdef CONFIG_IP_NF_NAT_NEEDED
 	/* This is the original per-proto part, used to map the
 	 * expected connection the way the recipient expects. */
diff --git a/include/linux/netfilter_ipv4/ip_conntrack_helper.h b/include/linux/netfilter_ipv4/ip_conntrack_helper.h
index b1bbba0a12cb..3692daa93dec 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_helper.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_helper.h
@@ -30,9 +30,10 @@ extern int ip_conntrack_helper_register(struct ip_conntrack_helper *);
 extern void ip_conntrack_helper_unregister(struct ip_conntrack_helper *);
 
 /* Allocate space for an expectation: this is mandatory before calling 
-   ip_conntrack_expect_related. */
-extern struct ip_conntrack_expect *ip_conntrack_expect_alloc(void);
-extern void ip_conntrack_expect_free(struct ip_conntrack_expect *exp);
+   ip_conntrack_expect_related.  You will have to call put afterwards. */
+extern struct ip_conntrack_expect *
+ip_conntrack_expect_alloc(struct ip_conntrack *master);
+extern void ip_conntrack_expect_put(struct ip_conntrack_expect *exp);
 
 /* Add an expected connection: can have more than one per connection */
 extern int ip_conntrack_expect_related(struct ip_conntrack_expect *exp);
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
index a78a320eee08..01e1b58322a9 100644
--- a/net/ipv4/netfilter/ip_conntrack_amanda.c
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -101,14 +101,13 @@ static int help(struct sk_buff **pskb,
 		if (port == 0 || len > 5)
 			break;
 
-		exp = ip_conntrack_expect_alloc();
+		exp = ip_conntrack_expect_alloc(ct);
 		if (exp == NULL) {
 			ret = NF_DROP;
 			goto out;
 		}
 
 		exp->expectfn = NULL;
-		exp->master = ct;
 
 		exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
 		exp->tuple.src.u.tcp.port = 0;
@@ -126,10 +125,9 @@ static int help(struct sk_buff **pskb,
 			ret = ip_nat_amanda_hook(pskb, ctinfo,
 						 tmp - amanda_buffer,
 						 len, exp);
-		else if (ip_conntrack_expect_related(exp) != 0) {
-			ip_conntrack_expect_free(exp);
+		else if (ip_conntrack_expect_related(exp) != 0)
 			ret = NF_DROP;
-		}
+		ip_conntrack_expect_put(exp);
 	}
 
 out:
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 4b78ebeb6635..14af55cad5d6 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -137,19 +137,12 @@ ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
 
 
 /* ip_conntrack_expect helper functions */
-static void destroy_expect(struct ip_conntrack_expect *exp)
-{
-	ip_conntrack_put(exp->master);
-	IP_NF_ASSERT(!timer_pending(&exp->timeout));
-	kmem_cache_free(ip_conntrack_expect_cachep, exp);
-	CONNTRACK_STAT_INC(expect_delete);
-}
-
 static void unlink_expect(struct ip_conntrack_expect *exp)
 {
 	ASSERT_WRITE_LOCK(&ip_conntrack_lock);
+	IP_NF_ASSERT(!timer_pending(&exp->timeout));
 	list_del(&exp->list);
-	/* Logically in destroy_expect, but we hold the lock here. */
+	CONNTRACK_STAT_INC(expect_delete);
 	exp->master->expecting--;
 }
 
@@ -160,7 +153,7 @@ static void expectation_timed_out(unsigned long ul_expect)
 	write_lock_bh(&ip_conntrack_lock);
 	unlink_expect(exp);
 	write_unlock_bh(&ip_conntrack_lock);
-	destroy_expect(exp);
+	ip_conntrack_expect_put(exp);
 }
 
 /* If an expectation for this connection is found, it gets delete from
@@ -198,7 +191,7 @@ static void remove_expectations(struct ip_conntrack *ct)
 	list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
 		if (i->master == ct && del_timer(&i->timeout)) {
 			unlink_expect(i);
-			destroy_expect(i);
+			ip_conntrack_expect_put(i);
 		}
 	}
 }
@@ -537,7 +530,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple,
 	if (exp) {
 		if (exp->expectfn)
 			exp->expectfn(conntrack, exp);
-		destroy_expect(exp);
+		ip_conntrack_expect_put(exp);
 	}
 
 	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
@@ -729,14 +722,14 @@ void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
 		if (expect_matches(i, exp) && del_timer(&i->timeout)) {
 			unlink_expect(i);
 			write_unlock_bh(&ip_conntrack_lock);
-			destroy_expect(i);
+			ip_conntrack_expect_put(i);
 			return;
 		}
 	}
 	write_unlock_bh(&ip_conntrack_lock);
 }
 
-struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
+struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
 {
 	struct ip_conntrack_expect *new;
 
@@ -745,18 +738,23 @@ struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
 		DEBUGP("expect_related: OOM allocating expect\n");
 		return NULL;
 	}
-	new->master = NULL;
+	new->master = me;
+	atomic_inc(&new->master->ct_general.use);
+	atomic_set(&new->use, 1);
 	return new;
 }
 
-void ip_conntrack_expect_free(struct ip_conntrack_expect *expect)
+void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
 {
-	kmem_cache_free(ip_conntrack_expect_cachep, expect);
+	if (atomic_dec_and_test(&exp->use)) {
+		ip_conntrack_put(exp->master);
+		kmem_cache_free(ip_conntrack_expect_cachep, exp);
+	}
 }
 
 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
 {
-	atomic_inc(&exp->master->ct_general.use);
+	atomic_inc(&exp->use);
 	exp->master->expecting++;
 	list_add(&exp->list, &ip_conntrack_expect_list);
 
@@ -778,7 +776,7 @@ static void evict_oldest_expect(struct ip_conntrack *master)
 		if (i->master == master) {
 			if (del_timer(&i->timeout)) {
 				unlink_expect(i);
-				destroy_expect(i);
+				ip_conntrack_expect_put(i);
 			}
 			break;
 		}
@@ -810,8 +808,6 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
 			/* Refresh timer: if it's dying, ignore.. */
 			if (refresh_timer(i)) {
 				ret = 0;
-				/* We don't need the one they've given us. */
-				ip_conntrack_expect_free(expect);
 				goto out;
 			}
 		} else if (expect_clash(i, expect)) {
@@ -881,7 +877,7 @@ void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
 	list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
 		if (exp->master->helper == me && del_timer(&exp->timeout)) {
 			unlink_expect(exp);
-			destroy_expect(exp);
+			ip_conntrack_expect_put(exp);
 		}
 	}
 	/* Get rid of expecteds, set helpers to NULL. */
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index fea6dd2a00b6..7a3b773be3f9 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -376,7 +376,7 @@ static int help(struct sk_buff **pskb,
 	       fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff);
 			 
 	/* Allocate expectation which will be inserted */
-	exp = ip_conntrack_expect_alloc();
+	exp = ip_conntrack_expect_alloc(ct);
 	if (exp == NULL) {
 		ret = NF_DROP;
 		goto out;
@@ -403,8 +403,7 @@ static int help(struct sk_buff **pskb,
 		   networks, or the packet filter itself). */
 		if (!loose) {
 			ret = NF_ACCEPT;
-			ip_conntrack_expect_free(exp);
-			goto out_update_nl;
+			goto out_put_expect;
 		}
 		exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16)
 					 | (array[2] << 8) | array[3]);
@@ -419,7 +418,6 @@ static int help(struct sk_buff **pskb,
 		  { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
 
 	exp->expectfn = NULL;
-	exp->master = ct;
 
 	/* Now, NAT might want to mangle the packet, and register the
 	 * (possibly changed) expectation itself. */
@@ -428,13 +426,15 @@ static int help(struct sk_buff **pskb,
 				      matchoff, matchlen, exp, &seq);
 	else {
 		/* Can't expect this?  Best to drop packet now. */
-		if (ip_conntrack_expect_related(exp) != 0) {
-			ip_conntrack_expect_free(exp);
+		if (ip_conntrack_expect_related(exp) != 0)
 			ret = NF_DROP;
-		} else
+		else
 			ret = NF_ACCEPT;
 	}
 
+out_put_expect:
+	ip_conntrack_expect_put(exp);
+
 out_update_nl:
 	/* Now if this ends in \n, update ftp info.  Seq may have been
 	 * adjusted by NAT code. */
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
index cd98772cc332..4a28f297d502 100644
--- a/net/ipv4/netfilter/ip_conntrack_irc.c
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -197,7 +197,7 @@ static int help(struct sk_buff **pskb,
 				continue;
 			}
 
-			exp = ip_conntrack_expect_alloc();
+			exp = ip_conntrack_expect_alloc(ct);
 			if (exp == NULL) {
 				ret = NF_DROP;
 				goto out;
@@ -221,16 +221,14 @@ static int help(struct sk_buff **pskb,
 				{ { 0, { 0 } },
 				  { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
 			exp->expectfn = NULL;
-			exp->master = ct;
 			if (ip_nat_irc_hook)
 				ret = ip_nat_irc_hook(pskb, ctinfo, 
 						      addr_beg_p - ib_ptr,
 						      addr_end_p - addr_beg_p,
 						      exp);
-			else if (ip_conntrack_expect_related(exp) != 0) {
-				ip_conntrack_expect_free(exp);
+			else if (ip_conntrack_expect_related(exp) != 0)
 				ret = NF_DROP;
-			}
+			ip_conntrack_expect_put(exp);
 			goto out;
 		} /* for .. NUM_DCCPROTO */
 	} /* while data < ... */
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index 1dd824f3cf0a..61798c46e91d 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -985,7 +985,7 @@ EXPORT_SYMBOL(ip_ct_refresh_acct);
 EXPORT_SYMBOL(ip_ct_protos);
 EXPORT_SYMBOL(ip_ct_find_proto);
 EXPORT_SYMBOL(ip_conntrack_expect_alloc);
-EXPORT_SYMBOL(ip_conntrack_expect_free);
+EXPORT_SYMBOL(ip_conntrack_expect_put);
 EXPORT_SYMBOL(ip_conntrack_expect_related);
 EXPORT_SYMBOL(ip_conntrack_unexpect_related);
 EXPORT_SYMBOL(ip_conntrack_tuple_taken);
diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c
index 992fac3e36ee..f8ff170f390a 100644
--- a/net/ipv4/netfilter/ip_conntrack_tftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_tftp.c
@@ -65,7 +65,7 @@ static int tftp_help(struct sk_buff **pskb,
 		DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
 		DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
-		exp = ip_conntrack_expect_alloc();
+		exp = ip_conntrack_expect_alloc(ct);
 		if (exp == NULL)
 			return NF_DROP;
 
@@ -75,17 +75,15 @@ static int tftp_help(struct sk_buff **pskb,
 		exp->mask.dst.u.udp.port = 0xffff;
 		exp->mask.dst.protonum = 0xff;
 		exp->expectfn = NULL;
-		exp->master = ct;
 
 		DEBUGP("expect: ");
 		DUMP_TUPLE(&exp->tuple);
 		DUMP_TUPLE(&exp->mask);
 		if (ip_nat_tftp_hook)
 			ret = ip_nat_tftp_hook(pskb, ctinfo, exp);
-		else if (ip_conntrack_expect_related(exp) != 0) {
-			ip_conntrack_expect_free(exp);
+		else if (ip_conntrack_expect_related(exp) != 0)
 			ret = NF_DROP;
-		}
+		ip_conntrack_expect_put(exp);
 		break;
 	case TFTP_OPCODE_DATA:
 	case TFTP_OPCODE_ACK:
diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c
index da1f412583ed..706c8074f422 100644
--- a/net/ipv4/netfilter/ip_nat_amanda.c
+++ b/net/ipv4/netfilter/ip_nat_amanda.c
@@ -56,10 +56,8 @@ static unsigned int help(struct sk_buff **pskb,
 			break;
 	}
 
-	if (port == 0) {
-		ip_conntrack_expect_free(exp);
+	if (port == 0)
 		return NF_DROP;
-	}
 
 	sprintf(buffer, "%u", port);
 	ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo,
diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c
index c6000e794ad6..d83757a70d9f 100644
--- a/net/ipv4/netfilter/ip_nat_ftp.c
+++ b/net/ipv4/netfilter/ip_nat_ftp.c
@@ -143,10 +143,8 @@ static unsigned int ip_nat_ftp(struct sk_buff **pskb,
 			break;
 	}
 
-	if (port == 0) {
-		ip_conntrack_expect_free(exp);
+	if (port == 0)
 		return NF_DROP;
-	}
 
 	if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo,
 			  seq)) {
diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c
index 9c1ca3381d56..de31942babe3 100644
--- a/net/ipv4/netfilter/ip_nat_irc.c
+++ b/net/ipv4/netfilter/ip_nat_irc.c
@@ -65,10 +65,8 @@ static unsigned int help(struct sk_buff **pskb,
 			break;
 	}
 
-	if (port == 0) {
-		ip_conntrack_expect_free(exp);
+	if (port == 0)
 		return NF_DROP;
-	}
 
 	/*      strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
 	 *      strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c
index 0343e0d64674..2215317c76b7 100644
--- a/net/ipv4/netfilter/ip_nat_tftp.c
+++ b/net/ipv4/netfilter/ip_nat_tftp.c
@@ -45,10 +45,8 @@ static unsigned int help(struct sk_buff **pskb,
 	exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port;
 	exp->dir = IP_CT_DIR_REPLY;
 	exp->expectfn = ip_nat_follow_master;
-	if (ip_conntrack_expect_related(exp) != 0) {
-		ip_conntrack_expect_free(exp);
+	if (ip_conntrack_expect_related(exp) != 0)
 		return NF_DROP;
-	}
 	return NF_ACCEPT;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 28e212fb360ce2568edd60b93d60683d5ad24146 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 22 Jul 2005 11:47:25 -0700
Subject: [PKT_SCHED]: Kill TCF_META_ID_REALDEV from meta ematch.

It won't exist any longer when we shrink the SKB in 2.6.14,
and we should kill this off before anyone in userspace starts
using it.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Thomas Graf <tgraf@suug.ch>
---
 include/linux/tc_ematch/tc_em_meta.h |  1 -
 net/sched/em_meta.c                  | 12 ------------
 2 files changed, 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tc_ematch/tc_em_meta.h b/include/linux/tc_ematch/tc_em_meta.h
index 90ab9f95d43d..150efe07ff6b 100644
--- a/include/linux/tc_ematch/tc_em_meta.h
+++ b/include/linux/tc_ematch/tc_em_meta.h
@@ -42,7 +42,6 @@ enum
 	TCF_META_ID_LOADAVG_2,
 	TCF_META_ID_DEV,
 	TCF_META_ID_INDEV,
-	TCF_META_ID_REALDEV,
 	TCF_META_ID_PRIORITY,
 	TCF_META_ID_PROTOCOL,
 	TCF_META_ID_SECURITY, /* obsolete */
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index feefcbfd03d0..179efb5bc9b3 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -180,16 +180,6 @@ META_COLLECTOR(var_indev)
 	*err = var_dev(skb->input_dev, dst);
 }
 
-META_COLLECTOR(int_realdev)
-{
-	*err = int_dev(skb->real_dev, dst);
-}
-
-META_COLLECTOR(var_realdev)
-{
-	*err = var_dev(skb->real_dev, dst);
-}
-
 /**************************************************************************
  * skb attributes
  **************************************************************************/
@@ -501,7 +491,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
 	[TCF_META_TYPE_VAR] = {
 		[META_ID(DEV)]			= META_FUNC(var_dev),
 		[META_ID(INDEV)]		= META_FUNC(var_indev),
-		[META_ID(REALDEV)]		= META_FUNC(var_realdev),
 		[META_ID(SK_BOUND_IF)] 		= META_FUNC(var_sk_bound_if),
 	},
 	[TCF_META_TYPE_INT] = {
@@ -511,7 +500,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
 		[META_ID(LOADAVG_2)]		= META_FUNC(int_loadavg_2),
 		[META_ID(DEV)]			= META_FUNC(int_dev),
 		[META_ID(INDEV)]		= META_FUNC(int_indev),
-		[META_ID(REALDEV)]		= META_FUNC(int_realdev),
 		[META_ID(PRIORITY)]		= META_FUNC(int_priority),
 		[META_ID(PROTOCOL)]		= META_FUNC(int_protocol),
 		[META_ID(PKTTYPE)]		= META_FUNC(int_pkttype),
-- 
cgit v1.2.3-59-g8ed1b


From 261688d01ec07d3a265b8ace6ec68310fbd96a96 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 22 Jul 2005 14:43:52 -0700
Subject: [PKT_SCHED]: em_meta: Kill TCF_META_ID_{INDEV,SECURITY,TCVERDICT}

More unusable TCF_META_* match types that need to get eliminated
before 2.6.13 goes out the door.

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Thomas Graf <tgraf@suug.ch>
---
 include/linux/tc_ematch/tc_em_meta.h |  3 ---
 net/sched/em_meta.c                  | 28 +++-------------------------
 2 files changed, 3 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/tc_ematch/tc_em_meta.h b/include/linux/tc_ematch/tc_em_meta.h
index 150efe07ff6b..081b1ee8516e 100644
--- a/include/linux/tc_ematch/tc_em_meta.h
+++ b/include/linux/tc_ematch/tc_em_meta.h
@@ -41,17 +41,14 @@ enum
 	TCF_META_ID_LOADAVG_1,
 	TCF_META_ID_LOADAVG_2,
 	TCF_META_ID_DEV,
-	TCF_META_ID_INDEV,
 	TCF_META_ID_PRIORITY,
 	TCF_META_ID_PROTOCOL,
-	TCF_META_ID_SECURITY, /* obsolete */
 	TCF_META_ID_PKTTYPE,
 	TCF_META_ID_PKTLEN,
 	TCF_META_ID_DATALEN,
 	TCF_META_ID_MACLEN,
 	TCF_META_ID_NFMARK,
 	TCF_META_ID_TCINDEX,
-	TCF_META_ID_TCVERDICT,
 	TCF_META_ID_RTCLASSID,
 	TCF_META_ID_RTIIF,
 	TCF_META_ID_SK_FAMILY,
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 179efb5bc9b3..a18b924743d9 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -27,17 +27,17 @@
  * 	         lvalue                                   rvalue
  * 	      +-----------+                           +-----------+
  * 	      | type: INT |                           | type: INT |
- * 	 def  | id: INDEV |                           | id: VALUE |
+ * 	 def  | id: DEV   |                           | id: VALUE |
  * 	      | data:     |                           | data: 3   |
  * 	      +-----------+                           +-----------+
  * 	            |                                       |
- * 	            ---> meta_ops[INT][INDEV](...)          |
+ * 	            ---> meta_ops[INT][DEV](...)            |
  *	                      |                             |
  * 	            -----------                             |
  * 	            V                                       V
  * 	      +-----------+                           +-----------+
  * 	      | type: INT |                           | type: INT |
- * 	 obj  | id: INDEV |                           | id: VALUE |
+ * 	 obj  | id: DEV |                             | id: VALUE |
  * 	      | data: 2   |<--data got filled out     | data: 3   |
  * 	      +-----------+                           +-----------+
  * 	            |                                         |
@@ -170,16 +170,6 @@ META_COLLECTOR(var_dev)
 	*err = var_dev(skb->dev, dst);
 }
 
-META_COLLECTOR(int_indev)
-{
-	*err = int_dev(skb->input_dev, dst);
-}
-
-META_COLLECTOR(var_indev)
-{
-	*err = var_dev(skb->input_dev, dst);
-}
-
 /**************************************************************************
  * skb attributes
  **************************************************************************/
@@ -235,13 +225,6 @@ META_COLLECTOR(int_tcindex)
 	dst->value = skb->tc_index;
 }
 
-#ifdef CONFIG_NET_CLS_ACT
-META_COLLECTOR(int_tcverd)
-{
-	dst->value = skb->tc_verd;
-}
-#endif
-
 /**************************************************************************
  * Routing
  **************************************************************************/
@@ -490,7 +473,6 @@ struct meta_ops
 static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
 	[TCF_META_TYPE_VAR] = {
 		[META_ID(DEV)]			= META_FUNC(var_dev),
-		[META_ID(INDEV)]		= META_FUNC(var_indev),
 		[META_ID(SK_BOUND_IF)] 		= META_FUNC(var_sk_bound_if),
 	},
 	[TCF_META_TYPE_INT] = {
@@ -499,7 +481,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
 		[META_ID(LOADAVG_1)]		= META_FUNC(int_loadavg_1),
 		[META_ID(LOADAVG_2)]		= META_FUNC(int_loadavg_2),
 		[META_ID(DEV)]			= META_FUNC(int_dev),
-		[META_ID(INDEV)]		= META_FUNC(int_indev),
 		[META_ID(PRIORITY)]		= META_FUNC(int_priority),
 		[META_ID(PROTOCOL)]		= META_FUNC(int_protocol),
 		[META_ID(PKTTYPE)]		= META_FUNC(int_pkttype),
@@ -510,9 +491,6 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
 		[META_ID(NFMARK)]		= META_FUNC(int_nfmark),
 #endif
 		[META_ID(TCINDEX)]		= META_FUNC(int_tcindex),
-#ifdef CONFIG_NET_CLS_ACT
-		[META_ID(TCVERDICT)]		= META_FUNC(int_tcverd),
-#endif
 #ifdef CONFIG_NET_CLS_ROUTE
 		[META_ID(RTCLASSID)]		= META_FUNC(int_rtclassid),
 #endif
-- 
cgit v1.2.3-59-g8ed1b


From 48647feed9f7a2d839c6ada12147b341833646e8 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Sun, 24 Jul 2005 19:30:28 -0700
Subject: [W1]: Do not use NFLOG netlink number.

Use the reserved but never used NETLINK_SKIP value instead.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/w1/w1_int.c     | 2 +-
 include/linux/netlink.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c
index 35e85d961702..b5a5e04b6d37 100644
--- a/drivers/w1/w1_int.c
+++ b/drivers/w1/w1_int.c
@@ -88,7 +88,7 @@ static struct w1_master * w1_alloc_dev(u32 id, int slave_count, int slave_ttl,
 
 	dev->groups = 23;
 	dev->seq = 1;
-	dev->nls = netlink_kernel_create(NETLINK_NFLOG, NULL);
+	dev->nls = netlink_kernel_create(NETLINK_W1, NULL);
 	if (!dev->nls) {
 		printk(KERN_ERR "Failed to create new netlink socket(%u) for w1 master %s.\n",
 			NETLINK_NFLOG, dev->dev.bus_id);
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 2f0c085f2c7d..70c2a9dc4b2b 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -5,7 +5,7 @@
 #include <linux/types.h>
 
 #define NETLINK_ROUTE		0	/* Routing/device hook				*/
-#define NETLINK_SKIP		1	/* Reserved for ENskip  			*/
+#define NETLINK_W1		1	/* 1-wire subsystem				*/
 #define NETLINK_USERSOCK	2	/* Reserved for user mode socket protocols 	*/
 #define NETLINK_FIREWALL	3	/* Firewalling hook				*/
 #define NETLINK_TCPDIAG		4	/* TCP socket monitoring			*/
-- 
cgit v1.2.3-59-g8ed1b


From 4cf78e4fb678807e3f8265c9e9031a84f5c601f0 Mon Sep 17 00:00:00 2001
From: Michael Chan <mchan@broadcom.com>
Date: Mon, 25 Jul 2005 12:29:19 -0700
Subject: [TG3]: add 5780 basic support

Add 5780 PCI IDs, chip IDs, and other basic support.

Signed-off-by: Michael Chan <mchan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/tg3.c       | 53 ++++++++++++++++++++++++++++++++++++++++++++-----
 drivers/net/tg3.h       |  3 +++
 include/linux/pci_ids.h |  2 ++
 3 files changed, 53 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 54640686e983..b01f6a07e5e7 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -221,6 +221,10 @@ static struct pci_device_id tg3_pci_tbl[] = {
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5753F,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
+	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5780,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
+	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5780S,
+	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_BROADCOM, PCI_DEVICE_ID_TIGON3_5781,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0UL },
 	{ PCI_VENDOR_ID_SYSKONNECT, PCI_DEVICE_ID_SYSKONNECT_9DXX,
@@ -508,6 +512,9 @@ static void tg3_switch_clocks(struct tg3 *tp)
 	u32 clock_ctrl = tr32(TG3PCI_CLOCK_CTRL);
 	u32 orig_clock_ctrl;
 
+	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5780)
+		return;
+
 	orig_clock_ctrl = clock_ctrl;
 	clock_ctrl &= (CLOCK_CTRL_FORCE_CLKRUN |
 		       CLOCK_CTRL_CLKRUN_OENABLE |
@@ -1145,6 +1152,8 @@ static int tg3_set_power_state(struct tg3 *tp, int state)
 		     CLOCK_CTRL_ALTCLK |
 		     CLOCK_CTRL_PWRDOWN_PLL133);
 		udelay(40);
+	} else if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5780) {
+		/* do nothing */
 	} else if (!((tp->tg3_flags2 & TG3_FLG2_5750_PLUS) &&
 		     (tp->tg3_flags & TG3_FLAG_ENABLE_ASF))) {
 		u32 newbits1, newbits2;
@@ -4056,7 +4065,30 @@ static int tg3_chip_reset(struct tg3 *tp)
 	val &= ~PCIX_CAPS_RELAXED_ORDERING;
 	pci_write_config_dword(tp->pdev, TG3PCI_X_CAPS, val);
 
-	tw32(MEMARB_MODE, MEMARB_MODE_ENABLE);
+	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5780) {
+		u32 val;
+
+		/* Chip reset on 5780 will reset MSI enable bit,
+		 * so need to restore it.
+		 */
+		if (tp->tg3_flags2 & TG3_FLG2_USING_MSI) {
+			u16 ctrl;
+
+			pci_read_config_word(tp->pdev,
+					     tp->msi_cap + PCI_MSI_FLAGS,
+					     &ctrl);
+			pci_write_config_word(tp->pdev,
+					      tp->msi_cap + PCI_MSI_FLAGS,
+					      ctrl | PCI_MSI_FLAGS_ENABLE);
+			val = tr32(MSGINT_MODE);
+			tw32(MSGINT_MODE, val | MSGINT_MODE_ENABLE);
+		}
+
+		val = tr32(MEMARB_MODE);
+		tw32(MEMARB_MODE, val | MEMARB_MODE_ENABLE);
+
+	} else
+		tw32(MEMARB_MODE, MEMARB_MODE_ENABLE);
 
 	if (tp->pci_chip_rev_id == CHIPREV_ID_5750_A3) {
 		tg3_stop_fw(tp);
@@ -5683,7 +5715,8 @@ static int tg3_reset_hw(struct tg3 *tp)
 	tw32(MAC_RCV_RULE_1,  0x86000004 & RCV_RULE_DISABLE_MASK);
 	tw32(MAC_RCV_VALUE_1, 0xffffffff & RCV_RULE_DISABLE_MASK);
 
-	if (tp->tg3_flags2 & TG3_FLG2_5705_PLUS)
+	if ((tp->tg3_flags2 & TG3_FLG2_5705_PLUS) &&
+	    (GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5780))
 		limit = 8;
 	else
 		limit = 16;
@@ -8928,6 +8961,10 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 	if (tp->pci_chip_rev_id == CHIPREV_ID_5752_A0_HW)
 		tp->pci_chip_rev_id = CHIPREV_ID_5752_A0;
 
+	/* Find msi capability. */
+	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5780)
+		tp->msi_cap = pci_find_capability(tp->pdev, PCI_CAP_ID_MSI);
+
 	/* Initialize misc host control in PCI block. */
 	tp->misc_host_ctrl |= (misc_ctrl_reg &
 			       MISC_HOST_CTRL_CHIPREV);
@@ -8943,7 +8980,8 @@ static int __devinit tg3_get_invariants(struct tg3 *tp)
 	tp->pci_bist         = (cacheline_sz_reg >> 24) & 0xff;
 
 	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5750 ||
-	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5752)
+	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5752 ||
+	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5780)
 		tp->tg3_flags2 |= TG3_FLG2_5750_PLUS;
 
 	if ((GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5705) ||
@@ -9305,8 +9343,9 @@ static int __devinit tg3_get_device_address(struct tg3 *tp)
 #endif
 
 	mac_offset = 0x7c;
-	if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5704 &&
-	    !(tp->tg3_flags & TG3_FLG2_SUN_570X)) {
+	if ((GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5704 &&
+	     !(tp->tg3_flags & TG3_FLG2_SUN_570X)) ||
+	    GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5780) {
 		if (tr32(TG3PCI_DUAL_MAC_CTRL) & DUAL_MAC_CTRL_ID)
 			mac_offset = 0xcc;
 		if (tg3_nvram_lock(tp))
@@ -9620,6 +9659,9 @@ static int __devinit tg3_test_dma(struct tg3 *tp)
 
 			/* Set bit 23 to enable PCIX hw bug fix */
 			tp->dma_rwctrl |= 0x009f0000;
+		} else if (GET_ASIC_REV(tp->pci_chip_rev_id) == ASIC_REV_5780) {
+			/* 5780 always in PCIX mode */
+			tp->dma_rwctrl |= 0x00144000;
 		} else {
 			tp->dma_rwctrl |= 0x001b000f;
 		}
@@ -9803,6 +9845,7 @@ static char * __devinit tg3_phy_string(struct tg3 *tp)
 	case PHY_ID_BCM5705:	return "5705";
 	case PHY_ID_BCM5750:	return "5750";
 	case PHY_ID_BCM5752:	return "5752";
+	case PHY_ID_BCM5780:	return "5780";
 	case PHY_ID_BCM8002:	return "8002/serdes";
 	case 0:			return "serdes";
 	default:		return "unknown";
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
index 70ad450733e6..46fa105fce83 100644
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -136,6 +136,7 @@
 #define   ASIC_REV_5705			 0x03
 #define   ASIC_REV_5750			 0x04
 #define   ASIC_REV_5752			 0x06
+#define   ASIC_REV_5780			 0x08
 #define  GET_CHIP_REV(CHIP_REV_ID)	((CHIP_REV_ID) >> 8)
 #define   CHIPREV_5700_AX		 0x70
 #define   CHIPREV_5700_BX		 0x71
@@ -2187,6 +2188,7 @@ struct tg3 {
 	u8				pci_bist;
 
 	int				pm_cap;
+	int				msi_cap;
 
 	/* PHY info */
 	u32				phy_id;
@@ -2200,6 +2202,7 @@ struct tg3 {
 #define PHY_ID_BCM5705			0x600081a0
 #define PHY_ID_BCM5750			0x60008180
 #define PHY_ID_BCM5752			0x60008100
+#define PHY_ID_BCM5780			0x60008350
 #define PHY_ID_BCM8002			0x60010140
 #define PHY_ID_INVALID			0xffffffff
 #define PHY_ID_REV_MASK			0x0000000f
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 9a28b312eeb4..9e1337783a31 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2098,6 +2098,8 @@
 #define PCI_DEVICE_ID_TIGON3_5721	0x1659
 #define PCI_DEVICE_ID_TIGON3_5705M	0x165d
 #define PCI_DEVICE_ID_TIGON3_5705M_2	0x165e
+#define PCI_DEVICE_ID_TIGON3_5780	0x166a
+#define PCI_DEVICE_ID_TIGON3_5780S	0x166b
 #define PCI_DEVICE_ID_TIGON3_5705F	0x166e
 #define PCI_DEVICE_ID_TIGON3_5750	0x1676
 #define PCI_DEVICE_ID_TIGON3_5751	0x1677
-- 
cgit v1.2.3-59-g8ed1b


From 4a00ea1e18228e5ef99d4780671fda97226bda30 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 26 Jul 2005 11:24:14 -0600
Subject: [PATCH] Refactor sys_reboot into reusable parts

Because the factors of sys_reboot don't exist people calling
into the reboot path duplicate the code badly, leading to
inconsistent expectations of code in the reboot path.

This patch should be just code motion.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/reboot.h |   9 +++++
 kernel/sys.c           | 106 +++++++++++++++++++++++++++++--------------------
 2 files changed, 73 insertions(+), 42 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 2d4dd23168dd..828ba4f107d9 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -55,6 +55,15 @@ extern void machine_shutdown(void);
 struct pt_regs;
 extern void machine_crash_shutdown(struct pt_regs *);
 
+/* 
+ * Architecture independent implemenations of sys_reboot commands.
+ */
+
+extern void kernel_restart(char *cmd);
+extern void kernel_halt(void);
+extern void kernel_power_off(void);
+extern void kernel_kexec(void);
+
 #endif
 
 #endif /* _LINUX_REBOOT_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 5fc10d3e3891..7e033809ef5f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -361,6 +361,62 @@ out_unlock:
 	return retval;
 }
 
+void kernel_restart(char *cmd)
+{
+	notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
+	system_state = SYSTEM_RESTART;
+	device_suspend(PMSG_FREEZE);
+	device_shutdown();
+	if (!cmd) {
+		printk(KERN_EMERG "Restarting system.\n");
+	} else {
+		printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
+	}
+	printk(".\n");
+	machine_restart(cmd);
+}
+EXPORT_SYMBOL_GPL(kernel_restart);
+
+void kernel_kexec(void)
+{
+#ifdef CONFIG_KEXEC
+	struct kimage *image;
+	image = xchg(&kexec_image, 0);
+	if (!image) {
+		return;
+	}
+	notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
+	system_state = SYSTEM_RESTART;
+	device_suspend(PMSG_FREEZE);
+	device_shutdown();
+	printk(KERN_EMERG "Starting new kernel\n");
+	machine_shutdown();
+	machine_kexec(image);
+#endif
+}
+EXPORT_SYMBOL_GPL(kernel_kexec);
+
+void kernel_halt(void)
+{
+	notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
+	system_state = SYSTEM_HALT;
+	device_suspend(PMSG_SUSPEND);
+	device_shutdown();
+	printk(KERN_EMERG "System halted.\n");
+	machine_halt();
+}
+EXPORT_SYMBOL_GPL(kernel_halt);
+
+void kernel_power_off(void)
+{
+	notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
+	system_state = SYSTEM_POWER_OFF;
+	device_suspend(PMSG_SUSPEND);
+	device_shutdown();
+	printk(KERN_EMERG "Power down.\n");
+	machine_power_off();
+}
+EXPORT_SYMBOL_GPL(kernel_power_off);
 
 /*
  * Reboot system call: for obvious reasons only root may call it,
@@ -389,12 +445,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 	lock_kernel();
 	switch (cmd) {
 	case LINUX_REBOOT_CMD_RESTART:
-		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
-		system_state = SYSTEM_RESTART;
-		device_suspend(PMSG_FREEZE);
-		device_shutdown();
-		printk(KERN_EMERG "Restarting system.\n");
-		machine_restart(NULL);
+		kernel_restart(NULL);
 		break;
 
 	case LINUX_REBOOT_CMD_CAD_ON:
@@ -406,23 +457,13 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 		break;
 
 	case LINUX_REBOOT_CMD_HALT:
-		notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
-		system_state = SYSTEM_HALT;
-		device_suspend(PMSG_SUSPEND);
-		device_shutdown();
-		printk(KERN_EMERG "System halted.\n");
-		machine_halt();
+		kernel_halt();
 		unlock_kernel();
 		do_exit(0);
 		break;
 
 	case LINUX_REBOOT_CMD_POWER_OFF:
-		notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
-		system_state = SYSTEM_POWER_OFF;
-		device_suspend(PMSG_SUSPEND);
-		device_shutdown();
-		printk(KERN_EMERG "Power down.\n");
-		machine_power_off();
+		kernel_power_off();
 		unlock_kernel();
 		do_exit(0);
 		break;
@@ -434,33 +475,14 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 		}
 		buffer[sizeof(buffer) - 1] = '\0';
 
-		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
-		system_state = SYSTEM_RESTART;
-		device_suspend(PMSG_FREEZE);
-		device_shutdown();
-		printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
-		machine_restart(buffer);
+		kernel_restart(buffer);
 		break;
 
-#ifdef CONFIG_KEXEC
 	case LINUX_REBOOT_CMD_KEXEC:
-	{
-		struct kimage *image;
-		image = xchg(&kexec_image, 0);
-		if (!image) {
-			unlock_kernel();
-			return -EINVAL;
-		}
-		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
-		system_state = SYSTEM_RESTART;
-		device_suspend(PMSG_FREEZE);
-		device_shutdown();
-		printk(KERN_EMERG "Starting new kernel\n");
-		machine_shutdown();
-		machine_kexec(image);
-		break;
-	}
-#endif
+		kernel_kexec();
+		unlock_kernel();
+		return -EINVAL;
+
 #ifdef CONFIG_SOFTWARE_SUSPEND
 	case LINUX_REBOOT_CMD_SW_SUSPEND:
 		{
-- 
cgit v1.2.3-59-g8ed1b


From 7c9034735eccbf82608a4602c59aaf6053ea9416 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 26 Jul 2005 11:29:55 -0600
Subject: [PATCH] Add emergency_restart()

When the kernel is working well and we want to restart cleanly
kernel_restart is the function to use.   But in many instances
the kernel wants to reboot when things are expected to be working
very badly such as from panic or a software watchdog handler.

This patch adds the function emergency_restart() so that
callers can be clear what semantics they expect when calling
restart.  emergency_restart() is expected to be callable
from interrupt context and possibly reliable in even more
trying circumstances.

This is an initial generic implementation for all architectures.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/emergency-restart.h     | 6 ++++++
 include/asm-arm/emergency-restart.h       | 6 ++++++
 include/asm-arm26/emergency-restart.h     | 6 ++++++
 include/asm-cris/emergency-restart.h      | 6 ++++++
 include/asm-frv/emergency-restart.h       | 6 ++++++
 include/asm-generic/emergency-restart.h   | 9 +++++++++
 include/asm-h8300/emergency-restart.h     | 6 ++++++
 include/asm-i386/emergency-restart.h      | 6 ++++++
 include/asm-ia64/emergency-restart.h      | 6 ++++++
 include/asm-m32r/emergency-restart.h      | 6 ++++++
 include/asm-m68k/emergency-restart.h      | 6 ++++++
 include/asm-m68knommu/emergency-restart.h | 6 ++++++
 include/asm-mips/emergency-restart.h      | 6 ++++++
 include/asm-parisc/emergency-restart.h    | 6 ++++++
 include/asm-ppc/emergency-restart.h       | 6 ++++++
 include/asm-ppc64/emergency-restart.h     | 6 ++++++
 include/asm-s390/emergency-restart.h      | 6 ++++++
 include/asm-sh/emergency-restart.h        | 6 ++++++
 include/asm-sh64/emergency-restart.h      | 6 ++++++
 include/asm-sparc/emergency-restart.h     | 6 ++++++
 include/asm-sparc64/emergency-restart.h   | 6 ++++++
 include/asm-um/emergency-restart.h        | 6 ++++++
 include/asm-v850/emergency-restart.h      | 6 ++++++
 include/asm-x86_64/emergency-restart.h    | 6 ++++++
 include/asm-xtensa/emergency-restart.h    | 6 ++++++
 include/linux/reboot.h                    | 7 +++++++
 kernel/sys.c                              | 6 ++++++
 27 files changed, 166 insertions(+)
 create mode 100644 include/asm-alpha/emergency-restart.h
 create mode 100644 include/asm-arm/emergency-restart.h
 create mode 100644 include/asm-arm26/emergency-restart.h
 create mode 100644 include/asm-cris/emergency-restart.h
 create mode 100644 include/asm-frv/emergency-restart.h
 create mode 100644 include/asm-generic/emergency-restart.h
 create mode 100644 include/asm-h8300/emergency-restart.h
 create mode 100644 include/asm-i386/emergency-restart.h
 create mode 100644 include/asm-ia64/emergency-restart.h
 create mode 100644 include/asm-m32r/emergency-restart.h
 create mode 100644 include/asm-m68k/emergency-restart.h
 create mode 100644 include/asm-m68knommu/emergency-restart.h
 create mode 100644 include/asm-mips/emergency-restart.h
 create mode 100644 include/asm-parisc/emergency-restart.h
 create mode 100644 include/asm-ppc/emergency-restart.h
 create mode 100644 include/asm-ppc64/emergency-restart.h
 create mode 100644 include/asm-s390/emergency-restart.h
 create mode 100644 include/asm-sh/emergency-restart.h
 create mode 100644 include/asm-sh64/emergency-restart.h
 create mode 100644 include/asm-sparc/emergency-restart.h
 create mode 100644 include/asm-sparc64/emergency-restart.h
 create mode 100644 include/asm-um/emergency-restart.h
 create mode 100644 include/asm-v850/emergency-restart.h
 create mode 100644 include/asm-x86_64/emergency-restart.h
 create mode 100644 include/asm-xtensa/emergency-restart.h

(limited to 'include/linux')

diff --git a/include/asm-alpha/emergency-restart.h b/include/asm-alpha/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-alpha/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-arm/emergency-restart.h b/include/asm-arm/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-arm/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-arm26/emergency-restart.h b/include/asm-arm26/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-arm26/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-cris/emergency-restart.h b/include/asm-cris/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-cris/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-frv/emergency-restart.h b/include/asm-frv/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-frv/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-generic/emergency-restart.h b/include/asm-generic/emergency-restart.h
new file mode 100644
index 000000000000..0d68a1eae985
--- /dev/null
+++ b/include/asm-generic/emergency-restart.h
@@ -0,0 +1,9 @@
+#ifndef _ASM_GENERIC_EMERGENCY_RESTART_H
+#define _ASM_GENERIC_EMERGENCY_RESTART_H
+
+static inline void machine_emergency_restart(void)
+{
+	machine_restart(NULL);
+}
+
+#endif /* _ASM_GENERIC_EMERGENCY_RESTART_H */
diff --git a/include/asm-h8300/emergency-restart.h b/include/asm-h8300/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-h8300/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-i386/emergency-restart.h b/include/asm-i386/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-i386/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-ia64/emergency-restart.h b/include/asm-ia64/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-ia64/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-m32r/emergency-restart.h b/include/asm-m32r/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-m32r/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-m68k/emergency-restart.h b/include/asm-m68k/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-m68k/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-m68knommu/emergency-restart.h b/include/asm-m68knommu/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-m68knommu/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-mips/emergency-restart.h b/include/asm-mips/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-mips/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-parisc/emergency-restart.h b/include/asm-parisc/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-parisc/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-ppc/emergency-restart.h b/include/asm-ppc/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-ppc/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-ppc64/emergency-restart.h b/include/asm-ppc64/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-ppc64/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-s390/emergency-restart.h b/include/asm-s390/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-s390/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-sh/emergency-restart.h b/include/asm-sh/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-sh/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-sh64/emergency-restart.h b/include/asm-sh64/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-sh64/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-sparc/emergency-restart.h b/include/asm-sparc/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-sparc/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-sparc64/emergency-restart.h b/include/asm-sparc64/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-sparc64/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-um/emergency-restart.h b/include/asm-um/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-um/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-v850/emergency-restart.h b/include/asm-v850/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-v850/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-x86_64/emergency-restart.h b/include/asm-x86_64/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-x86_64/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/asm-xtensa/emergency-restart.h b/include/asm-xtensa/emergency-restart.h
new file mode 100644
index 000000000000..108d8c48e42e
--- /dev/null
+++ b/include/asm-xtensa/emergency-restart.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_EMERGENCY_RESTART_H
+#define _ASM_EMERGENCY_RESTART_H
+
+#include <asm-generic/emergency-restart.h>
+
+#endif /* _ASM_EMERGENCY_RESTART_H */
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 828ba4f107d9..3b3266ff1a95 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -64,6 +64,13 @@ extern void kernel_halt(void);
 extern void kernel_power_off(void);
 extern void kernel_kexec(void);
 
+/*
+ * Emergency restart, callable from an interrupt handler.
+ */
+
+extern void emergency_restart(void);
+#include <asm/emergency-restart.h>
+
 #endif
 
 #endif /* _LINUX_REBOOT_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 31ac41a73329..a74039036fb4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -361,6 +361,12 @@ out_unlock:
 	return retval;
 }
 
+void emergency_restart(void)
+{
+	machine_emergency_restart();
+}
+EXPORT_SYMBOL_GPL(emergency_restart);
+
 void kernel_restart(char *cmd)
 {
 	notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
-- 
cgit v1.2.3-59-g8ed1b


From cadf01c2fc0cd66dfef4956ef1a6482ed01c3150 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 26 Jul 2005 15:39:28 -0700
Subject: [NETFILTER]: Fix ip_conntrack_put() prototype.

The function is not inline.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netfilter_ipv4/ip_conntrack.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index f8da7ddeff3a..08fe5f7d14a0 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -239,7 +239,7 @@ ip_conntrack_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
 }
 
 /* decrement reference count on a conntrack */
-extern inline void ip_conntrack_put(struct ip_conntrack *ct);
+extern void ip_conntrack_put(struct ip_conntrack *ct);
 
 /* call to create an explicit dependency on ip_conntrack. */
 extern void need_ip_conntrack(void);
-- 
cgit v1.2.3-59-g8ed1b


From c10b873695c6a1de0d8ebab40b525575ca576683 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 27 Jul 2005 11:43:27 -0700
Subject: [PATCH] Really __nocast-annotate kmalloc_node()

One chunk was lost somewhere between my machine and Andrew's.

Noticed by Victor Fusco.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/slab.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 4c8e552471b0..80b2dfde2e80 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -111,7 +111,7 @@ static inline void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int n
 {
 	return kmem_cache_alloc(cachep, flags);
 }
-static inline void *kmalloc_node(size_t size, int flags, int node)
+static inline void *kmalloc_node(size_t size, unsigned int __nocast flags, int node)
 {
 	return kmalloc(size, flags);
 }
-- 
cgit v1.2.3-59-g8ed1b


From 49f29915856435ad8e34a4a3a907b09682a5826e Mon Sep 17 00:00:00 2001
From: Olivier Blin <oblin@mandriva.com>
Date: Wed, 27 Jul 2005 11:43:47 -0700
Subject: [PATCH] i4l: add Olitec ISDN PCI card in hisax gazel driver

This patch adds support for the Olitec ISDN PCI card in the hisax gazel
driver.  The gazel driver supports this card, but wasn't aware of its PCI
ids.  Users used to modify the PCI ids of a supported card in
include/linux/pci_ids.h and recompile their kernel to get this card
running, as described in most Howtos.  This patch makes the hisax gazel
driver recognize the PCI ids of the Olitec ISDN PCI card.

Signed-off-by: Olivier Blin <oblin@mandriva.com>
Signed-off-by: Karsten Keil <kkeil@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/isdn/hisax/config.c | 1 +
 drivers/isdn/hisax/gazel.c  | 9 +++++++--
 include/linux/pci_ids.h     | 1 +
 3 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/isdn/hisax/config.c b/drivers/isdn/hisax/config.c
index c542e6fb2bde..fbaab4352902 100644
--- a/drivers/isdn/hisax/config.c
+++ b/drivers/isdn/hisax/config.c
@@ -1900,6 +1900,7 @@ static struct pci_device_id hisax_pci_tbl[] __initdata = {
 	{PCI_VENDOR_ID_PLX,      PCI_DEVICE_ID_PLX_R685,         PCI_ANY_ID, PCI_ANY_ID},
 	{PCI_VENDOR_ID_PLX,      PCI_DEVICE_ID_PLX_R753,         PCI_ANY_ID, PCI_ANY_ID},
 	{PCI_VENDOR_ID_PLX,      PCI_DEVICE_ID_PLX_DJINN_ITOO,   PCI_ANY_ID, PCI_ANY_ID},
+	{PCI_VENDOR_ID_PLX,      PCI_DEVICE_ID_PLX_OLITEC,       PCI_ANY_ID, PCI_ANY_ID},
 #endif
 #ifdef CONFIG_HISAX_QUADRO
 	{PCI_VENDOR_ID_PLX,      PCI_DEVICE_ID_PLX_9050,         PCI_ANY_ID, PCI_ANY_ID},
diff --git a/drivers/isdn/hisax/gazel.c b/drivers/isdn/hisax/gazel.c
index 352b45ac5347..60b04c6d9e7d 100644
--- a/drivers/isdn/hisax/gazel.c
+++ b/drivers/isdn/hisax/gazel.c
@@ -546,8 +546,9 @@ setup_gazelpci(struct IsdnCardState *cs)
 
 	found = 0;
 	seekcard = PCI_DEVICE_ID_PLX_R685;
-	for (nbseek = 0; nbseek < 3; nbseek++) {
-		if ((dev_tel = pci_find_device(PCI_VENDOR_ID_PLX, seekcard, dev_tel))) {
+	for (nbseek = 0; nbseek < 4; nbseek++) {
+		if ((dev_tel = pci_find_device(PCI_VENDOR_ID_PLX,
+					seekcard, dev_tel))) {
 			if (pci_enable_device(dev_tel))
 				return 1;
 			pci_irq = dev_tel->irq;
@@ -565,6 +566,9 @@ setup_gazelpci(struct IsdnCardState *cs)
 				case PCI_DEVICE_ID_PLX_R753:
 					seekcard = PCI_DEVICE_ID_PLX_DJINN_ITOO;
 					break;
+				case PCI_DEVICE_ID_PLX_DJINN_ITOO:
+					seekcard = PCI_DEVICE_ID_PLX_OLITEC;
+					break;
 			}
 		}
 	}
@@ -605,6 +609,7 @@ setup_gazelpci(struct IsdnCardState *cs)
 			break;
 		case PCI_DEVICE_ID_PLX_R753:
 		case PCI_DEVICE_ID_PLX_DJINN_ITOO:
+		case PCI_DEVICE_ID_PLX_OLITEC:
 			printk(KERN_INFO "Gazel: Card PCI R753 found\n");
 			cs->subtyp = R753;
 			test_and_set_bit(HW_IPAC, &cs->HW_Flags);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index d2ad2c4f835a..bc4cc10fabe9 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1020,6 +1020,7 @@
 #define PCI_DEVICE_ID_PLX_SPCOM200	0x1103
 #define PCI_DEVICE_ID_PLX_DJINN_ITOO	0x1151
 #define PCI_DEVICE_ID_PLX_R753		0x1152
+#define PCI_DEVICE_ID_PLX_OLITEC	0x1187
 #define PCI_DEVICE_ID_PLX_9030		0x9030
 #define PCI_DEVICE_ID_PLX_9050		0x9050
 #define PCI_DEVICE_ID_PLX_9060		0x9060
-- 
cgit v1.2.3-59-g8ed1b


From 4bfdf37830111321e2cd1fe0102dd776ce93194d Mon Sep 17 00:00:00 2001
From: Andrey Panin <pazke@donpac.ru>
Date: Wed, 27 Jul 2005 11:43:58 -0700
Subject: [PATCH] consolidate CONFIG_WATCHDOG_NOWAYOUT handling

The attached patch removes the #ifdef CONFIG_WATCHDOG_NOWAYOUT mess
duplicated in almost every watchdog driver and replaces it with a common
define in linux/watchdog.h.

Signed-off-by: Andrey Panin <pazke@donpac.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/char/ipmi/ipmi_watchdog.c    |  6 +-----
 drivers/char/watchdog/acquirewdt.c   |  7 +------
 drivers/char/watchdog/advantechwdt.c |  7 +------
 drivers/char/watchdog/alim1535_wdt.c |  7 +------
 drivers/char/watchdog/alim7101_wdt.c |  7 +------
 drivers/char/watchdog/eurotechwdt.c  |  7 +------
 drivers/char/watchdog/i8xx_tco.c     |  7 +------
 drivers/char/watchdog/ib700wdt.c     |  7 +------
 drivers/char/watchdog/indydog.c      |  7 +------
 drivers/char/watchdog/ixp2000_wdt.c  |  6 +-----
 drivers/char/watchdog/ixp4xx_wdt.c   |  6 +-----
 drivers/char/watchdog/machzwd.c      |  7 +------
 drivers/char/watchdog/mixcomwd.c     |  7 +------
 drivers/char/watchdog/pcwd.c         |  7 +------
 drivers/char/watchdog/pcwd_pci.c     |  7 +------
 drivers/char/watchdog/pcwd_usb.c     |  7 +------
 drivers/char/watchdog/s3c2410_wdt.c  |  7 +------
 drivers/char/watchdog/sa1100_wdt.c   |  6 +-----
 drivers/char/watchdog/sbc60xxwdt.c   |  7 +------
 drivers/char/watchdog/sc1200wdt.c    |  7 +------
 drivers/char/watchdog/sc520_wdt.c    |  7 +------
 drivers/char/watchdog/scx200_wdt.c   |  6 +-----
 drivers/char/watchdog/shwdt.c        |  6 +-----
 drivers/char/watchdog/softdog.c      |  7 +------
 drivers/char/watchdog/w83627hf_wdt.c |  7 +------
 drivers/char/watchdog/w83877f_wdt.c  |  7 +------
 drivers/char/watchdog/wafer5823wdt.c |  7 +------
 drivers/char/watchdog/wdt.c          |  7 +------
 drivers/char/watchdog/wdt977.c       |  7 +------
 drivers/char/watchdog/wdt_pci.c      |  7 +------
 drivers/s390/char/vmwatchdog.c       |  6 +-----
 include/linux/watchdog.h             | 10 ++++++++++
 32 files changed, 41 insertions(+), 179 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index fcd1c02a32cb..d35a953961cb 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -131,11 +131,7 @@
 #define	WDIOC_GET_PRETIMEOUT     _IOW(WATCHDOG_IOCTL_BASE, 22, int)
 #endif
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout;
-#endif
+static int nowayout = WATCHDOG_NOWAYOUT;
 
 static ipmi_user_t watchdog_user = NULL;
 
diff --git a/drivers/char/watchdog/acquirewdt.c b/drivers/char/watchdog/acquirewdt.c
index 8f302121741b..7289f4af93d0 100644
--- a/drivers/char/watchdog/acquirewdt.c
+++ b/drivers/char/watchdog/acquirewdt.c
@@ -82,12 +82,7 @@ static int wdt_start = 0x443;
 module_param(wdt_start, int, 0);
 MODULE_PARM_DESC(wdt_start, "Acquire WDT 'start' io port (default 0x443)");
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/advantechwdt.c b/drivers/char/watchdog/advantechwdt.c
index ea73c8379bdd..194a3fd36b91 100644
--- a/drivers/char/watchdog/advantechwdt.c
+++ b/drivers/char/watchdog/advantechwdt.c
@@ -73,12 +73,7 @@ static int timeout = WATCHDOG_TIMEOUT;	/* in seconds */
 module_param(timeout, int, 0);
 MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds. 1<= timeout <=63, default=" __MODULE_STRING(WATCHDOG_TIMEOUT) ".");
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/alim1535_wdt.c b/drivers/char/watchdog/alim1535_wdt.c
index 0715fcf0aed4..8338ca300e2e 100644
--- a/drivers/char/watchdog/alim1535_wdt.c
+++ b/drivers/char/watchdog/alim1535_wdt.c
@@ -38,12 +38,7 @@ static int timeout = WATCHDOG_TIMEOUT;
 module_param(timeout, int, 0);
 MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds. (0<timeout<18000, default=" __MODULE_STRING(WATCHDOG_TIMEOUT) ")");
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/alim7101_wdt.c b/drivers/char/watchdog/alim7101_wdt.c
index 90c091d9e0f5..c05ac188a4d7 100644
--- a/drivers/char/watchdog/alim7101_wdt.c
+++ b/drivers/char/watchdog/alim7101_wdt.c
@@ -75,12 +75,7 @@ static unsigned long wdt_is_open;
 static char wdt_expect_close;
 static struct pci_dev *alim7101_pmu;
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/eurotechwdt.c b/drivers/char/watchdog/eurotechwdt.c
index 2a29a511df7f..25c2f2575611 100644
--- a/drivers/char/watchdog/eurotechwdt.c
+++ b/drivers/char/watchdog/eurotechwdt.c
@@ -72,12 +72,7 @@ static char *ev = "int";
 
 #define WDT_TIMEOUT		60                /* 1 minute */
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/i8xx_tco.c b/drivers/char/watchdog/i8xx_tco.c
index 5d07ee59679d..f975dab1ddf9 100644
--- a/drivers/char/watchdog/i8xx_tco.c
+++ b/drivers/char/watchdog/i8xx_tco.c
@@ -105,12 +105,7 @@ static int heartbeat = WATCHDOG_HEARTBEAT;  /* in seconds */
 module_param(heartbeat, int, 0);
 MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds. (2<heartbeat<39, default=" __MODULE_STRING(WATCHDOG_HEARTBEAT) ")");
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/ib700wdt.c b/drivers/char/watchdog/ib700wdt.c
index d974f16e84d2..cf60329eec85 100644
--- a/drivers/char/watchdog/ib700wdt.c
+++ b/drivers/char/watchdog/ib700wdt.c
@@ -117,12 +117,7 @@ static int wd_times[] = {
 
 static int wd_margin = WD_TIMO;
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/indydog.c b/drivers/char/watchdog/indydog.c
index 6af2c799b57e..b4b94daba67e 100644
--- a/drivers/char/watchdog/indydog.c
+++ b/drivers/char/watchdog/indydog.c
@@ -29,14 +29,9 @@
 #define PFX "indydog: "
 static int indydog_alive;
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
 #define WATCHDOG_TIMEOUT 30		/* 30 sec default timeout */
 
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/ixp2000_wdt.c b/drivers/char/watchdog/ixp2000_wdt.c
index 4b039516cc86..e7640bc4904b 100644
--- a/drivers/char/watchdog/ixp2000_wdt.c
+++ b/drivers/char/watchdog/ixp2000_wdt.c
@@ -30,11 +30,7 @@
 #include <asm/hardware.h>
 #include <asm/uaccess.h>
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
+static int nowayout = WATCHDOG_NOWAYOUT;
 static unsigned int heartbeat = 60;	/* (secs) Default is 1 minute */
 static unsigned long wdt_status;
 
diff --git a/drivers/char/watchdog/ixp4xx_wdt.c b/drivers/char/watchdog/ixp4xx_wdt.c
index 83df369113a4..8d916afbf4fa 100644
--- a/drivers/char/watchdog/ixp4xx_wdt.c
+++ b/drivers/char/watchdog/ixp4xx_wdt.c
@@ -27,11 +27,7 @@
 #include <asm/hardware.h>
 #include <asm/uaccess.h>
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
+static int nowayout = WATCHDOG_NOWAYOUT;
 static int heartbeat = 60;	/* (secs) Default is 1 minute */
 static unsigned long wdt_status;
 static unsigned long boot_status;
diff --git a/drivers/char/watchdog/machzwd.c b/drivers/char/watchdog/machzwd.c
index 9da395fa7794..a9a20aad61e7 100644
--- a/drivers/char/watchdog/machzwd.c
+++ b/drivers/char/watchdog/machzwd.c
@@ -94,12 +94,7 @@ MODULE_DESCRIPTION("MachZ ZF-Logic Watchdog driver");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/mixcomwd.c b/drivers/char/watchdog/mixcomwd.c
index 3143e4a07535..c9b301dccec3 100644
--- a/drivers/char/watchdog/mixcomwd.c
+++ b/drivers/char/watchdog/mixcomwd.c
@@ -62,12 +62,7 @@ static int mixcomwd_timer_alive;
 static struct timer_list mixcomwd_timer = TIMER_INITIALIZER(NULL, 0, 0);
 static char expect_close;
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/pcwd.c b/drivers/char/watchdog/pcwd.c
index 6ebce3f2ef9c..427ad51b7a35 100644
--- a/drivers/char/watchdog/pcwd.c
+++ b/drivers/char/watchdog/pcwd.c
@@ -146,12 +146,7 @@ static int heartbeat = WATCHDOG_HEARTBEAT;
 module_param(heartbeat, int, 0);
 MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds. (2<=heartbeat<=7200, default=" __MODULE_STRING(WATCHDOG_HEARTBEAT) ")");
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/pcwd_pci.c b/drivers/char/watchdog/pcwd_pci.c
index 8ce066627326..2b13afb09c5d 100644
--- a/drivers/char/watchdog/pcwd_pci.c
+++ b/drivers/char/watchdog/pcwd_pci.c
@@ -103,12 +103,7 @@ static int heartbeat = WATCHDOG_HEARTBEAT;
 module_param(heartbeat, int, 0);
 MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds. (0<heartbeat<65536, default=" __MODULE_STRING(WATCHDOG_HEARTBEAT) ")");
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/pcwd_usb.c b/drivers/char/watchdog/pcwd_usb.c
index 1127201d73b8..092e9b133750 100644
--- a/drivers/char/watchdog/pcwd_usb.c
+++ b/drivers/char/watchdog/pcwd_usb.c
@@ -79,12 +79,7 @@ static int heartbeat = WATCHDOG_HEARTBEAT;
 module_param(heartbeat, int, 0);
 MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds. (0<heartbeat<65536, default=" __MODULE_STRING(WATCHDOG_HEARTBEAT) ")");
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/s3c2410_wdt.c b/drivers/char/watchdog/s3c2410_wdt.c
index 1699d2c28ce5..f85ac898a49a 100644
--- a/drivers/char/watchdog/s3c2410_wdt.c
+++ b/drivers/char/watchdog/s3c2410_wdt.c
@@ -62,12 +62,7 @@
 #define CONFIG_S3C2410_WATCHDOG_ATBOOT		(0)
 #define CONFIG_S3C2410_WATCHDOG_DEFAULT_TIME	(15)
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 static int tmr_margin	= CONFIG_S3C2410_WATCHDOG_DEFAULT_TIME;
 static int tmr_atboot	= CONFIG_S3C2410_WATCHDOG_ATBOOT;
 static int soft_noboot	= 0;
diff --git a/drivers/char/watchdog/sa1100_wdt.c b/drivers/char/watchdog/sa1100_wdt.c
index 34e8f7b15e30..1b2132617dc3 100644
--- a/drivers/char/watchdog/sa1100_wdt.c
+++ b/drivers/char/watchdog/sa1100_wdt.c
@@ -42,11 +42,7 @@ static unsigned long sa1100wdt_users;
 static int expect_close;
 static int pre_margin;
 static int boot_status;
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
+static int nowayout = WATCHDOG_NOWAYOUT;
 
 /*
  *	Allow only one person to hold it open
diff --git a/drivers/char/watchdog/sbc60xxwdt.c b/drivers/char/watchdog/sbc60xxwdt.c
index d7de9880605a..ed0bd55fbfc1 100644
--- a/drivers/char/watchdog/sbc60xxwdt.c
+++ b/drivers/char/watchdog/sbc60xxwdt.c
@@ -98,12 +98,7 @@ static int timeout = WATCHDOG_TIMEOUT;	/* in seconds, will be multiplied by HZ t
 module_param(timeout, int, 0);
 MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds. (1<=timeout<=3600, default=" __MODULE_STRING(WATCHDOG_TIMEOUT) ")");
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/sc1200wdt.c b/drivers/char/watchdog/sc1200wdt.c
index 24401e84729e..515ce7572049 100644
--- a/drivers/char/watchdog/sc1200wdt.c
+++ b/drivers/char/watchdog/sc1200wdt.c
@@ -91,12 +91,7 @@ MODULE_PARM_DESC(io, "io port");
 module_param(timeout, int, 0);
 MODULE_PARM_DESC(timeout, "range is 0-255 minutes, default is 1");
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/sc520_wdt.c b/drivers/char/watchdog/sc520_wdt.c
index f6d143e1900d..72501be79b0c 100644
--- a/drivers/char/watchdog/sc520_wdt.c
+++ b/drivers/char/watchdog/sc520_wdt.c
@@ -94,12 +94,7 @@ static int timeout = WATCHDOG_TIMEOUT;	/* in seconds, will be multiplied by HZ t
 module_param(timeout, int, 0);
 MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds. (1<=timeout<=3600, default=" __MODULE_STRING(WATCHDOG_TIMEOUT) ")");
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/scx200_wdt.c b/drivers/char/watchdog/scx200_wdt.c
index b569670e4ed5..c4568569f3a8 100644
--- a/drivers/char/watchdog/scx200_wdt.c
+++ b/drivers/char/watchdog/scx200_wdt.c
@@ -39,15 +39,11 @@ MODULE_DESCRIPTION("NatSemi SCx200 Watchdog Driver");
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
 
-#ifndef CONFIG_WATCHDOG_NOWAYOUT
-#define CONFIG_WATCHDOG_NOWAYOUT 0
-#endif
-
 static int margin = 60;		/* in seconds */
 module_param(margin, int, 0);
 MODULE_PARM_DESC(margin, "Watchdog margin in seconds");
 
-static int nowayout = CONFIG_WATCHDOG_NOWAYOUT;
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Disable watchdog shutdown on close");
 
diff --git a/drivers/char/watchdog/shwdt.c b/drivers/char/watchdog/shwdt.c
index 3bc9272a474c..1f4cab55b2ef 100644
--- a/drivers/char/watchdog/shwdt.c
+++ b/drivers/char/watchdog/shwdt.c
@@ -75,11 +75,7 @@ static unsigned long next_heartbeat;
 #define WATCHDOG_HEARTBEAT 30			/* 30 sec default heartbeat */
 static int heartbeat = WATCHDOG_HEARTBEAT;	/* in seconds */
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
+static int nowayout = WATCHDOG_NOWAYOUT;
 
 /**
  * 	sh_wdt_start - Start the Watchdog
diff --git a/drivers/char/watchdog/softdog.c b/drivers/char/watchdog/softdog.c
index 98c7578740e2..4d7ed931f5c6 100644
--- a/drivers/char/watchdog/softdog.c
+++ b/drivers/char/watchdog/softdog.c
@@ -56,12 +56,7 @@ static int soft_margin = TIMER_MARGIN;	/* in seconds */
 module_param(soft_margin, int, 0);
 MODULE_PARM_DESC(soft_margin, "Watchdog soft_margin in seconds. (0<soft_margin<65536, default=" __MODULE_STRING(TIMER_MARGIN) ")");
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/w83627hf_wdt.c b/drivers/char/watchdog/w83627hf_wdt.c
index 813c97038f84..465e0fd0423d 100644
--- a/drivers/char/watchdog/w83627hf_wdt.c
+++ b/drivers/char/watchdog/w83627hf_wdt.c
@@ -54,12 +54,7 @@ static int timeout = WATCHDOG_TIMEOUT;	/* in seconds */
 module_param(timeout, int, 0);
 MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds. 1<= timeout <=63, default=" __MODULE_STRING(WATCHDOG_TIMEOUT) ".");
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/w83877f_wdt.c b/drivers/char/watchdog/w83877f_wdt.c
index bccbd4d6ac2d..52a8bd0a5988 100644
--- a/drivers/char/watchdog/w83877f_wdt.c
+++ b/drivers/char/watchdog/w83877f_wdt.c
@@ -85,12 +85,7 @@ module_param(timeout, int, 0);
 MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds. (1<=timeout<=3600, default=" __MODULE_STRING(WATCHDOG_TIMEOUT) ")");
 
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/wafer5823wdt.c b/drivers/char/watchdog/wafer5823wdt.c
index abb0bea45c02..7cf6c9bbf486 100644
--- a/drivers/char/watchdog/wafer5823wdt.c
+++ b/drivers/char/watchdog/wafer5823wdt.c
@@ -63,12 +63,7 @@ static int timeout = WD_TIMO;  /* in seconds */
 module_param(timeout, int, 0);
 MODULE_PARM_DESC(timeout, "Watchdog timeout in seconds. 1<= timeout <=255, default=" __MODULE_STRING(WD_TIMO) ".");
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/wdt.c b/drivers/char/watchdog/wdt.c
index 1210ca0c425b..ec7e401228ee 100644
--- a/drivers/char/watchdog/wdt.c
+++ b/drivers/char/watchdog/wdt.c
@@ -63,12 +63,7 @@ static int wd_heartbeat;
 module_param(heartbeat, int, 0);
 MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds. (0<heartbeat<65536, default=" __MODULE_STRING(WD_TIMO) ")");
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/wdt977.c b/drivers/char/watchdog/wdt977.c
index 072e9b214759..44d49dfacbb3 100644
--- a/drivers/char/watchdog/wdt977.c
+++ b/drivers/char/watchdog/wdt977.c
@@ -53,12 +53,7 @@ MODULE_PARM_DESC(timeout,"Watchdog timeout in seconds (60..15300), default=" __M
 module_param(testmode, int, 0);
 MODULE_PARM_DESC(testmode,"Watchdog testmode (1 = no reboot), default=0");
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/char/watchdog/wdt_pci.c b/drivers/char/watchdog/wdt_pci.c
index c80cb77b92fb..4b3311993d48 100644
--- a/drivers/char/watchdog/wdt_pci.c
+++ b/drivers/char/watchdog/wdt_pci.c
@@ -89,12 +89,7 @@ static int wd_heartbeat;
 module_param(heartbeat, int, 0);
 MODULE_PARM_DESC(heartbeat, "Watchdog heartbeat in seconds. (0<heartbeat<65536, default=" __MODULE_STRING(WD_TIMO) ")");
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int nowayout = 1;
-#else
-static int nowayout = 0;
-#endif
-
+static int nowayout = WATCHDOG_NOWAYOUT;
 module_param(nowayout, int, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=CONFIG_WATCHDOG_NOWAYOUT)");
 
diff --git a/drivers/s390/char/vmwatchdog.c b/drivers/s390/char/vmwatchdog.c
index 22cf4fec8da9..5473c23fcb52 100644
--- a/drivers/s390/char/vmwatchdog.c
+++ b/drivers/s390/char/vmwatchdog.c
@@ -23,11 +23,7 @@
 static char vmwdt_cmd[MAX_CMDLEN] = "IPL";
 static int vmwdt_conceal;
 
-#ifdef CONFIG_WATCHDOG_NOWAYOUT
-static int vmwdt_nowayout = 1;
-#else
-static int vmwdt_nowayout = 0;
-#endif
+static int vmwdt_nowayout = WATCHDOG_NOWAYOUT;
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Arnd Bergmann <arndb@de.ibm.com>");
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index 88ba0d29f8c8..1192ed8f4fe8 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -47,4 +47,14 @@ struct watchdog_info {
 #define	WDIOS_ENABLECARD	0x0002	/* Turn on the watchdog timer */
 #define	WDIOS_TEMPPANIC		0x0004	/* Kernel panic on temperature trip */
 
+#ifdef __KERNEL__
+
+#ifdef CONFIG_WATCHDOG_NOWAYOUT
+#define WATCHDOG_NOWAYOUT	1
+#else
+#define WATCHDOG_NOWAYOUT	0
+#endif
+
+#endif	/* __KERNEL__ */
+
 #endif  /* ifndef _LINUX_WATCHDOG_H */
-- 
cgit v1.2.3-59-g8ed1b


From 951f22d5b1f0eaae35dafc669e3774a0c2084d10 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Wed, 27 Jul 2005 11:44:57 -0700
Subject: [PATCH] s390: spin lock retry

Split spin lock and r/w lock implementation into a single try which is done
inline and an out of line function that repeatedly tries to get the lock
before doing the cpu_relax().  Add a system control to set the number of
retries before a cpu is yielded.

The reason for the spin lock retry is that the diagnose 0x44 that is used to
give up the virtual cpu is quite expensive.  For spin locks that are held only
for a short period of time the costs of the diagnoses outweigh the savings
for spin locks that are held for a longer time.  The default retry count is
1000.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/kernel/head64.S    |   3 -
 arch/s390/kernel/setup.c     |   6 --
 arch/s390/lib/Makefile       |   4 +-
 arch/s390/lib/spinlock.c     | 133 +++++++++++++++++++++++
 include/asm-s390/lowcore.h   |   4 +-
 include/asm-s390/processor.h |   5 +-
 include/asm-s390/spinlock.h  | 252 ++++++++++++++-----------------------------
 include/linux/sysctl.h       |   1 +
 kernel/sysctl.c              |  12 ++-
 9 files changed, 230 insertions(+), 190 deletions(-)
 create mode 100644 arch/s390/lib/spinlock.c

(limited to 'include/linux')

diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S
index f525c0c21250..28c50bdf7d40 100644
--- a/arch/s390/kernel/head64.S
+++ b/arch/s390/kernel/head64.S
@@ -658,10 +658,8 @@ startup:basr  %r13,0                     # get base
 #
 	la     %r1,0f-.LPG1(%r13)	# set program check address
 	stg    %r1,__LC_PGM_NEW_PSW+8
-	mvc    __LC_DIAG44_OPCODE(8),.Lnop-.LPG1(%r13)
 	diag   0,0,0x44			# test diag 0x44
 	oi     7(%r12),32		# set diag44 flag
-	mvc    __LC_DIAG44_OPCODE(8),.Ldiag44-.LPG1(%r13)
 0:	
 
 #
@@ -702,7 +700,6 @@ startup:basr  %r13,0                     # get base
 .L4malign:.quad 0xffffffffffc00000
 .Lscan2g:.quad 0x80000000 + 0x20000 - 8 # 2GB + 128K - 8
 .Lnop:	.long  0x07000700
-.Ldiag44:.long 0x83000044
 
 	.org PARMAREA-64
 .Lduct:	.long 0,0,0,0,0,0,0,0
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index a12183989a79..5ba5a5485da9 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -431,12 +431,6 @@ setup_lowcore(void)
 		ctl_set_bit(14, 29);
 	}
 #endif
-#ifdef CONFIG_ARCH_S390X
-	if (MACHINE_HAS_DIAG44)
-		lc->diag44_opcode = 0x83000044;
-	else
-		lc->diag44_opcode = 0x07000700;
-#endif /* CONFIG_ARCH_S390X */
 	set_prefix((u32)(unsigned long) lc);
 }
 
diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index a8758b1d20a9..b701efa1f00e 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -5,5 +5,5 @@
 EXTRA_AFLAGS := -traditional
 
 lib-y += delay.o string.o
-lib-$(CONFIG_ARCH_S390_31) += uaccess.o
-lib-$(CONFIG_ARCH_S390X) += uaccess64.o
+lib-$(CONFIG_ARCH_S390_31) += uaccess.o spinlock.o
+lib-$(CONFIG_ARCH_S390X) += uaccess64.o spinlock.o
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
new file mode 100644
index 000000000000..888b5596c195
--- /dev/null
+++ b/arch/s390/lib/spinlock.c
@@ -0,0 +1,133 @@
+/*
+ *  arch/s390/lib/spinlock.c
+ *    Out of line spinlock code.
+ *
+ *  S390 version
+ *    Copyright (C) 2004 IBM Deutschland Entwicklung GmbH, IBM Corporation
+ *    Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com)
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <asm/io.h>
+
+atomic_t spin_retry_counter;
+int spin_retry = 1000;
+
+/**
+ * spin_retry= parameter
+ */
+static int __init spin_retry_setup(char *str)
+{
+	spin_retry = simple_strtoul(str, &str, 0);
+	return 1;
+}
+__setup("spin_retry=", spin_retry_setup);
+
+static inline void
+_diag44(void)
+{
+#ifdef __s390x__
+	if (MACHINE_HAS_DIAG44)
+#endif
+		asm volatile("diag 0,0,0x44");
+}
+
+void
+_raw_spin_lock_wait(spinlock_t *lp, unsigned int pc)
+{
+	int count = spin_retry;
+
+	while (1) {
+		if (count-- <= 0) {
+			_diag44();
+			count = spin_retry;
+		}
+		atomic_inc(&spin_retry_counter);
+		if (_raw_compare_and_swap(&lp->lock, 0, pc) == 0)
+			return;
+	}
+}
+EXPORT_SYMBOL(_raw_spin_lock_wait);
+
+int
+_raw_spin_trylock_retry(spinlock_t *lp, unsigned int pc)
+{
+	int count = spin_retry;
+
+	while (count-- > 0) {
+		atomic_inc(&spin_retry_counter);
+		if (_raw_compare_and_swap(&lp->lock, 0, pc) == 0)
+			return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(_raw_spin_trylock_retry);
+
+void
+_raw_read_lock_wait(rwlock_t *rw)
+{
+	unsigned int old;
+	int count = spin_retry;
+
+	while (1) {
+		if (count-- <= 0) {
+			_diag44();
+			count = spin_retry;
+		}
+		atomic_inc(&spin_retry_counter);
+		old = rw->lock & 0x7fffffffU;
+		if (_raw_compare_and_swap(&rw->lock, old, old + 1) == old)
+			return;
+	}
+}
+EXPORT_SYMBOL(_raw_read_lock_wait);
+
+int
+_raw_read_trylock_retry(rwlock_t *rw)
+{
+	unsigned int old;
+	int count = spin_retry;
+
+	while (count-- > 0) {
+		atomic_inc(&spin_retry_counter);
+		old = rw->lock & 0x7fffffffU;
+		if (_raw_compare_and_swap(&rw->lock, old, old + 1) == old)
+			return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(_raw_read_trylock_retry);
+
+void
+_raw_write_lock_wait(rwlock_t *rw)
+{
+	int count = spin_retry;
+
+	while (1) {
+		if (count-- <= 0) {
+			_diag44();
+			count = spin_retry;
+		}
+		atomic_inc(&spin_retry_counter);
+		if (_raw_compare_and_swap(&rw->lock, 0, 0x80000000) == 0)
+			return;
+	}
+}
+EXPORT_SYMBOL(_raw_write_lock_wait);
+
+int
+_raw_write_trylock_retry(rwlock_t *rw)
+{
+	int count = spin_retry;
+
+	while (count-- > 0) {
+		atomic_inc(&spin_retry_counter);
+		if (_raw_compare_and_swap(&rw->lock, 0, 0x80000000) == 0)
+			return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(_raw_write_trylock_retry);
diff --git a/include/asm-s390/lowcore.h b/include/asm-s390/lowcore.h
index 76b5b19c0ae2..afe6a9f9b0ae 100644
--- a/include/asm-s390/lowcore.h
+++ b/include/asm-s390/lowcore.h
@@ -90,7 +90,6 @@
 #define __LC_SYSTEM_TIMER		0x278
 #define __LC_LAST_UPDATE_CLOCK		0x280
 #define __LC_STEAL_CLOCK		0x288
-#define __LC_DIAG44_OPCODE		0x290
 #define __LC_KERNEL_STACK               0xD40
 #define __LC_THREAD_INFO		0xD48
 #define __LC_ASYNC_STACK                0xD50
@@ -286,8 +285,7 @@ struct _lowcore
 	__u64        system_timer;             /* 0x278 */
 	__u64        last_update_clock;        /* 0x280 */
 	__u64        steal_clock;              /* 0x288 */
-	__u32        diag44_opcode;            /* 0x290 */
-        __u8         pad8[0xc00-0x294];        /* 0x294 */
+        __u8         pad8[0xc00-0x290];        /* 0x290 */
         /* System info area */
 	__u64        save_area[16];            /* 0xc00 */
         __u8         pad9[0xd40-0xc80];        /* 0xc80 */
diff --git a/include/asm-s390/processor.h b/include/asm-s390/processor.h
index 8bd14de69e35..4ec652ebb3b1 100644
--- a/include/asm-s390/processor.h
+++ b/include/asm-s390/processor.h
@@ -203,7 +203,10 @@ unsigned long get_wchan(struct task_struct *p);
 # define cpu_relax()	asm volatile ("diag 0,0,68" : : : "memory")
 #else /* __s390x__ */
 # define cpu_relax() \
-	asm volatile ("ex 0,%0" : : "i" (__LC_DIAG44_OPCODE) : "memory")
+	do { \
+		if (MACHINE_HAS_DIAG44) \
+			asm volatile ("diag 0,0,68" : : : "memory"); \
+	} while (0)
 #endif /* __s390x__ */
 
 /*
diff --git a/include/asm-s390/spinlock.h b/include/asm-s390/spinlock.h
index 53cc736b9820..8ff10300f7ee 100644
--- a/include/asm-s390/spinlock.h
+++ b/include/asm-s390/spinlock.h
@@ -11,21 +11,16 @@
 #ifndef __ASM_SPINLOCK_H
 #define __ASM_SPINLOCK_H
 
-#ifdef __s390x__
-/*
- * Grmph, take care of %&#! user space programs that include
- * asm/spinlock.h. The diagnose is only available in kernel
- * context.
- */
-#ifdef __KERNEL__
-#include <asm/lowcore.h>
-#define __DIAG44_INSN "ex"
-#define __DIAG44_OPERAND __LC_DIAG44_OPCODE
-#else
-#define __DIAG44_INSN "#"
-#define __DIAG44_OPERAND 0
-#endif
-#endif /* __s390x__ */
+static inline int
+_raw_compare_and_swap(volatile unsigned int *lock,
+		      unsigned int old, unsigned int new)
+{
+	asm volatile ("cs %0,%3,0(%4)"
+		      : "=d" (old), "=m" (*lock)
+		      : "0" (old), "d" (new), "a" (lock), "m" (*lock)
+		      : "cc", "memory" );
+	return old;
+}
 
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
@@ -41,58 +36,35 @@ typedef struct {
 #endif
 } __attribute__ ((aligned (4))) spinlock_t;
 
-#define SPIN_LOCK_UNLOCKED (spinlock_t) { 0 }
-#define spin_lock_init(lp) do { (lp)->lock = 0; } while(0)
+#define SPIN_LOCK_UNLOCKED	(spinlock_t) { 0 }
+#define spin_lock_init(lp)	do { (lp)->lock = 0; } while(0)
 #define spin_unlock_wait(lp)	do { barrier(); } while(((volatile spinlock_t *)(lp))->lock)
-#define spin_is_locked(x) ((x)->lock != 0)
+#define spin_is_locked(x)	((x)->lock != 0)
 #define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
 
-extern inline void _raw_spin_lock(spinlock_t *lp)
+extern void _raw_spin_lock_wait(spinlock_t *lp, unsigned int pc);
+extern int _raw_spin_trylock_retry(spinlock_t *lp, unsigned int pc);
+
+static inline void _raw_spin_lock(spinlock_t *lp)
 {
-#ifndef __s390x__
-	unsigned int reg1, reg2;
-        __asm__ __volatile__("    bras  %0,1f\n"
-                           "0:  diag  0,0,68\n"
-                           "1:  slr   %1,%1\n"
-                           "    cs    %1,%0,0(%3)\n"
-                           "    jl    0b\n"
-                           : "=&d" (reg1), "=&d" (reg2), "=m" (lp->lock)
-			   : "a" (&lp->lock), "m" (lp->lock)
-			   : "cc", "memory" );
-#else /* __s390x__ */
-	unsigned long reg1, reg2;
-        __asm__ __volatile__("    bras  %1,1f\n"
-                           "0:  " __DIAG44_INSN " 0,%4\n"
-                           "1:  slr   %0,%0\n"
-                           "    cs    %0,%1,0(%3)\n"
-                           "    jl    0b\n"
-                           : "=&d" (reg1), "=&d" (reg2), "=m" (lp->lock)
-			   : "a" (&lp->lock), "i" (__DIAG44_OPERAND),
-			     "m" (lp->lock) : "cc", "memory" );
-#endif /* __s390x__ */
+	unsigned long pc = (unsigned long) __builtin_return_address(0);
+
+	if (unlikely(_raw_compare_and_swap(&lp->lock, 0, pc) != 0))
+		_raw_spin_lock_wait(lp, pc);
 }
 
-extern inline int _raw_spin_trylock(spinlock_t *lp)
+static inline int _raw_spin_trylock(spinlock_t *lp)
 {
-	unsigned long reg;
-	unsigned int result;
-
-	__asm__ __volatile__("    basr  %1,0\n"
-			   "0:  cs    %0,%1,0(%3)"
-			   : "=d" (result), "=&d" (reg), "=m" (lp->lock)
-			   : "a" (&lp->lock), "m" (lp->lock), "0" (0)
-			   : "cc", "memory" );
-	return !result;
+	unsigned long pc = (unsigned long) __builtin_return_address(0);
+
+	if (likely(_raw_compare_and_swap(&lp->lock, 0, pc) == 0))
+		return 1;
+	return _raw_spin_trylock_retry(lp, pc);
 }
 
-extern inline void _raw_spin_unlock(spinlock_t *lp)
+static inline void _raw_spin_unlock(spinlock_t *lp)
 {
-	unsigned int old;
-
-	__asm__ __volatile__("cs %0,%3,0(%4)"
-			   : "=d" (old), "=m" (lp->lock)
-			   : "0" (lp->lock), "d" (0), "a" (lp)
-			   : "cc", "memory" );
+	_raw_compare_and_swap(&lp->lock, lp->lock, 0);
 }
 		
 /*
@@ -106,7 +78,7 @@ extern inline void _raw_spin_unlock(spinlock_t *lp)
  * read-locks.
  */
 typedef struct {
-	volatile unsigned long lock;
+	volatile unsigned int lock;
 	volatile unsigned long owner_pc;
 #ifdef CONFIG_PREEMPT
 	unsigned int break_lock;
@@ -129,123 +101,55 @@ typedef struct {
  */
 #define write_can_lock(x) ((x)->lock == 0)
 
-#ifndef __s390x__
-#define _raw_read_lock(rw)   \
-        asm volatile("   l     2,0(%1)\n"   \
-                     "   j     1f\n"     \
-                     "0: diag  0,0,68\n" \
-                     "1: la    2,0(2)\n"     /* clear high (=write) bit */ \
-                     "   la    3,1(2)\n"     /* one more reader */ \
-                     "   cs    2,3,0(%1)\n"  /* try to write new value */ \
-                     "   jl    0b"       \
-                     : "=m" ((rw)->lock) : "a" (&(rw)->lock), \
-		       "m" ((rw)->lock) : "2", "3", "cc", "memory" )
-#else /* __s390x__ */
-#define _raw_read_lock(rw)   \
-        asm volatile("   lg    2,0(%1)\n"   \
-                     "   j     1f\n"     \
-                     "0: " __DIAG44_INSN " 0,%2\n" \
-                     "1: nihh  2,0x7fff\n" /* clear high (=write) bit */ \
-                     "   la    3,1(2)\n"   /* one more reader */  \
-                     "   csg   2,3,0(%1)\n" /* try to write new value */ \
-                     "   jl    0b"       \
-                     : "=m" ((rw)->lock) \
-		     : "a" (&(rw)->lock), "i" (__DIAG44_OPERAND), \
-		       "m" ((rw)->lock) : "2", "3", "cc", "memory" )
-#endif /* __s390x__ */
-
-#ifndef __s390x__
-#define _raw_read_unlock(rw) \
-        asm volatile("   l     2,0(%1)\n"   \
-                     "   j     1f\n"     \
-                     "0: diag  0,0,68\n" \
-                     "1: lr    3,2\n"    \
-                     "   ahi   3,-1\n"    /* one less reader */ \
-                     "   cs    2,3,0(%1)\n" \
-                     "   jl    0b"       \
-                     : "=m" ((rw)->lock) : "a" (&(rw)->lock), \
-		       "m" ((rw)->lock) : "2", "3", "cc", "memory" )
-#else /* __s390x__ */
-#define _raw_read_unlock(rw) \
-        asm volatile("   lg    2,0(%1)\n"   \
-                     "   j     1f\n"     \
-                     "0: " __DIAG44_INSN " 0,%2\n" \
-                     "1: lgr   3,2\n"    \
-                     "   bctgr 3,0\n"    /* one less reader */ \
-                     "   csg   2,3,0(%1)\n" \
-                     "   jl    0b"       \
-                     : "=m" ((rw)->lock) \
-		     : "a" (&(rw)->lock), "i" (__DIAG44_OPERAND), \
-		       "m" ((rw)->lock) : "2", "3", "cc", "memory" )
-#endif /* __s390x__ */
-
-#ifndef __s390x__
-#define _raw_write_lock(rw) \
-        asm volatile("   lhi   3,1\n"    \
-                     "   sll   3,31\n"    /* new lock value = 0x80000000 */ \
-                     "   j     1f\n"     \
-                     "0: diag  0,0,68\n" \
-                     "1: slr   2,2\n"     /* old lock value must be 0 */ \
-                     "   cs    2,3,0(%1)\n" \
-                     "   jl    0b"       \
-                     : "=m" ((rw)->lock) : "a" (&(rw)->lock), \
-		       "m" ((rw)->lock) : "2", "3", "cc", "memory" )
-#else /* __s390x__ */
-#define _raw_write_lock(rw) \
-        asm volatile("   llihh 3,0x8000\n" /* new lock value = 0x80...0 */ \
-                     "   j     1f\n"       \
-                     "0: " __DIAG44_INSN " 0,%2\n"   \
-                     "1: slgr  2,2\n"      /* old lock value must be 0 */ \
-                     "   csg   2,3,0(%1)\n" \
-                     "   jl    0b"         \
-                     : "=m" ((rw)->lock) \
-		     : "a" (&(rw)->lock), "i" (__DIAG44_OPERAND), \
-		       "m" ((rw)->lock) : "2", "3", "cc", "memory" )
-#endif /* __s390x__ */
-
-#ifndef __s390x__
-#define _raw_write_unlock(rw) \
-        asm volatile("   slr   3,3\n"     /* new lock value = 0 */ \
-                     "   j     1f\n"     \
-                     "0: diag  0,0,68\n" \
-                     "1: lhi   2,1\n"    \
-                     "   sll   2,31\n"    /* old lock value must be 0x80000000 */ \
-                     "   cs    2,3,0(%1)\n" \
-                     "   jl    0b"       \
-                     : "=m" ((rw)->lock) : "a" (&(rw)->lock), \
-		       "m" ((rw)->lock) : "2", "3", "cc", "memory" )
-#else /* __s390x__ */
-#define _raw_write_unlock(rw) \
-        asm volatile("   slgr  3,3\n"      /* new lock value = 0 */ \
-                     "   j     1f\n"       \
-                     "0: " __DIAG44_INSN " 0,%2\n"   \
-                     "1: llihh 2,0x8000\n" /* old lock value must be 0x8..0 */\
-                     "   csg   2,3,0(%1)\n"   \
-                     "   jl    0b"         \
-                     : "=m" ((rw)->lock) \
-		     : "a" (&(rw)->lock), "i" (__DIAG44_OPERAND), \
-		       "m" ((rw)->lock) : "2", "3", "cc", "memory" )
-#endif /* __s390x__ */
-
-#define _raw_read_trylock(lock) generic_raw_read_trylock(lock)
-
-extern inline int _raw_write_trylock(rwlock_t *rw)
+extern void _raw_read_lock_wait(rwlock_t *lp);
+extern int _raw_read_trylock_retry(rwlock_t *lp);
+extern void _raw_write_lock_wait(rwlock_t *lp);
+extern int _raw_write_trylock_retry(rwlock_t *lp);
+
+static inline void _raw_read_lock(rwlock_t *rw)
+{
+	unsigned int old;
+	old = rw->lock & 0x7fffffffU;
+	if (_raw_compare_and_swap(&rw->lock, old, old + 1) != old)
+		_raw_read_lock_wait(rw);
+}
+
+static inline void _raw_read_unlock(rwlock_t *rw)
+{
+	unsigned int old, cmp;
+
+	old = rw->lock;
+	do {
+		cmp = old;
+		old = _raw_compare_and_swap(&rw->lock, old, old - 1);
+	} while (cmp != old);
+}
+
+static inline void _raw_write_lock(rwlock_t *rw)
+{
+	if (unlikely(_raw_compare_and_swap(&rw->lock, 0, 0x80000000) != 0))
+		_raw_write_lock_wait(rw);
+}
+
+static inline void _raw_write_unlock(rwlock_t *rw)
+{
+	_raw_compare_and_swap(&rw->lock, 0x80000000, 0);
+}
+
+static inline int _raw_read_trylock(rwlock_t *rw)
+{
+	unsigned int old;
+	old = rw->lock & 0x7fffffffU;
+	if (likely(_raw_compare_and_swap(&rw->lock, old, old + 1) == old))
+		return 1;
+	return _raw_read_trylock_retry(rw);
+}
+
+static inline int _raw_write_trylock(rwlock_t *rw)
 {
-	unsigned long result, reg;
-	
-	__asm__ __volatile__(
-#ifndef __s390x__
-			     "   lhi  %1,1\n"
-			     "   sll  %1,31\n"
-			     "   cs   %0,%1,0(%3)"
-#else /* __s390x__ */
-			     "   llihh %1,0x8000\n"
-			     "0: csg %0,%1,0(%3)\n"
-#endif /* __s390x__ */
-			     : "=d" (result), "=&d" (reg), "=m" (rw->lock)
-			     : "a" (&rw->lock), "m" (rw->lock), "0" (0UL)
-			     : "cc", "memory" );
-	return result == 0;
+	if (likely(_raw_compare_and_swap(&rw->lock, 0, 0x80000000) == 0))
+		return 1;
+	return _raw_write_trylock_retry(rw);
 }
 
 #endif /* __ASM_SPINLOCK_H */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index bfbbe94b297d..e82be96d4906 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -145,6 +145,7 @@ enum
 	KERN_BOOTLOADER_TYPE=67, /* int: boot loader type */
 	KERN_RANDOMIZE=68, /* int: randomize virtual address space */
 	KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */
+	KERN_SPIN_RETRY=70,	/* int: number of spinlock retries */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e60b9c36f1f0..3e0bbee549ea 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -114,6 +114,7 @@ extern int unaligned_enabled;
 extern int sysctl_ieee_emulation_warnings;
 #endif
 extern int sysctl_userprocess_debug;
+extern int spin_retry;
 #endif
 
 extern int sysctl_hz_timer;
@@ -647,7 +648,16 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-
+#if defined(CONFIG_ARCH_S390)
+	{
+		.ctl_name	= KERN_SPIN_RETRY,
+		.procname	= "spin_retry",
+		.data		= &spin_retry,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From c293621bbf678a3d85e3ed721c3921c8a670610d Mon Sep 17 00:00:00 2001
From: Peter Staubach <staubach@redhat.com>
Date: Wed, 27 Jul 2005 11:45:09 -0700
Subject: [PATCH] stale POSIX lock handling

I believe that there is a problem with the handling of POSIX locks, which
the attached patch should address.

The problem appears to be a race between fcntl(2) and close(2).  A
multithreaded application could close a file descriptor at the same time as
it is trying to acquire a lock using the same file descriptor.  I would
suggest that that multithreaded application is not providing the proper
synchronization for itself, but the OS should still behave correctly.

SUS3 (Single UNIX Specification Version 3, read: POSIX) indicates that when
a file descriptor is closed, that all POSIX locks on the file, owned by the
process which closed the file descriptor, should be released.

The trick here is when those locks are released.  The current code releases
all locks which exist when close is processing, but any locks in progress
are handled when the last reference to the open file is released.

There are three cases to consider.

One is the simple case, a multithreaded (mt) process has a file open and
races to close it and acquire a lock on it.  In this case, the close will
release one reference to the open file and when the fcntl is done, it will
release the other reference.  For this situation, no locks should exist on
the file when both the close and fcntl operations are done.  The current
system will handle this case because the last reference to the open file is
being released.

The second case is when the mt process has dup(2)'d the file descriptor.
The close will release one reference to the file and the fcntl, when done,
will release another, but there will still be at least one more reference
to the open file.  One could argue that the existence of a lock on the file
after the close has completed is okay, because it was acquired after the
close operation and there is still a way for the application to release the
lock on the file, using an existing file descriptor.

The third case is when the mt process has forked, after opening the file
and either before or after becoming an mt process.  In this case, each
process would hold a reference to the open file.  For each process, this
degenerates to the first case above.  However, the lock continues to exist
until both processes have released their references to the open file.  This
lock could block other lock requests.

The changes to release the lock when the last reference to the open file
is released aren't quite right because they would allow the lock to exist
as long as there was a reference to the open file.  This is too long.

The new proposed solution is to add support in the fcntl code path to
detect a race with close and then to release the lock which was just
acquired when such a race is detected.  This causes locks to be released
in a timely fashion and for the system to conform to the POSIX semantic
specification.

This was tested by instrumenting a kernel to detect the dangling locks and
then running a program which generates case #3 above.  A dangling lock
could be reliably generated.  When the changes to detect the close/fcntl
race were added, a dangling lock could no longer be generated.

Cc: Matthew Wilcox <willy@debian.org>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/fcntl.c         |  5 ++--
 fs/locks.c         | 81 ++++++++++++++++++++++++++++++++----------------------
 include/linux/fs.h |  6 ++--
 3 files changed, 55 insertions(+), 37 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 286a9f8f3d49..6fbc9d8fcc36 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -288,7 +288,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 		break;
 	case F_SETLK:
 	case F_SETLKW:
-		err = fcntl_setlk(filp, cmd, (struct flock __user *) arg);
+		err = fcntl_setlk(fd, filp, cmd, (struct flock __user *) arg);
 		break;
 	case F_GETOWN:
 		/*
@@ -376,7 +376,8 @@ asmlinkage long sys_fcntl64(unsigned int fd, unsigned int cmd, unsigned long arg
 			break;
 		case F_SETLK64:
 		case F_SETLKW64:
-			err = fcntl_setlk64(filp, cmd, (struct flock64 __user *) arg);
+			err = fcntl_setlk64(fd, filp, cmd,
+					(struct flock64 __user *) arg);
 			break;
 		default:
 			err = do_fcntl(fd, cmd, arg, filp);
diff --git a/fs/locks.c b/fs/locks.c
index 29fa5da6c117..11956b6179ff 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1591,7 +1591,8 @@ out:
 /* Apply the lock described by l to an open file descriptor.
  * This implements both the F_SETLK and F_SETLKW commands of fcntl().
  */
-int fcntl_setlk(struct file *filp, unsigned int cmd, struct flock __user *l)
+int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
+		struct flock __user *l)
 {
 	struct file_lock *file_lock = locks_alloc_lock();
 	struct flock flock;
@@ -1620,6 +1621,7 @@ int fcntl_setlk(struct file *filp, unsigned int cmd, struct flock __user *l)
 		goto out;
 	}
 
+again:
 	error = flock_to_posix_lock(filp, file_lock, &flock);
 	if (error)
 		goto out;
@@ -1648,25 +1650,33 @@ int fcntl_setlk(struct file *filp, unsigned int cmd, struct flock __user *l)
 	if (error)
 		goto out;
 
-	if (filp->f_op && filp->f_op->lock != NULL) {
+	if (filp->f_op && filp->f_op->lock != NULL)
 		error = filp->f_op->lock(filp, cmd, file_lock);
-		goto out;
-	}
+	else {
+		for (;;) {
+			error = __posix_lock_file(inode, file_lock);
+			if ((error != -EAGAIN) || (cmd == F_SETLK))
+				break;
+			error = wait_event_interruptible(file_lock->fl_wait,
+					!file_lock->fl_next);
+			if (!error)
+				continue;
 
-	for (;;) {
-		error = __posix_lock_file(inode, file_lock);
-		if ((error != -EAGAIN) || (cmd == F_SETLK))
+			locks_delete_block(file_lock);
 			break;
-		error = wait_event_interruptible(file_lock->fl_wait,
-				!file_lock->fl_next);
-		if (!error)
-			continue;
+		}
+	}
 
-		locks_delete_block(file_lock);
-		break;
+	/*
+	 * Attempt to detect a close/fcntl race and recover by
+	 * releasing the lock that was just acquired.
+	 */
+	if (!error && fcheck(fd) != filp && flock.l_type != F_UNLCK) {
+		flock.l_type = F_UNLCK;
+		goto again;
 	}
 
- out:
+out:
 	locks_free_lock(file_lock);
 	return error;
 }
@@ -1724,7 +1734,8 @@ out:
 /* Apply the lock described by l to an open file descriptor.
  * This implements both the F_SETLK and F_SETLKW commands of fcntl().
  */
-int fcntl_setlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
+int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
+		struct flock64 __user *l)
 {
 	struct file_lock *file_lock = locks_alloc_lock();
 	struct flock64 flock;
@@ -1753,6 +1764,7 @@ int fcntl_setlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
 		goto out;
 	}
 
+again:
 	error = flock64_to_posix_lock(filp, file_lock, &flock);
 	if (error)
 		goto out;
@@ -1781,22 +1793,30 @@ int fcntl_setlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
 	if (error)
 		goto out;
 
-	if (filp->f_op && filp->f_op->lock != NULL) {
+	if (filp->f_op && filp->f_op->lock != NULL)
 		error = filp->f_op->lock(filp, cmd, file_lock);
-		goto out;
-	}
+	else {
+		for (;;) {
+			error = __posix_lock_file(inode, file_lock);
+			if ((error != -EAGAIN) || (cmd == F_SETLK64))
+				break;
+			error = wait_event_interruptible(file_lock->fl_wait,
+					!file_lock->fl_next);
+			if (!error)
+				continue;
 
-	for (;;) {
-		error = __posix_lock_file(inode, file_lock);
-		if ((error != -EAGAIN) || (cmd == F_SETLK64))
+			locks_delete_block(file_lock);
 			break;
-		error = wait_event_interruptible(file_lock->fl_wait,
-				!file_lock->fl_next);
-		if (!error)
-			continue;
+		}
+	}
 
-		locks_delete_block(file_lock);
-		break;
+	/*
+	 * Attempt to detect a close/fcntl race and recover by
+	 * releasing the lock that was just acquired.
+	 */
+	if (!error && fcheck(fd) != filp && flock.l_type != F_UNLCK) {
+		flock.l_type = F_UNLCK;
+		goto again;
 	}
 
 out:
@@ -1888,12 +1908,7 @@ void locks_remove_flock(struct file *filp)
 
 	while ((fl = *before) != NULL) {
 		if (fl->fl_file == filp) {
-			/*
-			 * We might have a POSIX lock that was created at the same time
-			 * the filp was closed for the last time. Just remove that too,
-			 * regardless of ownership, since nobody can own it.
-			 */
-			if (IS_FLOCK(fl) || IS_POSIX(fl)) {
+			if (IS_FLOCK(fl)) {
 				locks_delete_lock(before);
 				continue;
 			}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0f53e0124941..f9adf75fd9b4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -697,11 +697,13 @@ extern struct list_head file_lock_list;
 #include <linux/fcntl.h>
 
 extern int fcntl_getlk(struct file *, struct flock __user *);
-extern int fcntl_setlk(struct file *, unsigned int, struct flock __user *);
+extern int fcntl_setlk(unsigned int, struct file *, unsigned int,
+			struct flock __user *);
 
 #if BITS_PER_LONG == 32
 extern int fcntl_getlk64(struct file *, struct flock64 __user *);
-extern int fcntl_setlk64(struct file *, unsigned int, struct flock64 __user *);
+extern int fcntl_setlk64(unsigned int, struct file *, unsigned int,
+			struct flock64 __user *);
 #endif
 
 extern void send_sigio(struct fown_struct *fown, int fd, int band);
-- 
cgit v1.2.3-59-g8ed1b


From 8c52ab42c11b5a7fb44bb84c954d09968e90e9e7 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Wed, 27 Jul 2005 11:45:15 -0700
Subject: [PATCH] mbcache: Remove unused mb_cache_shrink parameter

The cache parameter to mb_cache_shrink isn't used.  We may as well remove
it.

Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext2/xattr.c         | 2 +-
 fs/ext3/xattr.c         | 2 +-
 fs/mbcache.c            | 3 +--
 include/linux/mbcache.h | 2 +-
 4 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 27982b500e84..0099462d4271 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -823,7 +823,7 @@ cleanup:
 void
 ext2_xattr_put_super(struct super_block *sb)
 {
-	mb_cache_shrink(ext2_xattr_cache, sb->s_bdev);
+	mb_cache_shrink(sb->s_bdev);
 }
 
 
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 3f9dfa643b19..269c7b92db9a 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -1106,7 +1106,7 @@ cleanup:
 void
 ext3_xattr_put_super(struct super_block *sb)
 {
-	mb_cache_shrink(ext3_xattr_cache, sb->s_bdev);
+	mb_cache_shrink(sb->s_bdev);
 }
 
 /*
diff --git a/fs/mbcache.c b/fs/mbcache.c
index c7170b9221a3..b002a088857d 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -316,11 +316,10 @@ fail:
  * currently in use cannot be freed, and thus remain in the cache. All others
  * are freed.
  *
- * @cache: which cache to shrink
  * @bdev: which device's cache entries to shrink
  */
 void
-mb_cache_shrink(struct mb_cache *cache, struct block_device *bdev)
+mb_cache_shrink(struct block_device *bdev)
 {
 	LIST_HEAD(free_list);
 	struct list_head *l, *ltmp;
diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h
index 8e5a10410a30..9263d2db2d67 100644
--- a/include/linux/mbcache.h
+++ b/include/linux/mbcache.h
@@ -29,7 +29,7 @@ struct mb_cache_op {
 
 struct mb_cache * mb_cache_create(const char *, struct mb_cache_op *, size_t,
 				  int, int);
-void mb_cache_shrink(struct mb_cache *, struct block_device *);
+void mb_cache_shrink(struct block_device *);
 void mb_cache_destroy(struct mb_cache *);
 
 /* Functions on cache entries */
-- 
cgit v1.2.3-59-g8ed1b


From 44456d37b59d8e541936ed26d8b6e08d27e88ac1 Mon Sep 17 00:00:00 2001
From: Olaf Hering <olh@suse.de>
Date: Wed, 27 Jul 2005 11:45:17 -0700
Subject: [PATCH] turn many #if $undefined_string into #ifdef $undefined_string

Turn many #if $undefined_string into #ifdef $undefined_string to fix some
warnings after -Wundef was added to the global CFLAGS.

Signed-off-by: Olaf Hering <olh@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc64/kernel/udbg.c             |  2 +-
 arch/um/drivers/cow.h                |  4 ++--
 arch/um/kernel/skas/syscall_user.c   |  4 ++--
 arch/um/os-Linux/elf_aux.c           |  1 +
 arch/x86_64/ia32/ia32_aout.c         |  6 +++---
 arch/x86_64/kernel/smpboot.c         | 10 +++++-----
 drivers/block/sx8.c                  |  4 ++--
 drivers/cdrom/mcdx.c                 |  8 ++++----
 drivers/char/rio/rioboot.c           | 12 ++++++------
 drivers/char/rio/rioroute.c          |  2 +-
 drivers/char/rio/riotable.c          |  2 +-
 drivers/ieee1394/sbp2.c              |  1 +
 drivers/isdn/hisax/l3dss1.c          |  8 ++++----
 drivers/md/bitmap.c                  |  8 ++++----
 drivers/mtd/devices/docecc.c         |  1 +
 drivers/net/8139too.c                |  6 +++---
 drivers/net/amd8111e.c               |  2 +-
 drivers/net/ne.c                     |  4 ++--
 drivers/scsi/NCR53c406a.c            |  4 ++--
 drivers/scsi/aic7xxx/aic79xx_osm.c   |  2 +-
 drivers/scsi/aic7xxx/aic79xx_pci.c   |  2 +-
 drivers/scsi/dpt/dptsig.h            |  4 ++--
 drivers/scsi/dtc.c                   |  4 ----
 drivers/scsi/dtc.h                   |  4 ++++
 drivers/scsi/initio.c                |  2 +-
 drivers/scsi/lpfc/lpfc_compat.h      |  3 ++-
 drivers/scsi/lpfc/lpfc_scsi.h        |  4 +++-
 drivers/scsi/pas16.c                 |  1 +
 drivers/scsi/sym53c8xx_2/sym_hipd.h  | 16 ++++++++++------
 drivers/scsi/sym53c8xx_2/sym_nvram.c |  2 +-
 drivers/scsi/t128.h                  |  1 +
 drivers/video/riva/fbdev.c           |  2 +-
 fs/ntfs/sysctl.h                     |  2 +-
 include/linux/ftape.h                |  2 +-
 net/ipv6/ip6_output.c                |  7 +------
 sound/isa/sb/sb_mixer.c              |  4 ++--
 sound/oss/pss.c                      |  2 +-
 sound/pci/rme9652/rme9652.c          |  2 +-
 38 files changed, 81 insertions(+), 74 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ppc64/kernel/udbg.c b/arch/ppc64/kernel/udbg.c
index d4ccd6f1ef47..c0da45540f0f 100644
--- a/arch/ppc64/kernel/udbg.c
+++ b/arch/ppc64/kernel/udbg.c
@@ -141,7 +141,7 @@ void udbg_init_scc(struct device_node *np)
 
 #endif /* CONFIG_PPC_PMAC */
 
-#if CONFIG_PPC_PMAC
+#ifdef CONFIG_PPC_PMAC
 static void udbg_real_putc(unsigned char c)
 {
 	while ((real_readb(sccc) & SCC_TXRDY) == 0)
diff --git a/arch/um/drivers/cow.h b/arch/um/drivers/cow.h
index 4fcbe8b1b77e..4fcf3a8d13f4 100644
--- a/arch/um/drivers/cow.h
+++ b/arch/um/drivers/cow.h
@@ -3,10 +3,10 @@
 
 #include <asm/types.h>
 
-#if __BYTE_ORDER == __BIG_ENDIAN
+#if defined(__BIG_ENDIAN)
 # define ntohll(x) (x)
 # define htonll(x) (x)
-#elif __BYTE_ORDER == __LITTLE_ENDIAN
+#elif defined(__LITTLE_ENDIAN)
 # define ntohll(x)  bswap_64(x)
 # define htonll(x)  bswap_64(x)
 #else
diff --git a/arch/um/kernel/skas/syscall_user.c b/arch/um/kernel/skas/syscall_user.c
index 2828e6e37721..6b0664970147 100644
--- a/arch/um/kernel/skas/syscall_user.c
+++ b/arch/um/kernel/skas/syscall_user.c
@@ -15,7 +15,7 @@
 void handle_syscall(union uml_pt_regs *regs)
 {
 	long result;
-#if UML_CONFIG_SYSCALL_DEBUG
+#ifdef UML_CONFIG_SYSCALL_DEBUG
   	int index;
 
   	index = record_syscall_start(UPT_SYSCALL_NR(regs));
@@ -27,7 +27,7 @@ void handle_syscall(union uml_pt_regs *regs)
 	REGS_SET_SYSCALL_RETURN(regs->skas.regs, result);
 
 	syscall_trace(regs, 1);
-#if UML_CONFIG_SYSCALL_DEBUG
+#ifdef UML_CONFIG_SYSCALL_DEBUG
   	record_syscall_end(index, result);
 #endif
 }
diff --git a/arch/um/os-Linux/elf_aux.c b/arch/um/os-Linux/elf_aux.c
index f0d6060e3e57..5423b1ca17c4 100644
--- a/arch/um/os-Linux/elf_aux.c
+++ b/arch/um/os-Linux/elf_aux.c
@@ -11,6 +11,7 @@
 #include <stddef.h>
 #include "init.h"
 #include "elf_user.h"
+#include <asm/elf.h>
 
 #if ELF_CLASS == ELFCLASS32
 typedef Elf32_auxv_t elf_auxv_t;
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
index c12edf5d97f0..3e6780fa0186 100644
--- a/arch/x86_64/ia32/ia32_aout.c
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -42,7 +42,7 @@ extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
 static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
 static int load_aout_library(struct file*);
 
-#if CORE_DUMP
+#ifdef CORE_DUMP
 static int aout_core_dump(long signr, struct pt_regs * regs, struct file *file);
 
 /*
@@ -103,7 +103,7 @@ static struct linux_binfmt aout_format = {
 	.module		= THIS_MODULE,
 	.load_binary	= load_aout_binary,
 	.load_shlib	= load_aout_library,
-#if CORE_DUMP
+#ifdef CORE_DUMP
 	.core_dump	= aout_core_dump,
 #endif
 	.min_coredump	= PAGE_SIZE
@@ -120,7 +120,7 @@ static void set_brk(unsigned long start, unsigned long end)
 	up_write(&current->mm->mmap_sem);
 }
 
-#if CORE_DUMP
+#ifdef CORE_DUMP
 /*
  * These are the only things you should do on a core-file: use only these
  * macros to write out all the necessary info.
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index e66edfa1f3b9..e773a794ec45 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -285,7 +285,7 @@ static __cpuinit void sync_tsc(void)
 	int i, done = 0;
 	long delta, adj, adjust_latency = 0;
 	unsigned long flags, rt, master_time_stamp, bound;
-#if DEBUG_TSC_SYNC
+#ifdef DEBUG_TSC_SYNC
 	static struct syncdebug {
 		long rt;	/* roundtrip time */
 		long master;	/* master's timestamp */
@@ -321,7 +321,7 @@ static __cpuinit void sync_tsc(void)
 				rdtscll(t);
 				wrmsrl(MSR_IA32_TSC, t + adj);
 			}
-#if DEBUG_TSC_SYNC
+#ifdef DEBUG_TSC_SYNC
 			t[i].rt = rt;
 			t[i].master = master_time_stamp;
 			t[i].diff = delta;
@@ -331,7 +331,7 @@ static __cpuinit void sync_tsc(void)
 	}
 	spin_unlock_irqrestore(&tsc_sync_lock, flags);
 
-#if DEBUG_TSC_SYNC
+#ifdef DEBUG_TSC_SYNC
 	for (i = 0; i < NUM_ROUNDS; ++i)
 		printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
 		       t[i].rt, t[i].master, t[i].diff, t[i].lat);
@@ -537,7 +537,7 @@ void __cpuinit start_secondary(void)
 extern volatile unsigned long init_rsp;
 extern void (*initial_code)(void);
 
-#if APIC_DEBUG
+#ifdef APIC_DEBUG
 static void inquire_remote_apic(int apicid)
 {
 	unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
@@ -841,7 +841,7 @@ do_rest:
 			else
 				/* trampoline code not run */
 				printk("Not responding.\n");
-#if APIC_DEBUG
+#ifdef APIC_DEBUG
 			inquire_remote_apic(apicid);
 #endif
 		}
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 9db0a9e3e59c..d57007b92f77 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -1582,7 +1582,7 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (rc)
 		goto err_out;
 
-#if IF_64BIT_DMA_IS_POSSIBLE /* grrrr... */
+#ifdef IF_64BIT_DMA_IS_POSSIBLE /* grrrr... */
 	rc = pci_set_dma_mask(pdev, DMA_64BIT_MASK);
 	if (!rc) {
 		rc = pci_set_consistent_dma_mask(pdev, DMA_64BIT_MASK);
@@ -1601,7 +1601,7 @@ static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
 			goto err_out_regions;
 		}
 		pci_dac = 0;
-#if IF_64BIT_DMA_IS_POSSIBLE /* grrrr... */
+#ifdef IF_64BIT_DMA_IS_POSSIBLE /* grrrr... */
 	}
 #endif
 
diff --git a/drivers/cdrom/mcdx.c b/drivers/cdrom/mcdx.c
index 07bbd24e3c18..b89420e6d704 100644
--- a/drivers/cdrom/mcdx.c
+++ b/drivers/cdrom/mcdx.c
@@ -51,7 +51,7 @@
  */
 
 
-#if RCS
+#ifdef RCS
 static const char *mcdx_c_version
     = "$Id: mcdx.c,v 1.21 1997/01/26 07:12:59 davem Exp $";
 #endif
@@ -706,7 +706,7 @@ static int mcdx_open(struct cdrom_device_info *cdi, int purpose)
 		xtrace(OPENCLOSE, "open() init irq generation\n");
 		if (-1 == mcdx_config(stuffp, 1))
 			return -EIO;
-#if FALLBACK
+#ifdef FALLBACK
 		/* Set the read speed */
 		xwarn("AAA %x AAA\n", stuffp->readcmd);
 		if (stuffp->readerrs)
@@ -1216,7 +1216,7 @@ static int __init mcdx_init_drive(int drive)
 	}
 
 
-#if WE_KNOW_WHY
+#ifdef WE_KNOW_WHY
 	/* irq 11 -> channel register */
 	outb(0x50, stuffp->wreg_chn);
 #endif
@@ -1294,7 +1294,7 @@ static int mcdx_transfer(struct s_drive_stuff *stuffp,
 
 	ans = mcdx_xfer(stuffp, p, sector, nr_sectors);
 	return ans;
-#if FALLBACK
+#ifdef FALLBACK
 	if (-1 == ans)
 		stuffp->readerrs++;
 	else
diff --git a/drivers/char/rio/rioboot.c b/drivers/char/rio/rioboot.c
index a8be11dfcba3..34cbb13aad4b 100644
--- a/drivers/char/rio/rioboot.c
+++ b/drivers/char/rio/rioboot.c
@@ -902,7 +902,7 @@ static int RIOBootComplete( struct rio_info *p, struct Host *HostP, uint Rup, st
 	       (HostP->Mapping[entry].RtaUniqueNum==RtaUniq))
 	    {
 	        HostP->Mapping[entry].Flags |= RTA_BOOTED|RTA_NEWBOOT;
-#if NEED_TO_FIX
+#ifdef NEED_TO_FIX
 		RIO_SV_BROADCAST(HostP->svFlags[entry]);
 #endif
 		if ( (sysport=HostP->Mapping[entry].SysPort) != NO_PORT )
@@ -918,7 +918,7 @@ static int RIOBootComplete( struct rio_info *p, struct Host *HostP, uint Rup, st
 		   {
 			entry2 = HostP->Mapping[entry].ID2 - 1;
 			HostP->Mapping[entry2].Flags |= RTA_BOOTED|RTA_NEWBOOT;
-#if NEED_TO_FIX
+#ifdef NEED_TO_FIX
 			RIO_SV_BROADCAST(HostP->svFlags[entry2]);
 #endif
 			sysport = HostP->Mapping[entry2].SysPort;
@@ -1143,7 +1143,7 @@ static int RIOBootComplete( struct rio_info *p, struct Host *HostP, uint Rup, st
 		    CCOPY( MapP->Name, HostP->Mapping[entry].Name, MAX_NAME_LEN );
 		    HostP->Mapping[entry].Flags =
 		     SLOT_IN_USE | RTA_BOOTED | RTA_NEWBOOT;
-#if NEED_TO_FIX
+#ifdef NEED_TO_FIX
 		    RIO_SV_BROADCAST(HostP->svFlags[entry]);
 #endif
 		    RIOReMapPorts( p, HostP, &HostP->Mapping[entry] );
@@ -1159,7 +1159,7 @@ static int RIOBootComplete( struct rio_info *p, struct Host *HostP, uint Rup, st
    "This RTA has a tentative entry on another host - delete that entry (1)\n");
 		    HostP->Mapping[entry].Flags =
 		     SLOT_TENTATIVE | RTA_BOOTED | RTA_NEWBOOT;
-#if NEED_TO_FIX
+#ifdef NEED_TO_FIX
 		    RIO_SV_BROADCAST(HostP->svFlags[entry]);
 #endif
 		}
@@ -1169,7 +1169,7 @@ static int RIOBootComplete( struct rio_info *p, struct Host *HostP, uint Rup, st
 		    {
 			HostP->Mapping[entry2].Flags = SLOT_IN_USE |
 			 RTA_BOOTED | RTA_NEWBOOT | RTA16_SECOND_SLOT;
-#if NEED_TO_FIX
+#ifdef NEED_TO_FIX
 			RIO_SV_BROADCAST(HostP->svFlags[entry2]);
 #endif
 			HostP->Mapping[entry2].SysPort = MapP2->SysPort;
@@ -1188,7 +1188,7 @@ static int RIOBootComplete( struct rio_info *p, struct Host *HostP, uint Rup, st
 		    else
 			HostP->Mapping[entry2].Flags = SLOT_TENTATIVE |
 			 RTA_BOOTED | RTA_NEWBOOT | RTA16_SECOND_SLOT;
-#if NEED_TO_FIX
+#ifdef NEED_TO_FIX
 			RIO_SV_BROADCAST(HostP->svFlags[entry2]);
 #endif
 		    bzero( (caddr_t)MapP2, sizeof(struct Map) );
diff --git a/drivers/char/rio/rioroute.c b/drivers/char/rio/rioroute.c
index 106b31f48a21..e9564c9fb37c 100644
--- a/drivers/char/rio/rioroute.c
+++ b/drivers/char/rio/rioroute.c
@@ -1023,7 +1023,7 @@ RIOFreeDisconnected(struct rio_info *p, struct Host *HostP, int unit)
     if (link < LINKS_PER_UNIT)
 	    return 1;
 
-#if NEED_TO_FIX_THIS
+#ifdef NEED_TO_FIX_THIS
     /* Ok so all the links are disconnected. But we may have only just
     ** made this slot tentative and not yet received a topology update.
     ** Lets check how long ago we made it tentative.
diff --git a/drivers/char/rio/riotable.c b/drivers/char/rio/riotable.c
index 8fb26ad2aa12..e45bc275907a 100644
--- a/drivers/char/rio/riotable.c
+++ b/drivers/char/rio/riotable.c
@@ -771,7 +771,7 @@ int RIOAssignRta( struct rio_info *p, struct Map *MapP )
 	    if ((MapP->Flags & RTA16_SECOND_SLOT) == 0)
 	      CCOPY( MapP->Name, HostMapP->Name, MAX_NAME_LEN );
 	    HostMapP->Flags = SLOT_IN_USE | RTA_BOOTED;
-#if NEED_TO_FIX
+#ifdef NEED_TO_FIX
 	    RIO_SV_BROADCAST(p->RIOHosts[host].svFlags[MapP->ID-1]);
 #endif
 	    if (MapP->Flags & RTA16_SECOND_SLOT)
diff --git a/drivers/ieee1394/sbp2.c b/drivers/ieee1394/sbp2.c
index fe3e1703fa61..627af507643a 100644
--- a/drivers/ieee1394/sbp2.c
+++ b/drivers/ieee1394/sbp2.c
@@ -169,6 +169,7 @@ MODULE_DEVICE_TABLE(ieee1394, sbp2_id_table);
  * Debug levels, configured via kernel config, or enable here.
  */
 
+#define CONFIG_IEEE1394_SBP2_DEBUG 0
 /* #define CONFIG_IEEE1394_SBP2_DEBUG_ORBS */
 /* #define CONFIG_IEEE1394_SBP2_DEBUG_DMA */
 /* #define CONFIG_IEEE1394_SBP2_DEBUG 1 */
diff --git a/drivers/isdn/hisax/l3dss1.c b/drivers/isdn/hisax/l3dss1.c
index a6d2abdb478a..e96845cdd4f6 100644
--- a/drivers/isdn/hisax/l3dss1.c
+++ b/drivers/isdn/hisax/l3dss1.c
@@ -353,7 +353,7 @@ l3dss1_parse_facility(struct PStack *st, struct l3_process *pc,
 			         { l3dss1_dummy_invoke(st, cr, id, ident, p, nlen);
                                    return;
                                  } 
-#if HISAX_DE_AOC
+#ifdef HISAX_DE_AOC
 			{
 
 #define FOO1(s,a,b) \
@@ -977,7 +977,7 @@ l3dss1_release_cmpl(struct l3_process *pc, u_char pr, void *arg)
 	dss1_release_l3_process(pc);
 }
 
-#if EXT_BEARER_CAPS
+#ifdef EXT_BEARER_CAPS
 
 static u_char *
 EncodeASyncParams(u_char * p, u_char si2)
@@ -1369,7 +1369,7 @@ l3dss1_setup_req(struct l3_process *pc, u_char pr,
 				*p++ = *sub++ & 0x7f;
 		}
         }
-#if EXT_BEARER_CAPS
+#ifdef EXT_BEARER_CAPS
 	if ((pc->para.setup.si2 >= 160) && (pc->para.setup.si2 <= 175)) {	// sync. Bitratenadaption, V.110/X.30
 
 		*p++ = IE_LLC;
@@ -1609,7 +1609,7 @@ l3dss1_setup(struct l3_process *pc, u_char pr, void *arg)
 				case 0x08: /* Unrestricted digital information */
 					pc->para.setup.si1 = 7;
 /* JIM, 05.11.97 I wanna set service indicator 2 */
-#if EXT_BEARER_CAPS
+#ifdef EXT_BEARER_CAPS
 					pc->para.setup.si2 = DecodeSI2(skb);
 #endif
 					break;
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 0c2ed99a3832..70bca955e0de 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -108,7 +108,7 @@ static unsigned char *bitmap_alloc_page(struct bitmap *bitmap)
 {
 	unsigned char *page;
 
-#if INJECT_FAULTS_1
+#ifdef INJECT_FAULTS_1
 	page = NULL;
 #else
 	page = kmalloc(PAGE_SIZE, GFP_NOIO);
@@ -843,7 +843,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, int in_sync)
 
 	BUG_ON(!file && !bitmap->offset);
 
-#if INJECT_FAULTS_3
+#ifdef INJECT_FAULTS_3
 	outofdate = 1;
 #else
 	outofdate = bitmap->flags & BITMAP_STALE;
@@ -1187,7 +1187,7 @@ static int bitmap_start_daemon(struct bitmap *bitmap, mdk_thread_t **ptr,
 
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 
-#if INJECT_FATAL_FAULT_2
+#ifdef INJECT_FATAL_FAULT_2
 	daemon = NULL;
 #else
 	sprintf(namebuf, "%%s_%s", name);
@@ -1552,7 +1552,7 @@ int bitmap_create(mddev_t *mddev)
 
 	bitmap->syncchunk = ~0UL;
 
-#if INJECT_FATAL_FAULT_1
+#ifdef INJECT_FATAL_FAULT_1
 	bitmap->bp = NULL;
 #else
 	bitmap->bp = kmalloc(pages * sizeof(*bitmap->bp), GFP_KERNEL);
diff --git a/drivers/mtd/devices/docecc.c b/drivers/mtd/devices/docecc.c
index 933877ff4d88..9a087c1fb0b7 100644
--- a/drivers/mtd/devices/docecc.c
+++ b/drivers/mtd/devices/docecc.c
@@ -40,6 +40,7 @@
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/doc2000.h>
 
+#define DEBUG 0
 /* need to undef it (from asm/termbits.h) */
 #undef B0
 
diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c
index 5a4a08a7c951..4c2cf7bbd252 100644
--- a/drivers/net/8139too.c
+++ b/drivers/net/8139too.c
@@ -126,14 +126,14 @@
 #define USE_IO_OPS 1
 #endif
 
-/* define to 1 to enable copious debugging info */
-#undef RTL8139_DEBUG
+/* define to 1, 2 or 3 to enable copious debugging info */
+#define RTL8139_DEBUG 0
 
 /* define to 1 to disable lightweight runtime debugging checks */
 #undef RTL8139_NDEBUG
 
 
-#ifdef RTL8139_DEBUG
+#if RTL8139_DEBUG
 /* note: prints function name for you */
 #  define DPRINTK(fmt, args...) printk(KERN_DEBUG "%s: " fmt, __FUNCTION__ , ## args)
 #else
diff --git a/drivers/net/amd8111e.c b/drivers/net/amd8111e.c
index 8618012df06a..d9ba8be72af8 100755
--- a/drivers/net/amd8111e.c
+++ b/drivers/net/amd8111e.c
@@ -1290,7 +1290,7 @@ static irqreturn_t amd8111e_interrupt(int irq, void *dev_id, struct pt_regs *reg
 	writel(intr0, mmio + INT0);
 
 	/* Check if Receive Interrupt has occurred. */
-#if CONFIG_AMD8111E_NAPI
+#ifdef CONFIG_AMD8111E_NAPI
 	if(intr0 & RINT0){
 		if(netif_rx_schedule_prep(dev)){
 			/* Disable receive interupts */
diff --git a/drivers/net/ne.c b/drivers/net/ne.c
index 6c57096aa2e1..d209a1556b2e 100644
--- a/drivers/net/ne.c
+++ b/drivers/net/ne.c
@@ -129,9 +129,9 @@ bad_clone_list[] __initdata = {
 #define NESM_START_PG	0x40	/* First page of TX buffer */
 #define NESM_STOP_PG	0x80	/* Last page +1 of RX ring */
 
-#ifdef CONFIG_PLAT_MAPPI
+#if defined(CONFIG_PLAT_MAPPI)
 #  define DCR_VAL 0x4b
-#elif CONFIG_PLAT_OAKS32R
+#elif defined(CONFIG_PLAT_OAKS32R)
 #  define DCR_VAL 0x48
 #else
 #  define DCR_VAL 0x49
diff --git a/drivers/scsi/NCR53c406a.c b/drivers/scsi/NCR53c406a.c
index b2002ba6e2aa..79ae73b23680 100644
--- a/drivers/scsi/NCR53c406a.c
+++ b/drivers/scsi/NCR53c406a.c
@@ -182,13 +182,13 @@ static int irq_probe(void);
 static void *bios_base;
 #endif
 
-#if PORT_BASE
+#ifdef PORT_BASE
 static int port_base = PORT_BASE;
 #else
 static int port_base;
 #endif
 
-#if IRQ_LEV
+#ifdef IRQ_LEV
 static int irq_level = IRQ_LEV;
 #else
 static int irq_level = -1;	/* 0 is 'no irq', so use -1 for 'uninitialized' */
diff --git a/drivers/scsi/aic7xxx/aic79xx_osm.c b/drivers/scsi/aic7xxx/aic79xx_osm.c
index 6466a184a141..329cb2331339 100644
--- a/drivers/scsi/aic7xxx/aic79xx_osm.c
+++ b/drivers/scsi/aic7xxx/aic79xx_osm.c
@@ -1505,7 +1505,7 @@ ahd_linux_dev_reset(Scsi_Cmnd *cmd)
 	memset(recovery_cmd, 0, sizeof(struct scsi_cmnd));
 	recovery_cmd->device = cmd->device;
 	recovery_cmd->scsi_done = ahd_linux_dev_reset_complete;
-#if AHD_DEBUG
+#ifdef AHD_DEBUG
 	if ((ahd_debug & AHD_SHOW_RECOVERY) != 0)
 		printf("%s:%d:%d:%d: Device reset called for cmd %p\n",
 		       ahd_name(ahd), cmd->device->channel, cmd->device->id,
diff --git a/drivers/scsi/aic7xxx/aic79xx_pci.c b/drivers/scsi/aic7xxx/aic79xx_pci.c
index 4c3bb7bb8420..703f6e44889d 100644
--- a/drivers/scsi/aic7xxx/aic79xx_pci.c
+++ b/drivers/scsi/aic7xxx/aic79xx_pci.c
@@ -582,7 +582,7 @@ ahd_check_extport(struct ahd_softc *ahd)
 		}
 	}
 
-#if AHD_DEBUG
+#ifdef AHD_DEBUG
 	if (have_seeprom != 0
 	 && (ahd_debug & AHD_DUMP_SEEPROM) != 0) {
 		uint16_t *sc_data;
diff --git a/drivers/scsi/dpt/dptsig.h b/drivers/scsi/dpt/dptsig.h
index 95a4cce6c892..4bf447792129 100644
--- a/drivers/scsi/dpt/dptsig.h
+++ b/drivers/scsi/dpt/dptsig.h
@@ -76,7 +76,7 @@ typedef unsigned long sigLONG;
 #endif  /* aix */
 #endif
 /* For the Macintosh */
-#if STRUCTALIGNMENTSUPPORTED
+#ifdef STRUCTALIGNMENTSUPPORTED
 #pragma options align=mac68k
 #endif
 
@@ -332,7 +332,7 @@ typedef struct dpt_sig {
 #endif  /* aix */
 #endif
 /* For the Macintosh */
-#if STRUCTALIGNMENTSUPPORTED
+#ifdef STRUCTALIGNMENTSUPPORTED
 #pragma options align=reset
 #endif
 
diff --git a/drivers/scsi/dtc.c b/drivers/scsi/dtc.c
index ab9de39bb50b..897743b23342 100644
--- a/drivers/scsi/dtc.c
+++ b/drivers/scsi/dtc.c
@@ -92,10 +92,6 @@
 
 #define DTC_PUBLIC_RELEASE 2
 
-/*#define DTCDEBUG 0x1*/
-#define DTCDEBUG_INIT	0x1
-#define DTCDEBUG_TRANSFER 0x2
-
 /*
  * The DTC3180 & 3280 boards are memory mapped.
  * 
diff --git a/drivers/scsi/dtc.h b/drivers/scsi/dtc.h
index ed73629eb2f9..277cd015ee4e 100644
--- a/drivers/scsi/dtc.h
+++ b/drivers/scsi/dtc.h
@@ -28,6 +28,10 @@
 #ifndef DTC3280_H
 #define DTC3280_H
 
+#define DTCDEBUG 0
+#define DTCDEBUG_INIT	0x1
+#define DTCDEBUG_TRANSFER 0x2
+
 static int dtc_abort(Scsi_Cmnd *);
 static int dtc_biosparam(struct scsi_device *, struct block_device *,
 		         sector_t, int*);
diff --git a/drivers/scsi/initio.c b/drivers/scsi/initio.c
index 2094d4811d61..ea6f3c0e05d9 100644
--- a/drivers/scsi/initio.c
+++ b/drivers/scsi/initio.c
@@ -716,7 +716,7 @@ static int init_tulip(HCS * pCurHcb, SCB * scbp, int tul_num_scb,
 	pCurHcb->HCS_SCSI_ID = i91unvramp->NVM_SCSIInfo[0].NVM_ChSCSIID;
 	pCurHcb->HCS_IdMask = ~(1 << pCurHcb->HCS_SCSI_ID);
 
-#if CHK_PARITY
+#ifdef CHK_PARITY
 	/* Enable parity error response */
 	TUL_WR(pCurHcb->HCS_Base + TUL_PCMD, TUL_RD(pCurHcb->HCS_Base, TUL_PCMD) | 0x40);
 #endif
diff --git a/drivers/scsi/lpfc/lpfc_compat.h b/drivers/scsi/lpfc/lpfc_compat.h
index 275ba34b3c9d..a11f1ae7b98e 100644
--- a/drivers/scsi/lpfc/lpfc_compat.h
+++ b/drivers/scsi/lpfc/lpfc_compat.h
@@ -30,8 +30,9 @@ memcpy_toio() and memcpy_fromio() can be used.
 However on a big-endian host, copy 4 bytes at a time,
 using writel() and readl().
  *******************************************************************/
+#include <asm/byteorder.h>
 
-#if __BIG_ENDIAN
+#ifdef __BIG_ENDIAN
 
 static inline void
 lpfc_memcpy_to_slim(void __iomem *dest, void *src, unsigned int bytes)
diff --git a/drivers/scsi/lpfc/lpfc_scsi.h b/drivers/scsi/lpfc/lpfc_scsi.h
index d8fd2010ef41..0fd9ba14e1b5 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.h
+++ b/drivers/scsi/lpfc/lpfc_scsi.h
@@ -18,6 +18,8 @@
  * included with this package.                                     *
  *******************************************************************/
 
+#include <asm/byteorder.h>
+
 struct lpfc_hba;
 
 #define list_remove_head(list, entry, type, member)		\
@@ -81,7 +83,7 @@ struct fcp_cmnd {
 	/* # of bits to shift lun id to end up in right
 	 * payload word, little endian = 8, big = 16.
 	 */
-#if __BIG_ENDIAN
+#ifdef __BIG_ENDIAN
 #define FC_LUN_SHIFT         16
 #define FC_ADDR_MODE_SHIFT   24
 #else	/*  __LITTLE_ENDIAN */
diff --git a/drivers/scsi/pas16.c b/drivers/scsi/pas16.c
index 363e0ebd4a39..72bc947e45b6 100644
--- a/drivers/scsi/pas16.c
+++ b/drivers/scsi/pas16.c
@@ -2,6 +2,7 @@
 #define PSEUDO_DMA
 #define FOO
 #define UNSAFE  /* Not unsafe for PAS16 -- use it */
+#define PDEBUG 0
 
 /*
  * This driver adapted from Drew Eckhardt's Trantor T128 driver
diff --git a/drivers/scsi/sym53c8xx_2/sym_hipd.h b/drivers/scsi/sym53c8xx_2/sym_hipd.h
index c55c7a57afa0..3131a6bf7ab7 100644
--- a/drivers/scsi/sym53c8xx_2/sym_hipd.h
+++ b/drivers/scsi/sym53c8xx_2/sym_hipd.h
@@ -151,6 +151,16 @@
  */
 #define	SYM_CONF_MIN_ASYNC (40)
 
+
+/*
+ * MEMORY ALLOCATOR.
+ */
+
+#define SYM_MEM_WARN	1	/* Warn on failed operations */
+
+#define SYM_MEM_PAGE_ORDER 0	/* 1 PAGE  maximum */
+#define SYM_MEM_CLUSTER_SHIFT	(PAGE_SHIFT+SYM_MEM_PAGE_ORDER)
+#define SYM_MEM_FREE_UNUSED	/* Free unused pages immediately */
 /*
  *  Shortest memory chunk is (1<<SYM_MEM_SHIFT), currently 16.
  *  Actual allocations happen as SYM_MEM_CLUSTER_SIZE sized.
@@ -1192,12 +1202,6 @@ static inline void sym_setup_data_pointers(struct sym_hcb *np,
  *  MEMORY ALLOCATOR.
  */
 
-#define SYM_MEM_PAGE_ORDER 0	/* 1 PAGE  maximum */
-#define SYM_MEM_CLUSTER_SHIFT	(PAGE_SHIFT+SYM_MEM_PAGE_ORDER)
-#define SYM_MEM_FREE_UNUSED	/* Free unused pages immediately */
-
-#define SYM_MEM_WARN	1	/* Warn on failed operations */
-
 #define sym_get_mem_cluster()	\
 	(void *) __get_free_pages(GFP_ATOMIC, SYM_MEM_PAGE_ORDER)
 #define sym_free_mem_cluster(p)	\
diff --git a/drivers/scsi/sym53c8xx_2/sym_nvram.c b/drivers/scsi/sym53c8xx_2/sym_nvram.c
index cd9140e158cf..994b7566bcac 100644
--- a/drivers/scsi/sym53c8xx_2/sym_nvram.c
+++ b/drivers/scsi/sym53c8xx_2/sym_nvram.c
@@ -367,7 +367,7 @@ static void S24C16_read_byte(struct sym_device *np, u_char *read_data, u_char ac
 	S24C16_write_ack(np, ack_data, gpreg, gpcntl);
 }
 
-#if SYM_CONF_NVRAM_WRITE_SUPPORT
+#ifdef SYM_CONF_NVRAM_WRITE_SUPPORT
 /*
  *  Write 'len' bytes starting at 'offset'.
  */
diff --git a/drivers/scsi/t128.h b/drivers/scsi/t128.h
index 9ad1d68827a7..596f3a32a1c6 100644
--- a/drivers/scsi/t128.h
+++ b/drivers/scsi/t128.h
@@ -43,6 +43,7 @@
 
 #define T128_PUBLIC_RELEASE 3
 
+#define TDEBUG		0
 #define TDEBUG_INIT	0x1
 #define TDEBUG_TRANSFER 0x2
 
diff --git a/drivers/video/riva/fbdev.c b/drivers/video/riva/fbdev.c
index 6a9e183be41b..ae297e222681 100644
--- a/drivers/video/riva/fbdev.c
+++ b/drivers/video/riva/fbdev.c
@@ -1826,7 +1826,7 @@ static void __devinit riva_get_EDID(struct fb_info *info, struct pci_dev *pdev)
 #ifdef CONFIG_PPC_OF
 	if (!riva_get_EDID_OF(info, pdev))
 		printk(PFX "could not retrieve EDID from OF\n");
-#elif CONFIG_FB_RIVA_I2C
+#elif defined(CONFIG_FB_RIVA_I2C)
 	if (!riva_get_EDID_i2c(info))
 		printk(PFX "could not retrieve EDID from DDC/I2C\n");
 #endif
diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h
index df749cc0aac8..c8064cae8f17 100644
--- a/fs/ntfs/sysctl.h
+++ b/fs/ntfs/sysctl.h
@@ -26,7 +26,7 @@
 
 #include <linux/config.h>
 
-#if (DEBUG && CONFIG_SYSCTL)
+#if defined(DEBUG) && defined(CONFIG_SYSCTL)
 
 extern int ntfs_sysctl(int add);
 
diff --git a/include/linux/ftape.h b/include/linux/ftape.h
index c6b38d5b9186..72faeec9f6e1 100644
--- a/include/linux/ftape.h
+++ b/include/linux/ftape.h
@@ -165,7 +165,7 @@ typedef union {
 #  undef  CONFIG_FT_FDC_DMA
 #  define CONFIG_FT_FDC_DMA 2
 # endif
-#elif CONFIG_FT_ALT_FDC == 1  /* CONFIG_FT_MACH2 */
+#elif defined(CONFIG_FT_ALT_FDC)  /* CONFIG_FT_MACH2 */
 # if CONFIG_FT_FDC_BASE == 0
 #  undef  CONFIG_FT_FDC_BASE
 #  define CONFIG_FT_FDC_BASE 0x370
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 1f2c2f9e353f..ae652ca14bc9 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -792,13 +792,8 @@ int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
 	if (ipv6_addr_any(&fl->fl6_src)) {
 		err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
 
-		if (err) {
-#if IP6_DEBUG >= 2
-			printk(KERN_DEBUG "ip6_dst_lookup: "
-			       "no available source address\n");
-#endif
+		if (err)
 			goto out_err_release;
-		}
 	}
 
 	return 0;
diff --git a/sound/isa/sb/sb_mixer.c b/sound/isa/sb/sb_mixer.c
index cc5a2c6dec16..ff4b59968027 100644
--- a/sound/isa/sb/sb_mixer.c
+++ b/sound/isa/sb/sb_mixer.c
@@ -688,7 +688,7 @@ static struct sbmix_elem snd_als4000_ctl_3d_poweroff_switch =
 	SB_SINGLE("3D PowerOff Switch", SB_ALS4000_3D_TIME_DELAY, 4, 0x01);
 static struct sbmix_elem snd_als4000_ctl_3d_delay =
 	SB_SINGLE("3D Delay", SB_ALS4000_3D_TIME_DELAY, 0, 0x0f);
-#if NOT_AVAILABLE
+#ifdef NOT_AVAILABLE
 static struct sbmix_elem snd_als4000_ctl_fmdac =
 	SB_SINGLE("FMDAC Switch (Option ?)", SB_ALS4000_FMDAC, 0, 0x01);
 static struct sbmix_elem snd_als4000_ctl_qsound =
@@ -723,7 +723,7 @@ static struct sbmix_elem *snd_als4000_controls[] = {
 	&snd_als4000_ctl_3d_output_ratio,
 	&snd_als4000_ctl_3d_delay,
 	&snd_als4000_ctl_3d_poweroff_switch,
-#if NOT_AVAILABLE
+#ifdef NOT_AVAILABLE
 	&snd_als4000_ctl_fmdac,
 	&snd_als4000_ctl_qsound,
 #endif
diff --git a/sound/oss/pss.c b/sound/oss/pss.c
index 3ed38765dcc4..a617ccb40e00 100644
--- a/sound/oss/pss.c
+++ b/sound/oss/pss.c
@@ -714,7 +714,7 @@ static int __init attach_pss(struct address_info *hw_config)
 	 
 	disable_all_emulations();
 
-#if YOU_REALLY_WANT_TO_ALLOCATE_THESE_RESOURCES
+#ifdef YOU_REALLY_WANT_TO_ALLOCATE_THESE_RESOURCES
 	if (sound_alloc_dma(hw_config->dma, "PSS"))
 	{
 		printk("pss.c: Can't allocate DMA channel.\n");
diff --git a/sound/pci/rme9652/rme9652.c b/sound/pci/rme9652/rme9652.c
index f3037402d58f..1bc9d0df8516 100644
--- a/sound/pci/rme9652/rme9652.c
+++ b/sound/pci/rme9652/rme9652.c
@@ -1470,7 +1470,7 @@ static int snd_rme9652_get_tc_valid(snd_kcontrol_t * kcontrol, snd_ctl_elem_valu
 	return 0;
 }
 
-#if ALSA_HAS_STANDARD_WAY_OF_RETURNING_TIMECODE
+#ifdef ALSA_HAS_STANDARD_WAY_OF_RETURNING_TIMECODE
 
 /* FIXME: this routine needs a port to the new control API --jk */
 
-- 
cgit v1.2.3-59-g8ed1b


From e5c2d749172657ed51e20e4b5ab540447666cc50 Mon Sep 17 00:00:00 2001
From: Ralf Baechle <ralf@linux-mips.org>
Date: Wed, 27 Jul 2005 11:45:19 -0700
Subject: [PATCH] serial_core whitespace fix

Use tabs for formatting like anywhere else in this file.

Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/serial_core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 30b64f3534f4..f6fca8f2f3ca 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -104,7 +104,7 @@
 #define PORT_MPSC	63
 
 /* TXX9 type number */
-#define PORT_TXX9       64
+#define PORT_TXX9	64
 
 /* NEC VR4100 series SIU/DSIU */
 #define PORT_VR41XX_SIU		65
-- 
cgit v1.2.3-59-g8ed1b


From 79a8810221ee9ea96c4e5a5817afb88f22ea698c Mon Sep 17 00:00:00 2001
From: Richard Henderson <rth@twiddle.net>
Date: Thu, 28 Jul 2005 01:07:41 -0700
Subject: [PATCH] alpha: fix "statement with no effect" warnings

Apparently gcc 4.0 complains about "({ 0; });", which leads to -Werror
breakage in one of the alpha oprofile modules.

One might could argue that this is a gcc bug, in that statement-expressions
should be considered to be function-like rather than statement-like for the
purposes of this warning.  But it's just as easy to use an inline function
in the first place, side-stepping the issue.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/smp.h |  9 +++++++--
 include/linux/smp.h     | 20 ++++++++++++++++----
 2 files changed, 23 insertions(+), 6 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-alpha/smp.h b/include/asm-alpha/smp.h
index 9950706abdf8..a3d09d14fee2 100644
--- a/include/asm-alpha/smp.h
+++ b/include/asm-alpha/smp.h
@@ -50,11 +50,16 @@ extern cpumask_t cpu_online_map;
 extern int smp_num_cpus;
 #define cpu_possible_map	cpu_present_mask
 
-int smp_call_function_on_cpu(void (*func) (void *info), void *info,int retry, int wait, cpumask_t cpu);
+int smp_call_function_on_cpu(void (*) (void *), void *, int, int, cpumask_t);
 
 #else /* CONFIG_SMP */
 
-#define smp_call_function_on_cpu(func,info,retry,wait,cpu)    ({ 0; })
+static inline int
+smp_call_function_on_cpu (void (*func) (void *), void *info, int retry,
+			  int wait, cpumask_t cpu)
+{
+	return 0;
+}
 
 #endif /* CONFIG_SMP */
 
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 9dfa3ee769ae..22b451d1b93f 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -94,11 +94,23 @@ void smp_prepare_boot_cpu(void);
  */
 #define raw_smp_processor_id()			0
 #define hard_smp_processor_id()			0
-#define smp_call_function(func,info,retry,wait)	({ 0; })
-#define on_each_cpu(func,info,retry,wait)	({ func(info); 0; })
-static inline void smp_send_reschedule(int cpu) { }
 #define num_booting_cpus()			1
-#define smp_prepare_boot_cpu()			do {} while (0)
+
+static inline int smp_call_function(void (*func) (void *), void *info,
+				    int retry, int wait)
+{
+	return 0;
+}
+
+static inline int on_each_cpu(void (*func) (void *), void *info,
+			      int retry, int wait)
+{
+	func(info);
+	return 0;
+}
+
+static inline void smp_send_reschedule(int cpu) { }
+static inline void smp_prepare_boot_cpu(void) { }
 
 #endif /* !SMP */
 
-- 
cgit v1.2.3-59-g8ed1b


From 2ac6608c41f8c45371ea9dddae7f99bc2c15d5cf Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Thu, 28 Jul 2005 10:34:47 -0700
Subject: Revert broken "statement with no effect" warning fix

It may shut up gcc, but it also incorrectly changes the semantics of the
smp_call_function() helpers.

You can fix the warning other ways if you are interested (create another
inline function that takes no arguments and returns zero), but
preferably gcc just shouldn't complain about unused return values from
statement expressions in the first place.
---
 include/asm-alpha/smp.h |  9 ++-------
 include/linux/smp.h     | 20 ++++----------------
 2 files changed, 6 insertions(+), 23 deletions(-)

(limited to 'include/linux')

diff --git a/include/asm-alpha/smp.h b/include/asm-alpha/smp.h
index a3d09d14fee2..9950706abdf8 100644
--- a/include/asm-alpha/smp.h
+++ b/include/asm-alpha/smp.h
@@ -50,16 +50,11 @@ extern cpumask_t cpu_online_map;
 extern int smp_num_cpus;
 #define cpu_possible_map	cpu_present_mask
 
-int smp_call_function_on_cpu(void (*) (void *), void *, int, int, cpumask_t);
+int smp_call_function_on_cpu(void (*func) (void *info), void *info,int retry, int wait, cpumask_t cpu);
 
 #else /* CONFIG_SMP */
 
-static inline int
-smp_call_function_on_cpu (void (*func) (void *), void *info, int retry,
-			  int wait, cpumask_t cpu)
-{
-	return 0;
-}
+#define smp_call_function_on_cpu(func,info,retry,wait,cpu)    ({ 0; })
 
 #endif /* CONFIG_SMP */
 
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 22b451d1b93f..9dfa3ee769ae 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -94,23 +94,11 @@ void smp_prepare_boot_cpu(void);
  */
 #define raw_smp_processor_id()			0
 #define hard_smp_processor_id()			0
-#define num_booting_cpus()			1
-
-static inline int smp_call_function(void (*func) (void *), void *info,
-				    int retry, int wait)
-{
-	return 0;
-}
-
-static inline int on_each_cpu(void (*func) (void *), void *info,
-			      int retry, int wait)
-{
-	func(info);
-	return 0;
-}
-
+#define smp_call_function(func,info,retry,wait)	({ 0; })
+#define on_each_cpu(func,info,retry,wait)	({ func(info); 0; })
 static inline void smp_send_reschedule(int cpu) { }
-static inline void smp_prepare_boot_cpu(void) { }
+#define num_booting_cpus()			1
+#define smp_prepare_boot_cpu()			do {} while (0)
 
 #endif /* !SMP */
 
-- 
cgit v1.2.3-59-g8ed1b


From a46e812620bd7db457ce002544a1a6572c313d8a Mon Sep 17 00:00:00 2001
From: Kumar Gala <galak@freescale.com>
Date: Fri, 29 Jul 2005 12:16:27 -0700
Subject: [PATCH] PCI: fix up errors after dma bursting patch and CONFIG_PCI=n
 -- bug?

In the patch from:

http://www.uwsg.iu.edu/hypermail/linux/kernel/0506.3/0985.html

Is the following line supposed to be inside the if CONFIG_PCI=n?

  #define pci_dma_burst_advice(pdev, strat, strategy_parameter) do { } while (0)

Signed-off-by: Kumar Gala <kumar.gala@freescale.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pci.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pci.h b/include/linux/pci.h
index 7ac14961ba22..8621cf42b46f 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -971,6 +971,8 @@ static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int en
 
 #define	isa_bridge	((struct pci_dev *)NULL)
 
+#define pci_dma_burst_advice(pdev, strat, strategy_parameter) do { } while (0)
+
 #else
 
 /*
@@ -985,9 +987,6 @@ static inline int pci_proc_domain(struct pci_bus *bus)
 	return 0;
 }
 #endif
-
-#define pci_dma_burst_advice(pdev, strat, strategy_parameter) do { } while (0)
-
 #endif /* !CONFIG_PCI */
 
 /* these helpers provide future and backwards compatibility
-- 
cgit v1.2.3-59-g8ed1b


From 87bec66b9691522414862dd8d41e430b063735ef Mon Sep 17 00:00:00 2001
From: David Shaohua Li <shaohua.li@intel.com>
Date: Wed, 27 Jul 2005 23:02:00 -0400
Subject: [ACPI] suspend/resume ACPI PCI Interrupt Links

Add reference count and disable ACPI PCI Interrupt Link
when no device still uses it.

Warn when drivers have not released Link at suspend time.

http://bugzilla.kernel.org/show_bug.cgi?id=3469

Signed-off-by: David Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/i386/pci/acpi.c        |   1 +
 arch/i386/pci/common.c      |   6 +++
 arch/i386/pci/irq.c         |   1 +
 arch/i386/pci/pci.h         |   1 +
 drivers/acpi/pci_irq.c      |  85 +++++++++++++++++++++++++-----------
 drivers/acpi/pci_link.c     | 103 +++++++++++++++++++++++++++++++++++++-------
 include/acpi/acpi_drivers.h |   3 +-
 include/linux/acpi.h        |   4 --
 8 files changed, 157 insertions(+), 47 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/pci/acpi.c b/arch/i386/pci/acpi.c
index 2db65ec45dc3..42913f43feb0 100644
--- a/arch/i386/pci/acpi.c
+++ b/arch/i386/pci/acpi.c
@@ -30,6 +30,7 @@ static int __init pci_acpi_init(void)
 	acpi_irq_penalty_init();
 	pcibios_scanned++;
 	pcibios_enable_irq = acpi_pci_irq_enable;
+	pcibios_disable_irq = acpi_pci_irq_disable;
 
 	if (pci_routeirq) {
 		/*
diff --git a/arch/i386/pci/common.c b/arch/i386/pci/common.c
index 720975e1af50..751e49bda180 100644
--- a/arch/i386/pci/common.c
+++ b/arch/i386/pci/common.c
@@ -249,3 +249,9 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
 
 	return pcibios_enable_irq(dev);
 }
+
+void pcibios_disable_device (struct pci_dev *dev)
+{
+	if (pcibios_disable_irq)
+		pcibios_disable_irq(dev);
+}
diff --git a/arch/i386/pci/irq.c b/arch/i386/pci/irq.c
index d21b3a2dc978..66e4149ef189 100644
--- a/arch/i386/pci/irq.c
+++ b/arch/i386/pci/irq.c
@@ -56,6 +56,7 @@ struct irq_router_handler {
 };
 
 int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL;
+void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
 
 /*
  *  Search 0xf0000 -- 0xfffff for the PCI IRQ Routing Table.
diff --git a/arch/i386/pci/pci.h b/arch/i386/pci/pci.h
index a8fc80ca69f3..dc442dfcab9e 100644
--- a/arch/i386/pci/pci.h
+++ b/arch/i386/pci/pci.h
@@ -72,3 +72,4 @@ extern int pcibios_scanned;
 extern spinlock_t pci_config_lock;
 
 extern int (*pcibios_enable_irq)(struct pci_dev *dev);
+extern void (*pcibios_disable_irq)(struct pci_dev *dev);
diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c
index 8093f2e00321..c536ccfc5413 100644
--- a/drivers/acpi/pci_irq.c
+++ b/drivers/acpi/pci_irq.c
@@ -269,7 +269,51 @@ acpi_pci_irq_del_prt (int segment, int bus)
 /* --------------------------------------------------------------------------
                           PCI Interrupt Routing Support
    -------------------------------------------------------------------------- */
+typedef int (*irq_lookup_func)(struct acpi_prt_entry *, int *, int *, char **);
 
+static int
+acpi_pci_allocate_irq(struct acpi_prt_entry *entry,
+	int	*edge_level,
+	int	*active_high_low,
+	char	**link)
+{
+	int	irq;
+
+	ACPI_FUNCTION_TRACE("acpi_pci_allocate_irq");
+
+	if (entry->link.handle) {
+		irq = acpi_pci_link_allocate_irq(entry->link.handle,
+			entry->link.index, edge_level, active_high_low, link);
+		if (irq < 0) {
+			ACPI_DEBUG_PRINT((ACPI_DB_WARN, "Invalid IRQ link routing entry\n"));
+			return_VALUE(-1);
+		}
+	} else {
+		irq = entry->link.index;
+		*edge_level = ACPI_LEVEL_SENSITIVE;
+		*active_high_low = ACPI_ACTIVE_LOW;
+	}
+
+	ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found IRQ %d\n", irq));
+	return_VALUE(irq);
+}
+
+static int
+acpi_pci_free_irq(struct acpi_prt_entry *entry,
+	int	*edge_level,
+	int	*active_high_low,
+	char	**link)
+{
+	int	irq;
+
+	ACPI_FUNCTION_TRACE("acpi_pci_free_irq");
+	if (entry->link.handle) {
+		irq = acpi_pci_link_free_irq(entry->link.handle);
+	} else {
+		irq = entry->link.index;
+	}
+	return_VALUE(irq);
+}
 /*
  * acpi_pci_irq_lookup
  * success: return IRQ >= 0
@@ -282,12 +326,13 @@ acpi_pci_irq_lookup (
 	int			pin,
 	int			*edge_level,
 	int			*active_high_low,
-	char			**link)
+	char			**link,
+	irq_lookup_func		func)
 {
 	struct acpi_prt_entry	*entry = NULL;
 	int segment = pci_domain_nr(bus);
 	int bus_nr = bus->number;
-	int irq;
+	int ret;
 
 	ACPI_FUNCTION_TRACE("acpi_pci_irq_lookup");
 
@@ -301,22 +346,8 @@ acpi_pci_irq_lookup (
 		return_VALUE(-1);
 	}
 	
-	if (entry->link.handle) {
-		irq = acpi_pci_link_get_irq(entry->link.handle,
-			entry->link.index, edge_level, active_high_low, link);
-		if (irq < 0) {
-			ACPI_DEBUG_PRINT((ACPI_DB_WARN, "Invalid IRQ link routing entry\n"));
-			return_VALUE(-1);
-		}
-	} else {
-		irq = entry->link.index;
-		*edge_level = ACPI_LEVEL_SENSITIVE;
-		*active_high_low = ACPI_ACTIVE_LOW;
-	}
-
-	ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found IRQ %d\n", irq));
-
-	return_VALUE(irq);
+	ret = func(entry, edge_level, active_high_low, link);
+	return_VALUE(ret);
 }
 
 /*
@@ -330,7 +361,8 @@ acpi_pci_irq_derive (
 	int			pin,
 	int			*edge_level,
 	int			*active_high_low,
-	char			**link)
+	char			**link,
+	irq_lookup_func		func)
 {
 	struct pci_dev		*bridge = dev;
 	int			irq = -1;
@@ -363,7 +395,7 @@ acpi_pci_irq_derive (
 		}
 
 		irq = acpi_pci_irq_lookup(bridge->bus, PCI_SLOT(bridge->devfn),
-			pin, edge_level, active_high_low, link);
+			pin, edge_level, active_high_low, link, func);
 	}
 
 	if (irq < 0) {
@@ -415,7 +447,7 @@ acpi_pci_irq_enable (
 	 * values override any BIOS-assigned IRQs set during boot.
 	 */
  	irq = acpi_pci_irq_lookup(dev->bus, PCI_SLOT(dev->devfn), pin,
-		&edge_level, &active_high_low, &link);
+		&edge_level, &active_high_low, &link, acpi_pci_allocate_irq);
 
 	/*
 	 * If no PRT entry was found, we'll try to derive an IRQ from the
@@ -423,7 +455,7 @@ acpi_pci_irq_enable (
 	 */
 	if (irq < 0)
  		irq = acpi_pci_irq_derive(dev, pin, &edge_level,
-			&active_high_low, &link);
+			&active_high_low, &link, acpi_pci_allocate_irq);
  
 	/*
 	 * No IRQ known to the ACPI subsystem - maybe the BIOS / 
@@ -461,7 +493,9 @@ acpi_pci_irq_enable (
 EXPORT_SYMBOL(acpi_pci_irq_enable);
 
 
-#ifdef CONFIG_ACPI_DEALLOCATE_IRQ
+/* FIXME: implement x86/x86_64 version */
+void __attribute__((weak)) acpi_unregister_gsi(u32 i) {}
+
 void
 acpi_pci_irq_disable (
 	struct pci_dev		*dev)
@@ -488,14 +522,14 @@ acpi_pci_irq_disable (
 	 * First we check the PCI IRQ routing table (PRT) for an IRQ.
 	 */
  	gsi = acpi_pci_irq_lookup(dev->bus, PCI_SLOT(dev->devfn), pin,
-				  &edge_level, &active_high_low, NULL);
+			&edge_level, &active_high_low, NULL, acpi_pci_free_irq);
 	/*
 	 * If no PRT entry was found, we'll try to derive an IRQ from the
 	 * device's parent bridge.
 	 */
 	if (gsi < 0)
  		gsi = acpi_pci_irq_derive(dev, pin,
-					  &edge_level, &active_high_low, NULL);
+			&edge_level, &active_high_low, NULL, acpi_pci_free_irq);
 	if (gsi < 0)
 		return_VOID;
 
@@ -511,4 +545,3 @@ acpi_pci_irq_disable (
 
 	return_VOID;
 }
-#endif /* CONFIG_ACPI_DEALLOCATE_IRQ */
diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c
index 6ad0e77df9b3..6a29610edc11 100644
--- a/drivers/acpi/pci_link.c
+++ b/drivers/acpi/pci_link.c
@@ -68,6 +68,10 @@ static struct acpi_driver acpi_pci_link_driver = {
 			},
 };
 
+/*
+ * If a link is initialized, we never change its active and initialized
+ * later even the link is disable. Instead, we just repick the active irq
+ */
 struct acpi_pci_link_irq {
 	u8			active;			/* Current IRQ */
 	u8			edge_level;		/* All IRQs */
@@ -76,8 +80,7 @@ struct acpi_pci_link_irq {
 	u8			possible_count;
 	u8			possible[ACPI_PCI_LINK_MAX_POSSIBLE];
 	u8			initialized:1;
-	u8			suspend_resume:1;
-	u8			reserved:6;
+	u8			reserved:7;
 };
 
 struct acpi_pci_link {
@@ -85,12 +88,14 @@ struct acpi_pci_link {
 	struct acpi_device	*device;
 	acpi_handle		handle;
 	struct acpi_pci_link_irq irq;
+	int			refcnt;
 };
 
 static struct {
 	int			count;
 	struct list_head	entries;
 }				acpi_link;
+DECLARE_MUTEX(acpi_link_lock);
 
 
 /* --------------------------------------------------------------------------
@@ -532,12 +537,12 @@ static int acpi_pci_link_allocate(
 
 	ACPI_FUNCTION_TRACE("acpi_pci_link_allocate");
 
-	if (link->irq.suspend_resume) {
-		acpi_pci_link_set(link, link->irq.active);
-		link->irq.suspend_resume = 0;
-	}
-	if (link->irq.initialized)
+	if (link->irq.initialized) {
+		if (link->refcnt == 0)
+			/* This means the link is disabled but initialized */
+			acpi_pci_link_set(link, link->irq.active);
 		return_VALUE(0);
+	}
 
 	/*
 	 * search for active IRQ in list of possible IRQs.
@@ -596,13 +601,13 @@ static int acpi_pci_link_allocate(
 }
 
 /*
- * acpi_pci_link_get_irq
+ * acpi_pci_link_allocate_irq
  * success: return IRQ >= 0
  * failure: return -1
  */
 
 int
-acpi_pci_link_get_irq (
+acpi_pci_link_allocate_irq (
 	acpi_handle		handle,
 	int			index,
 	int			*edge_level,
@@ -613,7 +618,7 @@ acpi_pci_link_get_irq (
 	struct acpi_device	*device = NULL;
 	struct acpi_pci_link	*link = NULL;
 
-	ACPI_FUNCTION_TRACE("acpi_pci_link_get_irq");
+	ACPI_FUNCTION_TRACE("acpi_pci_link_allocate_irq");
 
 	result = acpi_bus_get_device(handle, &device);
 	if (result) {
@@ -633,21 +638,70 @@ acpi_pci_link_get_irq (
 		return_VALUE(-1);
 	}
 
-	if (acpi_pci_link_allocate(link))
+	down(&acpi_link_lock);
+	if (acpi_pci_link_allocate(link)) {
+		up(&acpi_link_lock);
 		return_VALUE(-1);
+	}
 	   
 	if (!link->irq.active) {
+		up(&acpi_link_lock);
 		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Link active IRQ is 0!\n"));
 		return_VALUE(-1);
 	}
+	link->refcnt ++;
+	up(&acpi_link_lock);
 
 	if (edge_level) *edge_level = link->irq.edge_level;
 	if (active_high_low) *active_high_low = link->irq.active_high_low;
 	if (name) *name = acpi_device_bid(link->device);
+	ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+		"Link %s is referenced\n", acpi_device_bid(link->device)));
 	return_VALUE(link->irq.active);
 }
 
+/*
+ * We don't change link's irq information here.  After it is reenabled, we
+ * continue use the info
+ */
+int
+acpi_pci_link_free_irq(acpi_handle handle)
+{
+	struct acpi_device	*device = NULL;
+	struct acpi_pci_link	*link = NULL;
+	acpi_status		result;
+
+	ACPI_FUNCTION_TRACE("acpi_pci_link_free_irq");
+
+	result = acpi_bus_get_device(handle, &device);
+	if (result) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Invalid link device\n"));
+		return_VALUE(-1);
+	}
 
+	link = (struct acpi_pci_link *) acpi_driver_data(device);
+	if (!link) {
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Invalid link context\n"));
+		return_VALUE(-1);
+	}
+
+	down(&acpi_link_lock);
+	if (!link->irq.initialized) {
+		up(&acpi_link_lock);
+		ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Link isn't initialized\n"));
+		return_VALUE(-1);
+	}
+
+	link->refcnt --;
+	ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+		"Link %s is dereferenced\n", acpi_device_bid(link->device)));
+
+	if (link->refcnt == 0) {
+		acpi_ut_evaluate_object(link->handle, "_DIS", 0, NULL);
+	}
+	up(&acpi_link_lock);
+	return_VALUE(link->irq.active);
+}
 /* --------------------------------------------------------------------------
                                  Driver Interface
    -------------------------------------------------------------------------- */
@@ -677,6 +731,7 @@ acpi_pci_link_add (
 	strcpy(acpi_device_class(device), ACPI_PCI_LINK_CLASS);
 	acpi_driver_data(device) = link;
 
+	down(&acpi_link_lock);
 	result = acpi_pci_link_get_possible(link);
 	if (result)
 		goto end;
@@ -712,6 +767,7 @@ acpi_pci_link_add (
 end:
 	/* disable all links -- to be activated on use */
 	acpi_ut_evaluate_object(link->handle, "_DIS", 0, NULL);
+	up(&acpi_link_lock);
 
 	if (result)
 		kfree(link);
@@ -726,19 +782,32 @@ irqrouter_suspend(
 {
 	struct list_head        *node = NULL;
 	struct acpi_pci_link    *link = NULL;
+	int			ret = 0;
 
 	ACPI_FUNCTION_TRACE("irqrouter_suspend");
 
 	list_for_each(node, &acpi_link.entries) {
 		link = list_entry(node, struct acpi_pci_link, node);
 		if (!link) {
-			ACPI_DEBUG_PRINT((ACPI_DB_ERROR, "Invalid link context\n"));
+			ACPI_DEBUG_PRINT((ACPI_DB_ERROR,
+				"Invalid link context\n"));
 			continue;
 		}
-		if (link->irq.active && link->irq.initialized)
-			link->irq.suspend_resume = 1;
+		if (link->irq.initialized && link->refcnt != 0
+			/* We ignore legacy IDE device irq */
+			&& link->irq.active != 14 && link->irq.active !=15) {
+			printk(KERN_WARNING PREFIX
+				"%d drivers with interrupt %d neglected to call"
+				" pci_disable_device at .suspend\n",
+				link->refcnt,
+				link->irq.active);
+			printk(KERN_WARNING PREFIX
+				"Fix the driver, or rmmod before suspend\n");
+			link->refcnt = 0;
+			ret = -EINVAL;
+		}
 	}
-	return_VALUE(0);
+	return_VALUE(ret);
 }
 
 
@@ -756,8 +825,9 @@ acpi_pci_link_remove (
 
 	link = (struct acpi_pci_link *) acpi_driver_data(device);
 
-	/* TBD: Acquire/release lock */
+	down(&acpi_link_lock);
 	list_del(&link->node);
+	up(&acpi_link_lock);
 
 	kfree(link);
 
@@ -849,6 +919,7 @@ int __init acpi_irq_balance_set(char *str)
 __setup("acpi_irq_balance", acpi_irq_balance_set);
 
 
+/* FIXME: we will remove this interface after all drivers call pci_disable_device */
 static struct sysdev_class irqrouter_sysdev_class = {
         set_kset_name("irqrouter"),
         .suspend = irqrouter_suspend,
diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h
index e00d9289201b..13f092977c0c 100644
--- a/include/acpi/acpi_drivers.h
+++ b/include/acpi/acpi_drivers.h
@@ -56,8 +56,9 @@
 /* ACPI PCI Interrupt Link (pci_link.c) */
 
 int acpi_irq_penalty_init (void);
-int acpi_pci_link_get_irq (acpi_handle handle, int index, int *edge_level,
+int acpi_pci_link_allocate_irq (acpi_handle handle, int index, int *edge_level,
 	int *active_high_low, char **name);
+int acpi_pci_link_free_irq(acpi_handle handle);
 
 /* ACPI PCI Interrupt Routing (pci_irq.c) */
 
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 9c14959bcfa0..ca0cd240cee0 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -440,9 +440,7 @@ int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
  * If this matches the last registration, any IRQ resources for gsi
  * are freed.
  */
-#ifdef CONFIG_ACPI_DEALLOCATE_IRQ
 void acpi_unregister_gsi (u32 gsi);
-#endif
 
 #ifdef CONFIG_ACPI_PCI
 
@@ -467,9 +465,7 @@ struct pci_dev;
 int acpi_pci_irq_enable (struct pci_dev *dev);
 void acpi_penalize_isa_irq(int irq, int active);
 
-#ifdef CONFIG_ACPI_DEALLOCATE_IRQ
 void acpi_pci_irq_disable (struct pci_dev *dev);
-#endif
 
 struct acpi_pci_driver {
 	struct acpi_pci_driver *next;
-- 
cgit v1.2.3-59-g8ed1b


From 7544953685859875b5ac0260b6b1856066c092d6 Mon Sep 17 00:00:00 2001
From: John McCutchan <ttb@tentacle.dhs.org>
Date: Mon, 1 Aug 2005 11:00:45 -0400
Subject: [PATCH] inotify: fix file deletion by rename detection

When a file is moved over an existing file that you are watching,
inotify won't send you a DELETE_SELF event and it won't unref the inode
until the inotify instance is closed by the application.

Signed-off-by: John McCutchan <ttb@tentacle.dhs.org>
Signed-off-by: Robert Love <rml@novell.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namei.c               | 2 +-
 include/linux/fsnotify.h | 7 ++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 02a824cd3c5c..4a27eb798118 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2218,7 +2218,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
 	if (!error) {
 		const char *new_name = old_dentry->d_name.name;
-		fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir);
+		fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir, new_dentry->d_inode);
 	}
 	fsnotify_oldname_free(old_name);
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index d07a92c94776..e96a4306ab3b 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -21,7 +21,7 @@
  */
 static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 				 const char *old_name, const char *new_name,
-				 int isdir)
+				 int isdir, struct inode *target)
 {
 	u32 cookie = inotify_get_cookie();
 
@@ -36,6 +36,11 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 		isdir = IN_ISDIR;
 	inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir,cookie,old_name);
 	inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, cookie, new_name);
+
+	if (target) {
+		inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL);
+		inotify_inode_is_dead(target);
+	}
 }
 
 /*
-- 
cgit v1.2.3-59-g8ed1b


From 0072b1389c25355ccc01048114adb9652c13fd9f Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 1 Aug 2005 21:11:39 -0700
Subject: [PATCH] include/linux/dcookies.h: dummy functions must be "static
 inline"

We don't want these to be global functions.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/dcookies.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/dcookies.h b/include/linux/dcookies.h
index c28050136164..1d68428c925d 100644
--- a/include/linux/dcookies.h
+++ b/include/linux/dcookies.h
@@ -48,12 +48,12 @@ int get_dcookie(struct dentry * dentry, struct vfsmount * vfsmnt,
 
 #else
 
-struct dcookie_user * dcookie_register(void)
+static inline struct dcookie_user * dcookie_register(void)
 {
 	return NULL;
 }
 
-void dcookie_unregister(struct dcookie_user * user)
+static inline void dcookie_unregister(struct dcookie_user * user)
 {
 	return;
 }
-- 
cgit v1.2.3-59-g8ed1b


From f33ea7f404e592e4563b12101b7a4d17da6558d7 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Wed, 3 Aug 2005 20:24:01 +1000
Subject: [PATCH] fix get_user_pages bug

Checking pte_dirty instead of pte_write in __follow_page is problematic
for s390, and for copy_one_pte which leaves dirty when clearing write.

So revert __follow_page to check pte_write as before, and make
do_wp_page pass back a special extra VM_FAULT_WRITE bit to say it has
done its full job: once get_user_pages receives this value, it no longer
requires pte_write in __follow_page.

But most callers of handle_mm_fault, in the various architectures, have
switch statements which do not expect this new case.  To avoid changing
them all in a hurry, make an inline wrapper function (using the old
name) that masks off the new bit, and use the extended interface with
double underscores.

Yes, we do have a call to do_wp_page from do_swap_page, but no need to
change that: in the rare case it's needed, another do_wp_page will follow.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
[ Cleanups by Nick Piggin ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 22 +++++++++++++++++-----
 mm/memory.c        | 31 +++++++++++++++++++++++--------
 2 files changed, 40 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6eb7f48317f8..82d7024f0765 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -625,10 +625,16 @@ static inline int page_mapped(struct page *page)
  * Used to decide whether a process gets delivered SIGBUS or
  * just gets major/minor fault counters bumped up.
  */
-#define VM_FAULT_OOM	(-1)
-#define VM_FAULT_SIGBUS	0
-#define VM_FAULT_MINOR	1
-#define VM_FAULT_MAJOR	2
+#define VM_FAULT_OOM	0x00
+#define VM_FAULT_SIGBUS	0x01
+#define VM_FAULT_MINOR	0x02
+#define VM_FAULT_MAJOR	0x03
+
+/* 
+ * Special case for get_user_pages.
+ * Must be in a distinct bit from the above VM_FAULT_ flags.
+ */
+#define VM_FAULT_WRITE	0x10
 
 #define offset_in_page(p)	((unsigned long)(p) & ~PAGE_MASK)
 
@@ -704,7 +710,13 @@ extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsign
 extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
 extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot);
 extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot);
-extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
+extern int __handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
+
+static inline int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, int write_access)
+{
+	return __handle_mm_fault(mm, vma, address, write_access) & (~VM_FAULT_WRITE);
+}
+
 extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
 void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);
diff --git a/mm/memory.c b/mm/memory.c
index 2405289dfdf8..81d7117aa58b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -811,15 +811,18 @@ static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
 	pte = *ptep;
 	pte_unmap(ptep);
 	if (pte_present(pte)) {
-		if (write && !pte_dirty(pte))
+		if (write && !pte_write(pte))
 			goto out;
 		if (read && !pte_read(pte))
 			goto out;
 		pfn = pte_pfn(pte);
 		if (pfn_valid(pfn)) {
 			page = pfn_to_page(pfn);
-			if (accessed)
+			if (accessed) {
+				if (write && !pte_dirty(pte) &&!PageDirty(page))
+					set_page_dirty(page);
 				mark_page_accessed(page);
+			}
 			return page;
 		}
 	}
@@ -941,10 +944,11 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		}
 		spin_lock(&mm->page_table_lock);
 		do {
+			int write_access = write;
 			struct page *page;
 
 			cond_resched_lock(&mm->page_table_lock);
-			while (!(page = follow_page(mm, start, write))) {
+			while (!(page = follow_page(mm, start, write_access))) {
 				/*
 				 * Shortcut for anonymous pages. We don't want
 				 * to force the creation of pages tables for
@@ -957,7 +961,16 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 					break;
 				}
 				spin_unlock(&mm->page_table_lock);
-				switch (handle_mm_fault(mm,vma,start,write)) {
+				switch (__handle_mm_fault(mm, vma, start,
+							write_access)) {
+				case VM_FAULT_WRITE:
+					/*
+					 * do_wp_page has broken COW when
+					 * necessary, even if maybe_mkwrite
+					 * decided not to set pte_write
+					 */
+					write_access = 0;
+					/* FALLTHRU */
 				case VM_FAULT_MINOR:
 					tsk->min_flt++;
 					break;
@@ -1220,6 +1233,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 	struct page *old_page, *new_page;
 	unsigned long pfn = pte_pfn(pte);
 	pte_t entry;
+	int ret;
 
 	if (unlikely(!pfn_valid(pfn))) {
 		/*
@@ -1247,7 +1261,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 			lazy_mmu_prot_update(entry);
 			pte_unmap(page_table);
 			spin_unlock(&mm->page_table_lock);
-			return VM_FAULT_MINOR;
+			return VM_FAULT_MINOR|VM_FAULT_WRITE;
 		}
 	}
 	pte_unmap(page_table);
@@ -1274,6 +1288,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
+	ret = VM_FAULT_MINOR;
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
 	if (likely(pte_same(*page_table, pte))) {
@@ -1290,12 +1305,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 
 		/* Free the old page.. */
 		new_page = old_page;
+		ret |= VM_FAULT_WRITE;
 	}
 	pte_unmap(page_table);
 	page_cache_release(new_page);
 	page_cache_release(old_page);
 	spin_unlock(&mm->page_table_lock);
-	return VM_FAULT_MINOR;
+	return ret;
 
 no_new_page:
 	page_cache_release(old_page);
@@ -1987,7 +2003,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 	if (write_access) {
 		if (!pte_write(entry))
 			return do_wp_page(mm, vma, address, pte, pmd, entry);
-
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
@@ -2002,7 +2017,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
 		unsigned long address, int write_access)
 {
 	pgd_t *pgd;
-- 
cgit v1.2.3-59-g8ed1b


From 6b8b3e8a8b3e62b4209eaa36697e3c9df457e196 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@cse.unsw.edu.au>
Date: Thu, 4 Aug 2005 12:53:35 -0700
Subject: [PATCH] md: make sure md bitmap updates are flushed when array is
 stopped.

The recent change to never ignore the bitmap revealed that the bitmap isn't
being flushed properly when an array is stopped.

We call bitmap_daemon_work three times as there is a three-stage pipeline for
flushing updates to the bitmap file.

Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/bitmap.c         | 23 +++++++++++++++++++++++
 drivers/md/md.c             |  2 ++
 include/linux/raid/bitmap.h |  1 +
 3 files changed, 26 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 09d32db06d20..41df4cda66e2 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1450,6 +1450,29 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset)
 
 }
 
+/*
+ * flush out any pending updates
+ */
+void bitmap_flush(mddev_t *mddev)
+{
+	struct bitmap *bitmap = mddev->bitmap;
+	int sleep;
+
+	if (!bitmap) /* there was no bitmap */
+		return;
+
+	/* run the daemon_work three time to ensure everything is flushed
+	 * that can be
+	 */
+	sleep = bitmap->daemon_sleep;
+	bitmap->daemon_sleep = 0;
+	bitmap_daemon_work(bitmap);
+	bitmap_daemon_work(bitmap);
+	bitmap_daemon_work(bitmap);
+	bitmap->daemon_sleep = sleep;
+	bitmap_update_sb(bitmap);
+}
+
 /*
  * free memory that was allocated
  */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9fd4dbea0d0d..480f658db6f2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1798,6 +1798,8 @@ static int do_md_stop(mddev_t * mddev, int ro)
 				goto out;
 			mddev->ro = 1;
 		} else {
+			bitmap_flush(mddev);
+			wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
 			if (mddev->ro)
 				set_disk_ro(disk, 0);
 			blk_queue_make_request(mddev->queue, md_fail_request);
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index 6213e976eade..4bf1659f8aa8 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -248,6 +248,7 @@ struct bitmap {
 
 /* these are used only by md/bitmap */
 int  bitmap_create(mddev_t *mddev);
+void bitmap_flush(mddev_t *mddev);
 void bitmap_destroy(mddev_t *mddev);
 int  bitmap_active(struct bitmap *bitmap);
 
-- 
cgit v1.2.3-59-g8ed1b


From fec59a711eef002d4ef9eb8de09dd0a26986eb77 Mon Sep 17 00:00:00 2001
From: "John W. Linville" <linville@tuxdriver.com>
Date: Thu, 4 Aug 2005 18:06:10 -0700
Subject: [PATCH] PCI: restore BAR values after D3hot->D0 for devices that need
 it

Some PCI devices (e.g. 3c905B, 3c556B) lose all configuration
(including BARs) when transitioning from D3hot->D0.  This leaves such
a device in an inaccessible state.  The patch below causes the BARs
to be restored when enabling such a device, so that its driver will
be able to access it.

The patch also adds pci_restore_bars as a new global symbol, and adds a
corresponding EXPORT_SYMBOL_GPL for that.

Some firmware (e.g. Thinkpad T21) leaves devices in D3hot after a
(re)boot.  Most drivers call pci_enable_device very early, so devices
left in D3hot that lose configuration during the D3hot->D0 transition
will be inaccessible to their drivers.

Drivers could be modified to account for this, but it would
be difficult to know which drivers need modification.  This is
especially true since often many devices are covered by the same
driver.  It likely would be necessary to replicate code across dozens
of drivers.

The patch below should trigger only when transitioning from D3hot->D0
(or at boot), and only for devices that have the "no soft reset" bit
cleared in the PM control register.  I believe it is safe to include
this patch as part of the PCI infrastructure.

The cleanest implementation of pci_restore_bars was to call
pci_update_resource.  Unfortunately, that does not currently exist
for the sparc64 architecture.  The patch below includes a null
implementation of pci_update_resource for sparc64.

Some have expressed interest in making general use of the
pci_restore_bars function, so that has been exported to GPL licensed
modules.

Signed-off-by: John W. Linville <linville@tuxdriver.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/sparc64/kernel/pci.c |  6 +++++
 drivers/pci/pci.c         | 59 +++++++++++++++++++++++++++++++++++++++++++----
 drivers/pci/setup-res.c   |  2 +-
 include/linux/pci.h       |  3 +++
 4 files changed, 65 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sparc64/kernel/pci.c b/arch/sparc64/kernel/pci.c
index bba140d98b1b..914e125d3971 100644
--- a/arch/sparc64/kernel/pci.c
+++ b/arch/sparc64/kernel/pci.c
@@ -413,6 +413,12 @@ static int pci_assign_bus_resource(const struct pci_bus *bus,
 	return -EBUSY;
 }
 
+void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno)
+{
+	/* Not implemented for sparc64... */
+	BUG();
+}
+
 int pci_assign_resource(struct pci_dev *pdev, int resource)
 {
 	struct pcidev_cookie *pcp = pdev->sysdata;
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 1b34fc56067e..65ea7d25f691 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -221,6 +221,37 @@ pci_find_parent_resource(const struct pci_dev *dev, struct resource *res)
 	return best;
 }
 
+/**
+ * pci_restore_bars - restore a devices BAR values (e.g. after wake-up)
+ * @dev: PCI device to have its BARs restored
+ *
+ * Restore the BAR values for a given device, so as to make it
+ * accessible by its driver.
+ */
+void
+pci_restore_bars(struct pci_dev *dev)
+{
+	int i, numres;
+
+	switch (dev->hdr_type) {
+	case PCI_HEADER_TYPE_NORMAL:
+		numres = 6;
+		break;
+	case PCI_HEADER_TYPE_BRIDGE:
+		numres = 2;
+		break;
+	case PCI_HEADER_TYPE_CARDBUS:
+		numres = 1;
+		break;
+	default:
+		/* Should never get here, but just in case... */
+		return;
+	}
+
+	for (i = 0; i < numres; i ++)
+		pci_update_resource(dev, &dev->resource[i], i);
+}
+
 /**
  * pci_set_power_state - Set the power state of a PCI device
  * @dev: PCI device to be suspended
@@ -239,7 +270,7 @@ int (*platform_pci_set_power_state)(struct pci_dev *dev, pci_power_t t);
 int
 pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 {
-	int pm;
+	int pm, need_restore = 0;
 	u16 pmcsr, pmc;
 
 	/* bound the state we're entering */
@@ -278,14 +309,17 @@ pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 			return -EIO;
 	}
 
+	pci_read_config_word(dev, pm + PCI_PM_CTRL, &pmcsr);
+
 	/* If we're in D3, force entire word to 0.
 	 * This doesn't affect PME_Status, disables PME_En, and
 	 * sets PowerState to 0.
 	 */
-	if (dev->current_state >= PCI_D3hot)
+	if (dev->current_state >= PCI_D3hot) {
+		if (!(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET))
+			need_restore = 1;
 		pmcsr = 0;
-	else {
-		pci_read_config_word(dev, pm + PCI_PM_CTRL, &pmcsr);
+	} else {
 		pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
 		pmcsr |= state;
 	}
@@ -308,6 +342,22 @@ pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 		platform_pci_set_power_state(dev, state);
 
 	dev->current_state = state;
+
+	/* According to section 5.4.1 of the "PCI BUS POWER MANAGEMENT
+	 * INTERFACE SPECIFICATION, REV. 1.2", a device transitioning
+	 * from D3hot to D0 _may_ perform an internal reset, thereby
+	 * going to "D0 Uninitialized" rather than "D0 Initialized".
+	 * For example, at least some versions of the 3c905B and the
+	 * 3c556B exhibit this behaviour.
+	 *
+	 * At least some laptop BIOSen (e.g. the Thinkpad T21) leave
+	 * devices in a D3hot state at boot.  Consequently, we need to
+	 * restore at least the BARs so that the device will be
+	 * accessible to its driver.
+	 */
+	if (need_restore)
+		pci_restore_bars(dev);
+
 	return 0;
 }
 
@@ -805,6 +855,7 @@ struct pci_dev *isa_bridge;
 EXPORT_SYMBOL(isa_bridge);
 #endif
 
+EXPORT_SYMBOL_GPL(pci_restore_bars);
 EXPORT_SYMBOL(pci_enable_device_bars);
 EXPORT_SYMBOL(pci_enable_device);
 EXPORT_SYMBOL(pci_disable_device);
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index 1ca21d2ba11c..878fd0a65c02 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -26,7 +26,7 @@
 #include "pci.h"
 
 
-static void
+void
 pci_update_resource(struct pci_dev *dev, struct resource *res, int resno)
 {
 	struct pci_bus_region region;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8621cf42b46f..98bdd95fcee9 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -225,6 +225,7 @@
 #define  PCI_PM_CAP_PME_D3cold  0x8000  /* PME# from D3 (cold) */
 #define PCI_PM_CTRL		4	/* PM control and status register */
 #define  PCI_PM_CTRL_STATE_MASK	0x0003	/* Current power state (D0 to D3) */
+#define  PCI_PM_CTRL_NO_SOFT_RESET	0x0004	/* No reset for D3hot->D0 */
 #define  PCI_PM_CTRL_PME_ENABLE	0x0100	/* PME pin enable */
 #define  PCI_PM_CTRL_DATA_SEL_MASK	0x1e00	/* Data select (??) */
 #define  PCI_PM_CTRL_DATA_SCALE_MASK	0x6000	/* Data scale (??) */
@@ -816,7 +817,9 @@ int pci_set_mwi(struct pci_dev *dev);
 void pci_clear_mwi(struct pci_dev *dev);
 int pci_set_dma_mask(struct pci_dev *dev, u64 mask);
 int pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask);
+void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno);
 int pci_assign_resource(struct pci_dev *dev, int i);
+void pci_restore_bars(struct pci_dev *dev);
 
 /* ROM control related routines */
 void __iomem *pci_map_rom(struct pci_dev *pdev, size_t *size);
-- 
cgit v1.2.3-59-g8ed1b


From 0c3dba1534569734ba353afdf3f11def497ff2ac Mon Sep 17 00:00:00 2001
From: John McCutchan <ttb@tentacle.dhs.org>
Date: Thu, 4 Aug 2005 21:12:54 -0400
Subject: [PATCH] Clean up inotify delete race fix

This avoids the whole #ifdef mess by just getting a copy of
dentry->d_inode before d_delete is called - that makes the codepaths the
same for the INOTIFY/DNOTIFY cases as for the regular no-notify case.
I've been running this under a Gnome session for the last 10 minutes.
Inotify is being used extensively.

Signed-off-by: John McCutchan <ttb@tentacle.dhs.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namei.c               | 9 ++-------
 include/linux/fsnotify.h | 4 +---
 2 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 83559dce4286..32accb6a672f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1874,14 +1874,9 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 
 	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
 	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
-#if defined(CONFIG_INOTIFY) || defined(CONFIG_DNOTIFY)
-		dget(dentry);
+		struct inode *inode = dentry->d_inode;
 		d_delete(dentry);
-		fsnotify_unlink(dentry, dir);
-		dput(dentry);
-#else
-		d_delete(dentry);
-#endif
+		fsnotify_unlink(dentry, inode, dir);
 	}
 
 	return error;
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index e96a4306ab3b..1cb4935348d8 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -46,10 +46,8 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 /*
  * fsnotify_unlink - file was unlinked
  */
-static inline void fsnotify_unlink(struct dentry *dentry, struct inode *dir)
+static inline void fsnotify_unlink(struct dentry *dentry, struct inode *inode, struct inode *dir)
 {
-	struct inode *inode = dentry->d_inode;
-
 	inode_dir_notify(dir, DN_DELETE);
 	inotify_inode_queue_event(dir, IN_DELETE, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL);
-- 
cgit v1.2.3-59-g8ed1b


From ba02508248e90a9d696aebd18b48a3290235b53c Mon Sep 17 00:00:00 2001
From: Tejun Heo <htejun@gmail.com>
Date: Fri, 5 Aug 2005 13:28:11 -0700
Subject: [PATCH] blk: fix tag shrinking (revive real_max_size)

My patch in commit fa72b903f75e4f0f0b2c2feed093005167da4023 incorrectly
removed blk_queue_tag->real_max_depth.

The original resize implementation was incorrect in the following
points.

 * actual allocation size of tag_index was shorter than real_max_size,
   but assumed to be of the same size, possibly causing memory access
   beyond the allocated area.
 * bits in tag_map between max_depth and real_max_depth were
   initialized to 1's, making the tags permanently reserved.

In an attempt to fix above two bugs, I had removed allocation optimization
in init_tag_map and real_max_size.  Tag map/index were allocated and freed
immediately during resize.

Unfortunately, I wasn't considering that tag map/index can be resized
dynamically with tags beyond new_depth active.  This led to accessing
freed area after shrinking tags and led to the following bug reporting
thread on linux-scsi.

   http://marc.theaimsgroup.com/?l=linux-scsi&m=112319898111885&w=2

To fix the problem, I've revived real_max_depth without allocation
optimization in init_tag_map, and Andrew Vasquez confirmed that the
problem was fixed.  As Jens is not going to be available for a week, he
asked me to make sure that this patch reaches you.

   http://marc.theaimsgroup.com/?l=linux-scsi&m=112325778530886&w=2

Also, a comment was added to make it clear that real_max_depth is needed for
dynamic shrinking.

Signed-off-by: Tejun Heo <htejun@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/ll_rw_blk.c | 18 +++++++++++++++---
 include/linux/blkdev.h    |  1 +
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 692a5fced76e..3c818544475e 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -719,7 +719,7 @@ struct request *blk_queue_find_tag(request_queue_t *q, int tag)
 {
 	struct blk_queue_tag *bqt = q->queue_tags;
 
-	if (unlikely(bqt == NULL || tag >= bqt->max_depth))
+	if (unlikely(bqt == NULL || tag >= bqt->real_max_depth))
 		return NULL;
 
 	return bqt->tag_index[tag];
@@ -798,6 +798,7 @@ init_tag_map(request_queue_t *q, struct blk_queue_tag *tags, int depth)
 
 	memset(tag_index, 0, depth * sizeof(struct request *));
 	memset(tag_map, 0, nr_ulongs * sizeof(unsigned long));
+	tags->real_max_depth = depth;
 	tags->max_depth = depth;
 	tags->tag_index = tag_index;
 	tags->tag_map = tag_map;
@@ -871,12 +872,23 @@ int blk_queue_resize_tags(request_queue_t *q, int new_depth)
 	if (!bqt)
 		return -ENXIO;
 
+	/*
+	 * if we already have large enough real_max_depth.  just
+	 * adjust max_depth.  *NOTE* as requests with tag value
+	 * between new_depth and real_max_depth can be in-flight, tag
+	 * map can not be shrunk blindly here.
+	 */
+	if (new_depth <= bqt->real_max_depth) {
+		bqt->max_depth = new_depth;
+		return 0;
+	}
+
 	/*
 	 * save the old state info, so we can copy it back
 	 */
 	tag_index = bqt->tag_index;
 	tag_map = bqt->tag_map;
-	max_depth = bqt->max_depth;
+	max_depth = bqt->real_max_depth;
 
 	if (init_tag_map(q, bqt, new_depth))
 		return -ENOMEM;
@@ -913,7 +925,7 @@ void blk_queue_end_tag(request_queue_t *q, struct request *rq)
 
 	BUG_ON(tag == -1);
 
-	if (unlikely(tag >= bqt->max_depth))
+	if (unlikely(tag >= bqt->real_max_depth))
 		/*
 		 * This can happen after tag depth has been reduced.
 		 * FIXME: how about a warning or info message here?
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0881b5cdee3d..19bd8e7e11bf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -301,6 +301,7 @@ struct blk_queue_tag {
 	struct list_head busy_list;	/* fifo list of busy tags */
 	int busy;			/* current depth */
 	int max_depth;			/* what we will send to device */
+	int real_max_depth;		/* what the array can hold */
 	atomic_t refcnt;		/* map can be shared */
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From 243393c90f2b7cb781fd794e22786e9c8547901a Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Sat, 6 Aug 2005 09:39:57 -0700
Subject: Add fakey 'deflateBound()' function to the in-kernel zlib routines

It's not the real deflateBound() in newer zlib libraries, partly because
the upcoming usage of it won't have the "stream" available, so we can't
have the same interfaces anyway.
---
 include/linux/zlib.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/zlib.h b/include/linux/zlib.h
index 850076ea14d3..74f7b78c22d2 100644
--- a/include/linux/zlib.h
+++ b/include/linux/zlib.h
@@ -506,6 +506,11 @@ extern int zlib_deflateReset (z_streamp strm);
    stream state was inconsistent (such as zalloc or state being NULL).
 */
 
+static inline unsigned long deflateBound(unsigned long s)
+{
+	return s + ((s + 7) >> 3) + ((s + 63) >> 6) + 11;
+}
+
 extern int zlib_deflateParams (z_streamp strm, int level, int strategy);
 /*
      Dynamically update the compression level and compression strategy.  The
-- 
cgit v1.2.3-59-g8ed1b


From 9ae5b3c703cce89a7d8ccf25fe16955ec6f016c0 Mon Sep 17 00:00:00 2001
From: Olaf Hering <olh@suse.de>
Date: Sun, 7 Aug 2005 09:42:24 -0700
Subject: [PATCH] remove linux/pagemap.h from linux/swap.h

sparc can not include linux/pagemap.h because of the following circular
dependency:

asm-sparc/pgtable include linux/swap.h
linux/swap.h include now linux/pagemap.h
linux/pagemap.h include linux/mm.h
linux/mm.h include asm/pgtable.h

It needs to have the swp_entry_t type fully visible in pgtable.h,
we can't work around this using macros.

Signed-off-by: Olaf Hering <olh@suse.de>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 239f520cc49e..bfe3e763ccf2 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -7,7 +7,6 @@
 #include <linux/mmzone.h>
 #include <linux/list.h>
 #include <linux/sched.h>
-#include <linux/pagemap.h>
 
 #include <asm/atomic.h>
 #include <asm/page.h>
@@ -255,6 +254,8 @@ static inline void put_swap_token(struct mm_struct *mm)
 
 #define si_swapinfo(val) \
 	do { (val)->freeswap = (val)->totalswap = 0; } while (0)
+/* only sparc can not include linux/pagemap.h in this file
+ * so leave page_cache_release and release_pages undeclared... */
 #define free_page_and_swap_cache(page) \
 	page_cache_release(page)
 #define free_pages_and_swap_cache(pages, nr) \
-- 
cgit v1.2.3-59-g8ed1b


From 7a91bf7f5c22c8407a9991cbd9ce5bb87caa6b4a Mon Sep 17 00:00:00 2001
From: John McCutchan <ttb@tentacle.dhs.org>
Date: Mon, 8 Aug 2005 13:52:16 -0400
Subject: [PATCH] fsnotify_name/inoderemove

The patch below unhooks fsnotify from vfs_unlink & vfs_rmdir.  It
introduces two new fsnotify calls, that are hooked in at the dcache
level.  This not only more closely matches how the VFS layer works, it
also avoids the problem with locking and inode lifetimes.

The two functions are

 - fsnotify_nameremove -- called when a directory entry is going away.
   It notifies the PARENT of the deletion.  This is called from
   d_delete().

 - fsnotify_inoderemove -- called when the file's inode itself is going away.  It
   notifies the inode that is being deleted.  This is called from
   dentry_iput().

Signed-off-by: John McCutchan <ttb@tentacle.dhs.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/dcache.c              |  7 +++++++
 fs/namei.c               |  3 ---
 include/linux/fsnotify.h | 20 ++++++++++++++++++++
 3 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/dcache.c b/fs/dcache.c
index 3aa8a7e980d8..a15a2e1f5520 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -19,6 +19,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/fs.h>
+#include <linux/fsnotify.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/smp_lock.h>
@@ -101,6 +102,7 @@ static inline void dentry_iput(struct dentry * dentry)
 		list_del_init(&dentry->d_alias);
 		spin_unlock(&dentry->d_lock);
 		spin_unlock(&dcache_lock);
+		fsnotify_inoderemove(inode);
 		if (dentry->d_op && dentry->d_op->d_iput)
 			dentry->d_op->d_iput(dentry, inode);
 		else
@@ -1165,13 +1167,16 @@ out:
  
 void d_delete(struct dentry * dentry)
 {
+	int isdir = 0;
 	/*
 	 * Are we the only user?
 	 */
 	spin_lock(&dcache_lock);
 	spin_lock(&dentry->d_lock);
+	isdir = S_ISDIR(dentry->d_inode->i_mode);
 	if (atomic_read(&dentry->d_count) == 1) {
 		dentry_iput(dentry);
+		fsnotify_nameremove(dentry, isdir);
 		return;
 	}
 
@@ -1180,6 +1185,8 @@ void d_delete(struct dentry * dentry)
 
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&dcache_lock);
+
+	fsnotify_nameremove(dentry, isdir);
 }
 
 static void __d_rehash(struct dentry * entry, struct hlist_head *list)
diff --git a/fs/namei.c b/fs/namei.c
index 32accb6a672f..57046d98a746 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1802,7 +1802,6 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 	up(&dentry->d_inode->i_sem);
 	if (!error) {
 		d_delete(dentry);
-		fsnotify_rmdir(dentry, dentry->d_inode, dir);
 	}
 	dput(dentry);
 
@@ -1874,9 +1873,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
 
 	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
 	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
-		struct inode *inode = dentry->d_inode;
 		d_delete(dentry);
-		fsnotify_unlink(dentry, inode, dir);
 	}
 
 	return error;
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 1cb4935348d8..9db31d251c20 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -67,6 +67,26 @@ static inline void fsnotify_rmdir(struct dentry *dentry, struct inode *inode,
 	inotify_inode_is_dead(inode);
 }
 
+/*
+ * fsnotify_nameremove - a filename was removed from a directory
+ */
+static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
+{
+	if (isdir)
+		isdir = IN_ISDIR;
+	dnotify_parent(dentry, DN_DELETE);
+	inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name);
+}
+
+/*
+ * fsnotify_inoderemove - an inode is going away
+ */
+static inline void fsnotify_inoderemove(struct inode *inode)
+{
+	inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL);
+	inotify_inode_is_dead(inode);
+}
+
 /*
  * fsnotify_create - 'name' was linked in
  */
-- 
cgit v1.2.3-59-g8ed1b


From 4d479e40e1748a877a24015fc6727b27b77110cd Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 8 Aug 2005 13:48:02 -0700
Subject: [NETLINK]: Allocate and kill some netlink numbers.

NETLINK_ARPD is unused, allocate it to the Open-iSCSI folks.

NETLINK_ROUTE6 and NETLINK_TAPBASE are no longer used, delete
them.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 70c2a9dc4b2b..6552b71bfa73 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -12,15 +12,13 @@
 #define NETLINK_NFLOG		5	/* netfilter/iptables ULOG */
 #define NETLINK_XFRM		6	/* ipsec */
 #define NETLINK_SELINUX		7	/* SELinux event notifications */
-#define NETLINK_ARPD		8
+#define NETLINK_ISCSI		8	/* Open-iSCSI */
 #define NETLINK_AUDIT		9	/* auditing */
 #define NETLINK_FIB_LOOKUP	10	
-#define NETLINK_ROUTE6		11	/* af_inet6 route comm channel */
 #define NETLINK_NETFILTER	12	/* netfilter subsystem */
 #define NETLINK_IP6_FW		13
 #define NETLINK_DNRTMSG		14	/* DECnet routing messages */
 #define NETLINK_KOBJECT_UEVENT	15	/* Kernel messages to userspace */
-#define NETLINK_TAPBASE		16	/* 16 to 31 are ethertap */
 
 #define MAX_LINKS 32		
 
-- 
cgit v1.2.3-59-g8ed1b


From dc836b5b6fcde95f750a4790d8200fabaf563dc9 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Mon, 8 Aug 2005 18:46:09 -0700
Subject: Revert "[PATCH] PCI: restore BAR values..."

Revert commit fec59a711eef002d4ef9eb8de09dd0a26986eb77, which is
breaking sparc64 that doesn't have a working pci_update_resource.

We'll re-do this after 2.6.13 when we'll do it all properly.
---
 arch/sparc64/kernel/pci.c |  6 -----
 drivers/pci/pci.c         | 59 ++++-------------------------------------------
 drivers/pci/setup-res.c   |  2 +-
 include/linux/pci.h       |  3 ---
 4 files changed, 5 insertions(+), 65 deletions(-)

(limited to 'include/linux')

diff --git a/arch/sparc64/kernel/pci.c b/arch/sparc64/kernel/pci.c
index 914e125d3971..bba140d98b1b 100644
--- a/arch/sparc64/kernel/pci.c
+++ b/arch/sparc64/kernel/pci.c
@@ -413,12 +413,6 @@ static int pci_assign_bus_resource(const struct pci_bus *bus,
 	return -EBUSY;
 }
 
-void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno)
-{
-	/* Not implemented for sparc64... */
-	BUG();
-}
-
 int pci_assign_resource(struct pci_dev *pdev, int resource)
 {
 	struct pcidev_cookie *pcp = pdev->sysdata;
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 65ea7d25f691..1b34fc56067e 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -221,37 +221,6 @@ pci_find_parent_resource(const struct pci_dev *dev, struct resource *res)
 	return best;
 }
 
-/**
- * pci_restore_bars - restore a devices BAR values (e.g. after wake-up)
- * @dev: PCI device to have its BARs restored
- *
- * Restore the BAR values for a given device, so as to make it
- * accessible by its driver.
- */
-void
-pci_restore_bars(struct pci_dev *dev)
-{
-	int i, numres;
-
-	switch (dev->hdr_type) {
-	case PCI_HEADER_TYPE_NORMAL:
-		numres = 6;
-		break;
-	case PCI_HEADER_TYPE_BRIDGE:
-		numres = 2;
-		break;
-	case PCI_HEADER_TYPE_CARDBUS:
-		numres = 1;
-		break;
-	default:
-		/* Should never get here, but just in case... */
-		return;
-	}
-
-	for (i = 0; i < numres; i ++)
-		pci_update_resource(dev, &dev->resource[i], i);
-}
-
 /**
  * pci_set_power_state - Set the power state of a PCI device
  * @dev: PCI device to be suspended
@@ -270,7 +239,7 @@ int (*platform_pci_set_power_state)(struct pci_dev *dev, pci_power_t t);
 int
 pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 {
-	int pm, need_restore = 0;
+	int pm;
 	u16 pmcsr, pmc;
 
 	/* bound the state we're entering */
@@ -309,17 +278,14 @@ pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 			return -EIO;
 	}
 
-	pci_read_config_word(dev, pm + PCI_PM_CTRL, &pmcsr);
-
 	/* If we're in D3, force entire word to 0.
 	 * This doesn't affect PME_Status, disables PME_En, and
 	 * sets PowerState to 0.
 	 */
-	if (dev->current_state >= PCI_D3hot) {
-		if (!(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET))
-			need_restore = 1;
+	if (dev->current_state >= PCI_D3hot)
 		pmcsr = 0;
-	} else {
+	else {
+		pci_read_config_word(dev, pm + PCI_PM_CTRL, &pmcsr);
 		pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
 		pmcsr |= state;
 	}
@@ -342,22 +308,6 @@ pci_set_power_state(struct pci_dev *dev, pci_power_t state)
 		platform_pci_set_power_state(dev, state);
 
 	dev->current_state = state;
-
-	/* According to section 5.4.1 of the "PCI BUS POWER MANAGEMENT
-	 * INTERFACE SPECIFICATION, REV. 1.2", a device transitioning
-	 * from D3hot to D0 _may_ perform an internal reset, thereby
-	 * going to "D0 Uninitialized" rather than "D0 Initialized".
-	 * For example, at least some versions of the 3c905B and the
-	 * 3c556B exhibit this behaviour.
-	 *
-	 * At least some laptop BIOSen (e.g. the Thinkpad T21) leave
-	 * devices in a D3hot state at boot.  Consequently, we need to
-	 * restore at least the BARs so that the device will be
-	 * accessible to its driver.
-	 */
-	if (need_restore)
-		pci_restore_bars(dev);
-
 	return 0;
 }
 
@@ -855,7 +805,6 @@ struct pci_dev *isa_bridge;
 EXPORT_SYMBOL(isa_bridge);
 #endif
 
-EXPORT_SYMBOL_GPL(pci_restore_bars);
 EXPORT_SYMBOL(pci_enable_device_bars);
 EXPORT_SYMBOL(pci_enable_device);
 EXPORT_SYMBOL(pci_disable_device);
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index 589486704ce3..84eedc965688 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -26,7 +26,7 @@
 #include "pci.h"
 
 
-void
+static void
 pci_update_resource(struct pci_dev *dev, struct resource *res, int resno)
 {
 	struct pci_bus_region region;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 98bdd95fcee9..8621cf42b46f 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -225,7 +225,6 @@
 #define  PCI_PM_CAP_PME_D3cold  0x8000  /* PME# from D3 (cold) */
 #define PCI_PM_CTRL		4	/* PM control and status register */
 #define  PCI_PM_CTRL_STATE_MASK	0x0003	/* Current power state (D0 to D3) */
-#define  PCI_PM_CTRL_NO_SOFT_RESET	0x0004	/* No reset for D3hot->D0 */
 #define  PCI_PM_CTRL_PME_ENABLE	0x0100	/* PME pin enable */
 #define  PCI_PM_CTRL_DATA_SEL_MASK	0x1e00	/* Data select (??) */
 #define  PCI_PM_CTRL_DATA_SCALE_MASK	0x6000	/* Data scale (??) */
@@ -817,9 +816,7 @@ int pci_set_mwi(struct pci_dev *dev);
 void pci_clear_mwi(struct pci_dev *dev);
 int pci_set_dma_mask(struct pci_dev *dev, u64 mask);
 int pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask);
-void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno);
 int pci_assign_resource(struct pci_dev *dev, int i);
-void pci_restore_bars(struct pci_dev *dev);
 
 /* ROM control related routines */
 void __iomem *pci_map_rom(struct pci_dev *pdev, size_t *size);
-- 
cgit v1.2.3-59-g8ed1b


From 00dd1e433967872f3997a45d5adf35056fdf2f56 Mon Sep 17 00:00:00 2001
From: John McCutchan <ttb@tentacle.dhs.org>
Date: Mon, 8 Aug 2005 22:13:05 -0400
Subject: [PATCH] fsnotify-cleanups

This removes the now unused fsnotify_unlink & fsnotify_rmdir code.
Compile tested.

Signed-off-by: John McCutchan <ttb@tentacle.dhs.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/fsnotify.h | 24 ------------------------
 1 file changed, 24 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 9db31d251c20..602c305c8585 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -43,30 +43,6 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	}
 }
 
-/*
- * fsnotify_unlink - file was unlinked
- */
-static inline void fsnotify_unlink(struct dentry *dentry, struct inode *inode, struct inode *dir)
-{
-	inode_dir_notify(dir, DN_DELETE);
-	inotify_inode_queue_event(dir, IN_DELETE, 0, dentry->d_name.name);
-	inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL);
-
-	inotify_inode_is_dead(inode);
-}
-
-/*
- * fsnotify_rmdir - directory was removed
- */
-static inline void fsnotify_rmdir(struct dentry *dentry, struct inode *inode,
-				  struct inode *dir)
-{
-	inode_dir_notify(dir, DN_DELETE);
-	inotify_inode_queue_event(dir,IN_DELETE|IN_ISDIR,0,dentry->d_name.name);
-	inotify_inode_queue_event(inode, IN_DELETE_SELF | IN_ISDIR, 0, NULL);
-	inotify_inode_is_dead(inode);
-}
-
 /*
  * fsnotify_nameremove - a filename was removed from a directory
  */
-- 
cgit v1.2.3-59-g8ed1b


From 86b3786078d63242d3194ffc58ae8dae1d1bbef3 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <christoph@lameter.com>
Date: Tue, 9 Aug 2005 19:59:21 -0700
Subject: [PATCH] Fix ide-disk.c oops caused by hwif == NULL

1. Move hwif_to_node to ide.h

2. Use hwif_to_node in ide-disk.c

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/ide/ide-disk.c  | 2 +-
 drivers/ide/ide-probe.c | 9 ---------
 include/linux/ide.h     | 6 ++++++
 3 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index f9c1acb4ed6a..c9d3a00a3c0c 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -1220,7 +1220,7 @@ static int ide_disk_probe(struct device *dev)
 		goto failed;
 
 	g = alloc_disk_node(1 << PARTN_BITS,
-			pcibus_to_node(drive->hwif->pci_dev->bus));
+			hwif_to_node(drive->hwif));
 	if (!g)
 		goto out_free_idkp;
 
diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c
index 94daf40ae323..c1128ae5cd2f 100644
--- a/drivers/ide/ide-probe.c
+++ b/drivers/ide/ide-probe.c
@@ -960,15 +960,6 @@ static void save_match(ide_hwif_t *hwif, ide_hwif_t *new, ide_hwif_t **match)
 }
 #endif /* MAX_HWIFS > 1 */
 
-static inline int hwif_to_node(ide_hwif_t *hwif)
-{
-	if (hwif->pci_dev)
-		return pcibus_to_node(hwif->pci_dev->bus);
-	else
-		/* Add ways to determine the node of other busses here */
-		return -1;
-}
-
 /*
  * init request queue
  */
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 92129078d4f3..a6dbb51ecd7b 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1501,4 +1501,10 @@ extern struct bus_type ide_bus_type;
 #define ide_id_has_flush_cache_ext(id)	\
 	(((id)->cfs_enable_2 & 0x2400) == 0x2400)
 
+static inline int hwif_to_node(ide_hwif_t *hwif)
+{
+	struct pci_dev *dev = hwif->pci_dev;
+	return dev ? pcibus_to_node(dev->bus) : -1;
+}
+
 #endif /* _IDE_H */
-- 
cgit v1.2.3-59-g8ed1b


From a0d3bea3cf6c7c1b53a46432bd490b5dc784ca42 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 11 Aug 2005 16:05:50 -0700
Subject: [NET]: Make skb->protocol __be16

There are many instances of

	skb->protocol = htons(ETH_P_*);
	skb->protocol = __constant_htons(ETH_P_*);
and
	skb->protocol = *_type_trans(...);

Most of the *_type_trans() functions are already endian-annotated, so let's
shift attention to the other warnings.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 0061c9470482..948527e42a60 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -255,7 +255,7 @@ struct sk_buff {
 				nohdr:1;
 				/* 3 bits spare */
 	__u8			pkt_type;
-	__u16			protocol;
+	__be16			protocol;
 
 	void			(*destructor)(struct sk_buff *skb);
 #ifdef CONFIG_NETFILTER
-- 
cgit v1.2.3-59-g8ed1b


From 0db1d6fc1ea051af49ebe03c503d23996a7c5bbb Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Thu, 11 Aug 2005 19:25:54 -0700
Subject: [NETPOLL]: add retry timeout

Add limited retry logic to netpoll_send_skb

Each time we attempt to send, decrement our per-device retry counter.
On every successful send, we reset the counter.

We delay 50us between attempts with up to 20000 retries for a total of
1 second. After we've exhausted our retries, subsequent failed
attempts will try only once until reset by success.

Signed-off-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netpoll.h |  1 +
 net/core/netpoll.c      | 13 ++++++++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index bcd0ac33f592..be68d94b03d5 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -26,6 +26,7 @@ struct netpoll {
 struct netpoll_info {
 	spinlock_t poll_lock;
 	int poll_owner;
+	int tries;
 	int rx_flags;
 	spinlock_t rx_lock;
 	struct netpoll *rx_np; /* netpoll that registered an rx_hook */
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 59ed186e4f46..d09affdbad3c 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -33,6 +33,7 @@
 #define MAX_UDP_CHUNK 1460
 #define MAX_SKBS 32
 #define MAX_QUEUE_DEPTH (MAX_SKBS / 2)
+#define MAX_RETRIES 20000
 
 static DEFINE_SPINLOCK(skb_list_lock);
 static int nr_skbs;
@@ -265,7 +266,8 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 		return;
 	}
 
-	while (1) {
+	do {
+		npinfo->tries--;
 		spin_lock(&np->dev->xmit_lock);
 		np->dev->xmit_lock_owner = smp_processor_id();
 
@@ -277,6 +279,7 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 			np->dev->xmit_lock_owner = -1;
 			spin_unlock(&np->dev->xmit_lock);
 			netpoll_poll(np);
+			udelay(50);
 			continue;
 		}
 
@@ -285,12 +288,15 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
 		spin_unlock(&np->dev->xmit_lock);
 
 		/* success */
-		if(!status)
+		if(!status) {
+			npinfo->tries = MAX_RETRIES; /* reset */
 			return;
+		}
 
 		/* transmit busy */
 		netpoll_poll(np);
-	}
+		udelay(50);
+	} while (npinfo->tries > 0);
 }
 
 void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
@@ -642,6 +648,7 @@ int netpoll_setup(struct netpoll *np)
 		npinfo->rx_np = NULL;
 		npinfo->poll_lock = SPIN_LOCK_UNLOCKED;
 		npinfo->poll_owner = -1;
+		npinfo->tries = MAX_RETRIES;
 		npinfo->rx_lock = SPIN_LOCK_UNLOCKED;
 	} else
 		npinfo = ndev->npinfo;
-- 
cgit v1.2.3-59-g8ed1b


From 53fb95d3c14290fd6ee808b221e35493f096246f Mon Sep 17 00:00:00 2001
From: Matt Mackall <mpm@selenic.com>
Date: Thu, 11 Aug 2005 19:27:43 -0700
Subject: [NETPOLL]: fix initialization/NAPI race

This fixes a race during initialization with the NAPI softirq
processing by using an RCU approach.

This race was discovered when refill_skbs() was added to
the setup code.

Signed-off-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netpoll.h | 19 +++++++++++++------
 net/core/dev.c          |  9 +++++----
 net/core/netpoll.c      |  3 +++
 3 files changed, 21 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index be68d94b03d5..5ade54a78dbb 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -9,6 +9,7 @@
 
 #include <linux/netdevice.h>
 #include <linux/interrupt.h>
+#include <linux/rcupdate.h>
 #include <linux/list.h>
 
 struct netpoll;
@@ -61,25 +62,31 @@ static inline int netpoll_rx(struct sk_buff *skb)
 	return ret;
 }
 
-static inline void netpoll_poll_lock(struct net_device *dev)
+static inline void *netpoll_poll_lock(struct net_device *dev)
 {
+	rcu_read_lock(); /* deal with race on ->npinfo */
 	if (dev->npinfo) {
 		spin_lock(&dev->npinfo->poll_lock);
 		dev->npinfo->poll_owner = smp_processor_id();
+		return dev->npinfo;
 	}
+	return NULL;
 }
 
-static inline void netpoll_poll_unlock(struct net_device *dev)
+static inline void netpoll_poll_unlock(void *have)
 {
-	if (dev->npinfo) {
-		dev->npinfo->poll_owner = -1;
-		spin_unlock(&dev->npinfo->poll_lock);
+	struct netpoll_info *npi = have;
+
+	if (npi) {
+		npi->poll_owner = -1;
+		spin_unlock(&npi->poll_lock);
 	}
+	rcu_read_unlock();
 }
 
 #else
 #define netpoll_rx(a) 0
-#define netpoll_poll_lock(a)
+#define netpoll_poll_lock(a) 0
 #define netpoll_poll_unlock(a)
 #endif
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 52a3bf7ae177..faf59b02c4bf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1696,7 +1696,8 @@ static void net_rx_action(struct softirq_action *h)
 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
 	unsigned long start_time = jiffies;
 	int budget = netdev_budget;
-	
+	void *have;
+
 	local_irq_disable();
 
 	while (!list_empty(&queue->poll_list)) {
@@ -1709,10 +1710,10 @@ static void net_rx_action(struct softirq_action *h)
 
 		dev = list_entry(queue->poll_list.next,
 				 struct net_device, poll_list);
-		netpoll_poll_lock(dev);
+		have = netpoll_poll_lock(dev);
 
 		if (dev->quota <= 0 || dev->poll(dev, &budget)) {
-			netpoll_poll_unlock(dev);
+			netpoll_poll_unlock(have);
 			local_irq_disable();
 			list_del(&dev->poll_list);
 			list_add_tail(&dev->poll_list, &queue->poll_list);
@@ -1721,7 +1722,7 @@ static void net_rx_action(struct softirq_action *h)
 			else
 				dev->quota = dev->weight;
 		} else {
-			netpoll_poll_unlock(dev);
+			netpoll_poll_unlock(have);
 			dev_put(dev);
 			local_irq_disable();
 		}
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index c02a08da6d42..996787bca17f 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -732,6 +732,9 @@ int netpoll_setup(struct netpoll *np)
 	/* last thing to do is link it to the net device structure */
 	ndev->npinfo = npinfo;
 
+	/* avoid racing with NAPI reading npinfo */
+	synchronize_rcu();
+
 	return 0;
 
  release:
-- 
cgit v1.2.3-59-g8ed1b


From 89204c40a03346cd951e698d854105db4cfedc28 Mon Sep 17 00:00:00 2001
From: John McCutchan <jmccutchan@novell.com>
Date: Mon, 15 Aug 2005 12:13:28 -0400
Subject: [PATCH] inotify: add MOVE_SELF event

This adds a MOVE_SELF event to inotify.  It is sent whenever the inode
you are watching is moved.  We need this event so that we can catch
something like this:

 - app1:
	watch /etc/mtab

 - app2:
	cp /etc/mtab /tmp/mtab-work
	mv /etc/mtab /etc/mtab~
	mv /tmp/mtab-work /etc/mtab

app1 still thinks it's watching /etc/mtab but it's actually watching
/etc/mtab~.

Signed-off-by: John McCutchan <ttb@tentacle.dhs.org>
Signed-off-by: Robert Love <rml@novell.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namei.c               | 3 ++-
 include/linux/fsnotify.h | 6 +++++-
 include/linux/inotify.h  | 4 +++-
 3 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/namei.c b/fs/namei.c
index 57046d98a746..b85f158aef0c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2216,7 +2216,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
 	if (!error) {
 		const char *new_name = old_dentry->d_name.name;
-		fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir, new_dentry->d_inode);
+		fsnotify_move(old_dir, new_dir, old_name, new_name, is_dir,
+			      new_dentry->d_inode, old_dentry->d_inode);
 	}
 	fsnotify_oldname_free(old_name);
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 602c305c8585..03b8e7932b83 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -21,7 +21,7 @@
  */
 static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 				 const char *old_name, const char *new_name,
-				 int isdir, struct inode *target)
+				 int isdir, struct inode *target, struct inode *source)
 {
 	u32 cookie = inotify_get_cookie();
 
@@ -41,6 +41,10 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 		inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL);
 		inotify_inode_is_dead(target);
 	}
+
+	if (source) {
+		inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL);
+	}
 }
 
 /*
diff --git a/include/linux/inotify.h b/include/linux/inotify.h
index a40c2bf0408e..93bb3afe646b 100644
--- a/include/linux/inotify.h
+++ b/include/linux/inotify.h
@@ -35,6 +35,7 @@ struct inotify_event {
 #define IN_CREATE		0x00000100	/* Subfile was created */
 #define IN_DELETE		0x00000200	/* Subfile was deleted */
 #define IN_DELETE_SELF		0x00000400	/* Self was deleted */
+#define IN_MOVE_SELF		0x00000800	/* Self was moved */
 
 /* the following are legal events.  they are sent as needed to any watch */
 #define IN_UNMOUNT		0x00002000	/* Backing fs was unmounted */
@@ -56,7 +57,8 @@ struct inotify_event {
  */
 #define IN_ALL_EVENTS	(IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \
 			 IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | \
-			 IN_MOVED_TO | IN_DELETE | IN_CREATE | IN_DELETE_SELF)
+			 IN_MOVED_TO | IN_DELETE | IN_CREATE | IN_DELETE_SELF | \
+			 IN_MOVE_SELF)
 
 #ifdef __KERNEL__
 
-- 
cgit v1.2.3-59-g8ed1b


From 58fcb8df0bf663bb6b8f46cd3010bfe8d13d97cf Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 10 Aug 2005 18:15:12 -0400
Subject: [PATCH] NFS: Ensure ACL xdr code doesn't overflow.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfs_common/nfsacl.c     | 1 +
 include/linux/sunrpc/xdr.h | 1 +
 net/sunrpc/xdr.c           | 1 +
 3 files changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index 18c58c32e326..251e5a1bb1c4 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -239,6 +239,7 @@ nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
 	if (xdr_decode_word(buf, base, &entries) ||
 	    entries > NFS_ACL_MAX_ENTRIES)
 		return -EINVAL;
+	nfsacl_desc.desc.array_maxlen = entries;
 	err = xdr_decode_array2(buf, base + 4, &nfsacl_desc.desc);
 	if (err)
 		return err;
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 34ec3e8d99b3..23448d0fb5bc 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -177,6 +177,7 @@ typedef int (*xdr_xcode_elem_t)(struct xdr_array2_desc *desc, void *elem);
 struct xdr_array2_desc {
 	unsigned int elem_size;
 	unsigned int array_len;
+	unsigned int array_maxlen;
 	xdr_xcode_elem_t xcode;
 };
 
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 8a4d9c106af1..fde16f40a581 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -993,6 +993,7 @@ xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
 			return -EINVAL;
 	} else {
 		if (xdr_decode_word(buf, base, &desc->array_len) != 0 ||
+		    desc->array_len > desc->array_maxlen ||
 		    (unsigned long) base + 4 + desc->array_len *
 				    desc->elem_size > buf->len)
 			return -EINVAL;
-- 
cgit v1.2.3-59-g8ed1b


From 65e4308d2500e7daf60c3dccc202c61ffb066c63 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 16 Aug 2005 11:49:44 -0400
Subject: [PATCH] NFS: Ensure we always update inode->i_mode when doing O_EXCL
 creates

When the client performs an exclusive create and opens the file for writing,
a Netapp filer will first create the file using the mode 01777. It does this
since an NFSv3/v4 exclusive create cannot immediately set the mode bits.
The 01777 mode then gets put into the inode->i_mode. After the file creation
is successful, we then do a setattr to change the mode to the correct value
(as per the NFS spec).

The problem is that nfs_refresh_inode() no longer updates inode->i_mode, so
the latter retains the 01777 mode. A bit later, the VFS notices this, and calls
remove_suid(). This of course now resets the file mode to inode->i_mode & 0777.
Hey presto, the file mode on the server is now magically changed to 0777. Duh...

Fixes http://bugzilla.linux-nfs.org/show_bug.cgi?id=32

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfs/inode.c         | 37 ++++++++++++++++++++++++-------------
 fs/nfs/nfs3proc.c      |  4 ++++
 fs/nfs/nfs4proc.c      | 10 ++++++++--
 fs/nfs/proc.c          |  2 ++
 include/linux/nfs_fs.h |  1 +
 5 files changed, 39 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 4845911f1c63..bb7ca022bcb2 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -814,28 +814,39 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 		nfs_wb_all(inode);
 	}
 	error = NFS_PROTO(inode)->setattr(dentry, &fattr, attr);
-	if (error == 0) {
+	if (error == 0)
 		nfs_refresh_inode(inode, &fattr);
+	nfs_end_data_update(inode);
+	unlock_kernel();
+	return error;
+}
+
+/**
+ * nfs_setattr_update_inode - Update inode metadata after a setattr call.
+ * @inode: pointer to struct inode
+ * @attr: pointer to struct iattr
+ *
+ * Note: we do this in the *proc.c in order to ensure that
+ *       it works for things like exclusive creates too.
+ */
+void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
+{
+	if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) {
 		if ((attr->ia_valid & ATTR_MODE) != 0) {
-			int mode;
-			mode = inode->i_mode & ~S_IALLUGO;
-			mode |= attr->ia_mode & S_IALLUGO;
+			int mode = attr->ia_mode & S_IALLUGO;
+			mode |= inode->i_mode & ~S_IALLUGO;
 			inode->i_mode = mode;
 		}
 		if ((attr->ia_valid & ATTR_UID) != 0)
 			inode->i_uid = attr->ia_uid;
 		if ((attr->ia_valid & ATTR_GID) != 0)
 			inode->i_gid = attr->ia_gid;
-		if ((attr->ia_valid & ATTR_SIZE) != 0) {
-			inode->i_size = attr->ia_size;
-			vmtruncate(inode, attr->ia_size);
-		}
-	}
-	if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0)
 		NFS_FLAGS(inode) |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-	nfs_end_data_update(inode);
-	unlock_kernel();
-	return error;
+	}
+	if ((attr->ia_valid & ATTR_SIZE) != 0) {
+		inode->i_size = attr->ia_size;
+		vmtruncate(inode, attr->ia_size);
+	}
 }
 
 /*
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 7851569b31c6..2681485cf2d0 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -120,6 +120,8 @@ nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 	dprintk("NFS call  setattr\n");
 	fattr->valid = 0;
 	status = rpc_call(NFS_CLIENT(inode), NFS3PROC_SETATTR, &arg, fattr, 0);
+	if (status == 0)
+		nfs_setattr_update_inode(inode, sattr);
 	dprintk("NFS reply setattr: %d\n", status);
 	return status;
 }
@@ -370,6 +372,8 @@ again:
 		 * not sure this buys us anything (and I'd have
 		 * to revamp the NFSv3 XDR code) */
 		status = nfs3_proc_setattr(dentry, &fattr, sattr);
+		if (status == 0)
+			nfs_setattr_update_inode(dentry->d_inode, sattr);
 		nfs_refresh_inode(dentry->d_inode, &fattr);
 		dprintk("NFS reply setattr (post-create): %d\n", status);
 	}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1b76f80aedb9..0c5a308e4963 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -753,6 +753,7 @@ static int _nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr,
                 .rpc_argp       = &arg,
                 .rpc_resp       = &res,
         };
+	int status;
 
         fattr->valid = 0;
 
@@ -762,7 +763,8 @@ static int _nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr,
 	} else
 		memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid));
 
-	return rpc_call_sync(server->client, &msg, 0);
+	status = rpc_call_sync(server->client, &msg, 0);
+	return status;
 }
 
 static int nfs4_do_setattr(struct nfs_server *server, struct nfs_fattr *fattr,
@@ -1145,6 +1147,8 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 
 	status = nfs4_do_setattr(NFS_SERVER(inode), fattr,
 			NFS_FH(inode), sattr, state);
+	if (status == 0)
+		nfs_setattr_update_inode(inode, sattr);
 	if (state != NULL)
 		nfs4_close_state(state, FMODE_WRITE);
 	put_rpccred(cred);
@@ -1449,8 +1453,10 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		struct nfs_fattr fattr;
 		status = nfs4_do_setattr(NFS_SERVER(dir), &fattr,
 		                     NFS_FH(state->inode), sattr, state);
-		if (status == 0)
+		if (status == 0) {
+			nfs_setattr_update_inode(state->inode, sattr);
 			goto out;
+		}
 	} else if (flags != 0)
 		goto out;
 	nfs4_close_state(state, flags);
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index cedf636bcf3c..be23c3fb9260 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -114,6 +114,8 @@ nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr,
 	dprintk("NFS call  setattr\n");
 	fattr->valid = 0;
 	status = rpc_call(NFS_CLIENT(inode), NFSPROC_SETATTR, &arg, fattr, 0);
+	if (status == 0)
+		nfs_setattr_update_inode(inode, sattr);
 	dprintk("NFS reply setattr: %d\n", status);
 	return status;
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 8ea249110fb0..7d78a783c64a 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -292,6 +292,7 @@ extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
 extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *);
 extern void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping);
 extern int nfs_setattr(struct dentry *, struct iattr *);
+extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr);
 extern void nfs_begin_attr_update(struct inode *);
 extern void nfs_end_attr_update(struct inode *);
 extern void nfs_begin_data_update(struct inode *);
-- 
cgit v1.2.3-59-g8ed1b


From 4602b88d9743b5f20655de8078fb42e9fd25581f Mon Sep 17 00:00:00 2001
From: Kristen Accardi <kristen.c.accardi@intel.com>
Date: Tue, 16 Aug 2005 15:15:58 -0700
Subject: [PATCH] PCI: 6700/6702PXH quirk

On the 6700/6702 PXH part, an MSI may get corrupted if an ACPI hotplug
driver and an SHPC driver in MSI mode are used together.

This patch will prevent MSI from being enabled for the SHPC as part of
an early pci quirk, as well as on any pci device which sets the no_msi
bit.

Signed-off-by: Kristen Carlson Accardi <kristen.c.accardi@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/pci/msi.c       |  5 ++++-
 drivers/pci/pci.h       |  2 +-
 drivers/pci/quirks.c    | 21 +++++++++++++++++++++
 include/linux/pci.h     |  3 ++-
 include/linux/pci_ids.h |  5 +++++
 5 files changed, 33 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index b5ab9aa6ff7c..2b85aa39f954 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -453,7 +453,7 @@ static void enable_msi_mode(struct pci_dev *dev, int pos, int type)
 	}
 }
 
-static void disable_msi_mode(struct pci_dev *dev, int pos, int type)
+void disable_msi_mode(struct pci_dev *dev, int pos, int type)
 {
 	u16 control;
 
@@ -699,6 +699,9 @@ int pci_enable_msi(struct pci_dev* dev)
 	if (!pci_msi_enable || !dev)
  		return status;
 
+	if (dev->no_msi)
+		return status;
+
 	temp = dev->irq;
 
 	if ((status = msi_init()) < 0)
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index d94d7af4f7a0..fa36094aa0f9 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -46,7 +46,7 @@ extern int pci_msi_quirk;
 #else
 #define pci_msi_quirk 0
 #endif
-
+void disable_msi_mode(struct pci_dev *dev, int pos, int type);
 extern int pcie_mch_quirk;
 extern struct device_attribute pci_dev_attrs[];
 extern struct class_device_attribute class_device_attr_cpuaffinity;
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index a9160ad16581..bb36bb69803f 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -1291,6 +1291,27 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7520_MCH,	quir
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7320_MCH,	quirk_pcie_mch );
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_E7525_MCH,	quirk_pcie_mch );
 
+
+/*
+ * It's possible for the MSI to get corrupted if shpc and acpi
+ * are used together on certain PXH-based systems.
+ */
+static void __devinit quirk_pcie_pxh(struct pci_dev *dev)
+{
+	disable_msi_mode(dev, pci_find_capability(dev, PCI_CAP_ID_MSI),
+					PCI_CAP_ID_MSI);
+	dev->no_msi = 1;
+
+	printk(KERN_WARNING "PCI: PXH quirk detected, "
+		"disabling MSI for SHPC device\n");
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_PXHD_0,	quirk_pcie_pxh);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_PXHD_1,	quirk_pcie_pxh);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_PXH_0,	quirk_pcie_pxh);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_PXH_1,	quirk_pcie_pxh);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL,	PCI_DEVICE_ID_INTEL_PXHV,	quirk_pcie_pxh);
+
+
 static void __devinit quirk_netmos(struct pci_dev *dev)
 {
 	unsigned int num_parallel = (dev->subsystem_device & 0xf0) >> 4;
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8621cf42b46f..bc4c40000c0d 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -556,7 +556,8 @@ struct pci_dev {
 	/* keep track of device state */
 	unsigned int	is_enabled:1;	/* pci_enable_device has been called */
 	unsigned int	is_busmaster:1; /* device is busmaster */
-	
+	unsigned int	no_msi:1;	/* device may not use msi */
+
 	u32		saved_config_space[16]; /* config space saved at suspend time */
 	struct bin_attribute *rom_attr; /* attribute descriptor for sysfs ROM entry */
 	int rom_attr_enabled;		/* has display of the rom attribute been enabled? */
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index bc4cc10fabe9..51e61e96051c 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2281,6 +2281,11 @@
 #define PCI_VENDOR_ID_INTEL		0x8086
 #define PCI_DEVICE_ID_INTEL_EESSC	0x0008
 #define PCI_DEVICE_ID_INTEL_21145	0x0039
+#define PCI_DEVICE_ID_INTEL_PXHD_0	0x0320
+#define PCI_DEVICE_ID_INTEL_PXHD_1	0x0321
+#define PCI_DEVICE_ID_INTEL_PXH_0	0x0329
+#define PCI_DEVICE_ID_INTEL_PXH_1	0x032A
+#define PCI_DEVICE_ID_INTEL_PXHV	0x032C
 #define PCI_DEVICE_ID_INTEL_82375	0x0482
 #define PCI_DEVICE_ID_INTEL_82424	0x0483
 #define PCI_DEVICE_ID_INTEL_82378	0x0484
-- 
cgit v1.2.3-59-g8ed1b


From 5529680981807b44abf3be30fb6d612ff04f68ff Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@citi.umich.edu>
Date: Thu, 18 Aug 2005 11:24:09 -0700
Subject: [PATCH] NFS: split nfsi->flags into two fields

Certain bits in nfsi->flags can be manipulated with atomic bitops, and some
are better manipulated via logical bitmask operations.

This patch splits the flags field into two.  The next patch introduces atomic
bitops for one of the fields.

Test plan:
 Millions of fsx ops on SMP clients.

Signed-off-by: Chuck Lever <cel@netapp.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfs/dir.c           | 16 +++++++------
 fs/nfs/file.c          |  5 +++--
 fs/nfs/inode.c         | 61 ++++++++++++++++++++++++++------------------------
 fs/nfs/nfs3acl.c       |  2 +-
 fs/nfs/read.c          |  4 ++--
 include/linux/nfs_fs.h | 27 +++++++++++++---------
 6 files changed, 63 insertions(+), 52 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index b38a57e78a63..5732e13cd0da 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -189,7 +189,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 		goto error;
 	}
 	SetPageUptodate(page);
-	NFS_FLAGS(inode) |= NFS_INO_INVALID_ATIME;
+	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
 	/* Ensure consistent page alignment of the data.
 	 * Note: assumes we have exclusive access to this mapping either
 	 *	 through inode->i_sem or some other mechanism.
@@ -462,7 +462,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 						page,
 						NFS_SERVER(inode)->dtsize,
 						desc->plus);
-	NFS_FLAGS(inode) |= NFS_INO_INVALID_ATIME;
+	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
 	desc->page = page;
 	desc->ptr = kmap(page);		/* matching kunmap in nfs_do_filldir */
 	if (desc->error >= 0) {
@@ -608,7 +608,7 @@ static inline int nfs_check_verifier(struct inode *dir, struct dentry *dentry)
 {
 	if (IS_ROOT(dentry))
 		return 1;
-	if ((NFS_FLAGS(dir) & NFS_INO_INVALID_ATTR) != 0
+	if ((NFS_I(dir)->cache_validity & NFS_INO_INVALID_ATTR) != 0
 			|| nfs_attribute_timeout(dir))
 		return 0;
 	return nfs_verify_change_attribute(dir, (unsigned long)dentry->d_fsdata);
@@ -1575,11 +1575,12 @@ out:
 
 int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
 {
-	struct nfs_access_entry *cache = &NFS_I(inode)->cache_access;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_access_entry *cache = &nfsi->cache_access;
 
 	if (cache->cred != cred
 			|| time_after(jiffies, cache->jiffies + NFS_ATTRTIMEO(inode))
-			|| (NFS_FLAGS(inode) & NFS_INO_INVALID_ACCESS))
+			|| (nfsi->cache_validity & NFS_INO_INVALID_ACCESS))
 		return -ENOENT;
 	memcpy(res, cache, sizeof(*res));
 	return 0;
@@ -1587,14 +1588,15 @@ int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs
 
 void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
 {
-	struct nfs_access_entry *cache = &NFS_I(inode)->cache_access;
+	struct nfs_inode *nfsi = NFS_I(inode);
+	struct nfs_access_entry *cache = &nfsi->cache_access;
 
 	if (cache->cred != set->cred) {
 		if (cache->cred)
 			put_rpccred(cache->cred);
 		cache->cred = get_rpccred(set->cred);
 	}
-	NFS_FLAGS(inode) &= ~NFS_INO_INVALID_ACCESS;
+	nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
 	cache->jiffies = set->jiffies;
 	cache->mask = set->mask;
 }
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 5621ba9885f4..f6b9eda925c5 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -134,9 +134,10 @@ nfs_file_release(struct inode *inode, struct file *filp)
  */
 static int nfs_revalidate_file(struct inode *inode, struct file *filp)
 {
+	struct nfs_inode *nfsi = NFS_I(inode);
 	int retval = 0;
 
-	if ((NFS_FLAGS(inode) & NFS_INO_REVAL_PAGECACHE) || nfs_attribute_timeout(inode))
+	if ((nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) || nfs_attribute_timeout(inode))
 		retval = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
 	nfs_revalidate_mapping(inode, filp->f_mapping);
 	return 0;
@@ -164,7 +165,7 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp)
 		goto force_reval;
 	if (nfsi->npages != 0)
 		return 0;
-	if (!(NFS_FLAGS(inode) & NFS_INO_REVAL_PAGECACHE) && !nfs_attribute_timeout(inode))
+	if (!(nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) && !nfs_attribute_timeout(inode))
 		return 0;
 force_reval:
 	return __nfs_revalidate_inode(server, inode);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index bb7ca022bcb2..622184553516 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -620,9 +620,9 @@ nfs_zap_caches(struct inode *inode)
 
 	memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
 	if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
-		nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
+		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
 	else
-		nfsi->flags |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
+		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
 }
 
 static void nfs_zap_acl_cache(struct inode *inode)
@@ -632,7 +632,7 @@ static void nfs_zap_acl_cache(struct inode *inode)
 	clear_acl_cache = NFS_PROTO(inode)->clear_acl_cache;
 	if (clear_acl_cache != NULL)
 		clear_acl_cache(inode);
-	NFS_I(inode)->flags &= ~NFS_INO_INVALID_ACL;
+	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL;
 }
 
 /*
@@ -841,7 +841,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
 			inode->i_uid = attr->ia_uid;
 		if ((attr->ia_valid & ATTR_GID) != 0)
 			inode->i_gid = attr->ia_gid;
-		NFS_FLAGS(inode) |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
 	}
 	if ((attr->ia_valid & ATTR_SIZE) != 0) {
 		inode->i_size = attr->ia_size;
@@ -872,8 +872,7 @@ nfs_wait_on_inode(struct inode *inode, int flag)
 int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
 	struct inode *inode = dentry->d_inode;
-	struct nfs_inode *nfsi = NFS_I(inode);
-	int need_atime = nfsi->flags & NFS_INO_INVALID_ATIME;
+	int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME;
 	int err;
 
 	if (__IS_FLG(inode, MS_NOATIME))
@@ -1019,7 +1018,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	struct nfs_fattr fattr;
 	struct nfs_inode *nfsi = NFS_I(inode);
 	unsigned long verifier;
-	unsigned int flags;
+	unsigned long cache_validity;
 
 	dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n",
 		inode->i_sb->s_id, (long long)NFS_FILEID(inode));
@@ -1036,7 +1035,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 			goto out_nowait;
 		if (NFS_ATTRTIMEO(inode) == 0)
 			continue;
-		if (NFS_FLAGS(inode) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME))
+		if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME))
 			continue;
 		status = NFS_STALE(inode) ? -ESTALE : 0;
 		goto out_nowait;
@@ -1065,18 +1064,21 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 			 (long long)NFS_FILEID(inode), status);
 		goto out;
 	}
-	flags = nfsi->flags;
-	nfsi->flags &= ~NFS_INO_REVAL_PAGECACHE;
+	cache_validity = nfsi->cache_validity;
+	nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
+
 	/*
 	 * We may need to keep the attributes marked as invalid if
 	 * we raced with nfs_end_attr_update().
 	 */
 	if (verifier == nfsi->cache_change_attribute)
-		nfsi->flags &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME);
-	/* Do the page cache invalidation */
+		nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME);
+
 	nfs_revalidate_mapping(inode, inode->i_mapping);
-	if (flags & NFS_INO_INVALID_ACL)
+
+	if (cache_validity & NFS_INO_INVALID_ACL)
 		nfs_zap_acl_cache(inode);
+
 	dfprintk(PAGECACHE, "NFS: (%s/%Ld) revalidation complete\n",
 		inode->i_sb->s_id,
 		(long long)NFS_FILEID(inode));
@@ -1107,7 +1109,7 @@ int nfs_attribute_timeout(struct inode *inode)
  */
 int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 {
-	if (!(NFS_FLAGS(inode) & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
+	if (!(NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA))
 			&& !nfs_attribute_timeout(inode))
 		return NFS_STALE(inode) ? -ESTALE : 0;
 	return __nfs_revalidate_inode(server, inode);
@@ -1122,14 +1124,14 @@ void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 {
 	struct nfs_inode *nfsi = NFS_I(inode);
 
-	if (nfsi->flags & NFS_INO_INVALID_DATA) {
+	if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
 		if (S_ISREG(inode->i_mode)) {
 			if (filemap_fdatawrite(mapping) == 0)
 				filemap_fdatawait(mapping);
 			nfs_wb_all(inode);
 		}
 		invalidate_inode_pages2(mapping);
-		nfsi->flags &= ~NFS_INO_INVALID_DATA;
+		nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
 		if (S_ISDIR(inode->i_mode)) {
 			memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
 			/* This ensures we revalidate child dentries */
@@ -1164,10 +1166,10 @@ void nfs_end_data_update(struct inode *inode)
 
 	if (!nfs_have_delegation(inode, FMODE_READ)) {
 		/* Mark the attribute cache for revalidation */
-		nfsi->flags |= NFS_INO_INVALID_ATTR;
+		nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		/* Directories and symlinks: invalidate page cache too */
 		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-			nfsi->flags |= NFS_INO_INVALID_DATA;
+			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
 	}
 	nfsi->cache_change_attribute ++;
 	atomic_dec(&nfsi->data_updates);
@@ -1200,9 +1202,9 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 				&& nfsi->change_attr == fattr->pre_change_attr)
 			nfsi->change_attr = fattr->change_attr;
 		if (nfsi->change_attr != fattr->change_attr) {
-			nfsi->flags |= NFS_INO_INVALID_ATTR;
+			nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 			if (!data_unstable)
-				nfsi->flags |= NFS_INO_REVAL_PAGECACHE;
+				nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
 		}
 	}
 
@@ -1227,28 +1229,28 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 
 	/* Verify a few of the more important attributes */
 	if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
-		nfsi->flags |= NFS_INO_INVALID_ATTR;
+		nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		if (!data_unstable)
-			nfsi->flags |= NFS_INO_REVAL_PAGECACHE;
+			nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
 	}
 	if (cur_size != new_isize) {
-		nfsi->flags |= NFS_INO_INVALID_ATTR;
+		nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		if (nfsi->npages == 0)
-			nfsi->flags |= NFS_INO_REVAL_PAGECACHE;
+			nfsi->cache_validity |= NFS_INO_REVAL_PAGECACHE;
 	}
 
 	/* Have any file permissions changed? */
 	if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)
 			|| inode->i_uid != fattr->uid
 			|| inode->i_gid != fattr->gid)
-		nfsi->flags |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+		nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
 
 	/* Has the link count changed? */
 	if (inode->i_nlink != fattr->nlink)
-		nfsi->flags |= NFS_INO_INVALID_ATTR;
+		nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 
 	if (!timespec_equal(&inode->i_atime, &fattr->atime))
-		nfsi->flags |= NFS_INO_INVALID_ATIME;
+		nfsi->cache_validity |= NFS_INO_INVALID_ATIME;
 
 	nfsi->read_cache_jiffies = fattr->timestamp;
 	return 0;
@@ -1384,7 +1386,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
 				|| S_ISLNK(inode->i_mode)))
 		invalid &= ~NFS_INO_INVALID_DATA;
 	if (!nfs_have_delegation(inode, FMODE_READ))
-		nfsi->flags |= invalid;
+		nfsi->cache_validity |= invalid;
 
 	return 0;
  out_changed:
@@ -1961,7 +1963,8 @@ static struct inode *nfs_alloc_inode(struct super_block *sb)
 	nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, SLAB_KERNEL);
 	if (!nfsi)
 		return NULL;
-	nfsi->flags = 0;
+	nfsi->flags = 0UL;
+	nfsi->cache_validity = 0UL;
 #ifdef CONFIG_NFS_V3_ACL
 	nfsi->acl_access = ERR_PTR(-EAGAIN);
 	nfsi->acl_default = ERR_PTR(-EAGAIN);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 1b7a3ef2f813..a020e650ffc2 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -308,7 +308,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 	nfs_begin_data_update(inode);
 	status = rpc_call(server->client_acl, ACLPROC3_SETACL,
 			  &args, &fattr, 0);
-	NFS_FLAGS(inode) |= NFS_INO_INVALID_ACCESS;
+	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS;
 	nfs_end_data_update(inode);
 	dprintk("NFS reply setacl: %d\n", status);
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 6f866b8aa2d5..90df0500ca1b 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -140,7 +140,7 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
 		if (rdata->res.eof != 0 || result == 0)
 			break;
 	} while (count);
-	NFS_FLAGS(inode) |= NFS_INO_INVALID_ATIME;
+	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
 
 	if (count)
 		memclear_highpage_flush(page, rdata->args.pgbase, count);
@@ -473,7 +473,7 @@ void nfs_readpage_result(struct rpc_task *task)
 		}
 		task->tk_status = -EIO;
 	}
-	NFS_FLAGS(data->inode) |= NFS_INO_INVALID_ATIME;
+	NFS_I(data->inode)->cache_validity |= NFS_INO_INVALID_ATIME;
 	data->complete(data, status);
 }
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 7d78a783c64a..229a1755842a 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -113,6 +113,7 @@ struct nfs_inode {
 	 * Various flags
 	 */
 	unsigned int		flags;
+	unsigned long		cache_validity;
 
 	/*
 	 * read_cache_jiffies is when we started read-caching this inode,
@@ -188,17 +189,21 @@ struct nfs_inode {
 };
 
 /*
- * Legal inode flag values
+ * Cache validity bit flags
  */
-#define NFS_INO_STALE		0x0001		/* possible stale inode */
-#define NFS_INO_ADVISE_RDPLUS   0x0002          /* advise readdirplus */
-#define NFS_INO_REVALIDATING	0x0004		/* revalidating attrs */
-#define NFS_INO_INVALID_ATTR	0x0008		/* cached attrs are invalid */
-#define NFS_INO_INVALID_DATA	0x0010		/* cached data is invalid */
-#define NFS_INO_INVALID_ATIME	0x0020		/* cached atime is invalid */
-#define NFS_INO_INVALID_ACCESS	0x0040		/* cached access cred invalid */
-#define NFS_INO_INVALID_ACL	0x0080		/* cached acls are invalid */
-#define NFS_INO_REVAL_PAGECACHE	0x1000		/* must revalidate pagecache */
+#define NFS_INO_INVALID_ATTR	0x0001		/* cached attrs are invalid */
+#define NFS_INO_INVALID_DATA	0x0002		/* cached data is invalid */
+#define NFS_INO_INVALID_ATIME	0x0004		/* cached atime is invalid */
+#define NFS_INO_INVALID_ACCESS	0x0008		/* cached access cred invalid */
+#define NFS_INO_INVALID_ACL	0x0010		/* cached acls are invalid */
+#define NFS_INO_REVAL_PAGECACHE	0x0020		/* must revalidate pagecache */
+
+/*
+ * Legal values of flags field
+ */
+#define NFS_INO_REVALIDATING	0x0001		/* revalidating attrs */
+#define NFS_INO_ADVISE_RDPLUS	0x0002		/* advise readdirplus */
+#define NFS_INO_STALE		0x0004		/* possible stale inode */
 
 static inline struct nfs_inode *NFS_I(struct inode *inode)
 {
@@ -237,7 +242,7 @@ static inline int nfs_caches_unstable(struct inode *inode)
 static inline void NFS_CACHEINV(struct inode *inode)
 {
 	if (!nfs_caches_unstable(inode))
-		NFS_FLAGS(inode) |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS;
+		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS;
 }
 
 static inline int nfs_server_capable(struct inode *inode, int cap)
-- 
cgit v1.2.3-59-g8ed1b


From 412d582ec1dd59aab2353f8cb7e74f2c79cd20b9 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@citi.umich.edu>
Date: Thu, 18 Aug 2005 11:24:11 -0700
Subject: [PATCH] NFS: use atomic bitops to manipulate flags in nfsi->flags

Introduce atomic bitops to manipulate the bits in the nfs_inode structure's
"flags" field.

Using bitops means we can use a generic wait_on_bit call instead of an ad hoc
locking scheme in fs/nfs/inode.c, so we can remove the "nfs_i_wait" field from
nfs_inode at the same time.

The other new flags field will continue to use bitmasks and logical AND and OR.
This permits several flags to be set at the same time efficiently.  The
following patch adds a spin lock to protect these flags, and this spin lock
will later cover other fields in the nfs_inode structure, amortizing the cost
of using this type of serialization.

Test plan:
 Millions of fsx ops on SMP clients.

Signed-off-by: Chuck Lever <cel@netapp.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfs/dir.c           |  4 +--
 fs/nfs/inode.c         | 69 +++++++++++++++++++++++++++++++-------------------
 include/linux/nfs_fs.h | 19 ++++++--------
 3 files changed, 53 insertions(+), 39 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 5732e13cd0da..27cf5577f239 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -182,7 +182,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 		/* We requested READDIRPLUS, but the server doesn't grok it */
 		if (error == -ENOTSUPP && desc->plus) {
 			NFS_SERVER(inode)->caps &= ~NFS_CAP_READDIRPLUS;
-			NFS_FLAGS(inode) &= ~NFS_INO_ADVISE_RDPLUS;
+			clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
 			desc->plus = 0;
 			goto again;
 		}
@@ -545,7 +545,7 @@ static int nfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			break;
 		}
 		if (res == -ETOOSMALL && desc->plus) {
-			NFS_FLAGS(inode) &= ~NFS_INO_ADVISE_RDPLUS;
+			clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
 			nfs_zap_caches(inode);
 			desc->plus = 0;
 			desc->entry->eof = 0;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 622184553516..ee27578277f3 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -739,7 +739,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
 			inode->i_fop = &nfs_dir_operations;
 			if (nfs_server_capable(inode, NFS_CAP_READDIRPLUS)
 			    && fattr->size <= NFS_LIMIT_READDIRPLUS)
-				NFS_FLAGS(inode) |= NFS_INO_ADVISE_RDPLUS;
+				set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
 		} else if (S_ISLNK(inode->i_mode))
 			inode->i_op = &nfs_symlink_inode_operations;
 		else
@@ -849,26 +849,43 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
 	}
 }
 
+static int nfs_wait_schedule(void *word)
+{
+	if (signal_pending(current))
+		return -ERESTARTSYS;
+	schedule();
+	return 0;
+}
+
 /*
  * Wait for the inode to get unlocked.
- * (Used for NFS_INO_LOCKED and NFS_INO_REVALIDATING).
  */
-static int
-nfs_wait_on_inode(struct inode *inode, int flag)
+static int nfs_wait_on_inode(struct inode *inode)
 {
 	struct rpc_clnt	*clnt = NFS_CLIENT(inode);
 	struct nfs_inode *nfsi = NFS_I(inode);
-
+	sigset_t oldmask;
 	int error;
-	if (!(NFS_FLAGS(inode) & flag))
-		return 0;
+
 	atomic_inc(&inode->i_count);
-	error = nfs_wait_event(clnt, nfsi->nfs_i_wait,
-				!(NFS_FLAGS(inode) & flag));
+	rpc_clnt_sigmask(clnt, &oldmask);
+	error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING,
+					nfs_wait_schedule, TASK_INTERRUPTIBLE);
+	rpc_clnt_sigunmask(clnt, &oldmask);
 	iput(inode);
+
 	return error;
 }
 
+static void nfs_wake_up_inode(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+
+	clear_bit(NFS_INO_REVALIDATING, &nfsi->flags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&nfsi->flags, NFS_INO_REVALIDATING);
+}
+
 int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 {
 	struct inode *inode = dentry->d_inode;
@@ -1029,18 +1046,19 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	if (NFS_STALE(inode))
  		goto out_nowait;
 
-	while (NFS_REVALIDATING(inode)) {
-		status = nfs_wait_on_inode(inode, NFS_INO_REVALIDATING);
-		if (status < 0)
-			goto out_nowait;
-		if (NFS_ATTRTIMEO(inode) == 0)
-			continue;
-		if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME))
-			continue;
-		status = NFS_STALE(inode) ? -ESTALE : 0;
-		goto out_nowait;
+	status = nfs_wait_on_inode(inode);
+	if (status < 0)
+		goto out;
+	if (NFS_STALE(inode)) {
+		status = -ESTALE;
+		/* Do we trust the cached ESTALE? */
+		if (NFS_ATTRTIMEO(inode) != 0) {
+			if (nfsi->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ATIME)) {
+				/* no */
+			} else
+				goto out;
+		}
 	}
-	NFS_FLAGS(inode) |= NFS_INO_REVALIDATING;
 
 	/* Protect against RPC races by saving the change attribute */
 	verifier = nfs_save_change_attribute(inode);
@@ -1052,7 +1070,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 		if (status == -ESTALE) {
 			nfs_zap_caches(inode);
 			if (!S_ISDIR(inode->i_mode))
-				NFS_FLAGS(inode) |= NFS_INO_STALE;
+				set_bit(NFS_INO_STALE, &NFS_FLAGS(inode));
 		}
 		goto out;
 	}
@@ -1083,9 +1101,9 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 		inode->i_sb->s_id,
 		(long long)NFS_FILEID(inode));
 
-out:
-	NFS_FLAGS(inode) &= ~NFS_INO_REVALIDATING;
-	wake_up(&nfsi->nfs_i_wait);
+ out:
+	nfs_wake_up_inode(inode);
+
  out_nowait:
 	unlock_kernel();
 	return status;
@@ -1404,7 +1422,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
 	 */
 	nfs_invalidate_inode(inode);
  out_err:
-	NFS_FLAGS(inode) |= NFS_INO_STALE;
+	set_bit(NFS_INO_STALE, &NFS_FLAGS(inode));
 	return -ESTALE;
 }
 
@@ -1996,7 +2014,6 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
 		nfsi->ndirty = 0;
 		nfsi->ncommit = 0;
 		nfsi->npages = 0;
-		init_waitqueue_head(&nfsi->nfs_i_wait);
 		nfs4_init_once(nfsi);
 	}
 }
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 229a1755842a..deef9567788a 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -112,8 +112,8 @@ struct nfs_inode {
 	/*
 	 * Various flags
 	 */
-	unsigned int		flags;
-	unsigned long		cache_validity;
+	unsigned long		flags;			/* atomic bit ops */
+	unsigned long		cache_validity;		/* bit mask */
 
 	/*
 	 * read_cache_jiffies is when we started read-caching this inode,
@@ -175,8 +175,6 @@ struct nfs_inode {
 	/* Open contexts for shared mmap writes */
 	struct list_head	open_files;
 
-	wait_queue_head_t	nfs_i_wait;
-
 #ifdef CONFIG_NFS_V4
 	struct nfs4_cached_acl	*nfs4_acl;
         /* NFSv4 state */
@@ -199,11 +197,11 @@ struct nfs_inode {
 #define NFS_INO_REVAL_PAGECACHE	0x0020		/* must revalidate pagecache */
 
 /*
- * Legal values of flags field
+ * Bit offsets in flags field
  */
-#define NFS_INO_REVALIDATING	0x0001		/* revalidating attrs */
-#define NFS_INO_ADVISE_RDPLUS	0x0002		/* advise readdirplus */
-#define NFS_INO_STALE		0x0004		/* possible stale inode */
+#define NFS_INO_REVALIDATING	(0)		/* revalidating attrs */
+#define NFS_INO_ADVISE_RDPLUS	(1)		/* advise readdirplus */
+#define NFS_INO_STALE		(2)		/* possible stale inode */
 
 static inline struct nfs_inode *NFS_I(struct inode *inode)
 {
@@ -229,8 +227,7 @@ static inline struct nfs_inode *NFS_I(struct inode *inode)
 #define NFS_ATTRTIMEO_UPDATE(inode)	(NFS_I(inode)->attrtimeo_timestamp)
 
 #define NFS_FLAGS(inode)		(NFS_I(inode)->flags)
-#define NFS_REVALIDATING(inode)		(NFS_FLAGS(inode) & NFS_INO_REVALIDATING)
-#define NFS_STALE(inode)		(NFS_FLAGS(inode) & NFS_INO_STALE)
+#define NFS_STALE(inode)		(test_bit(NFS_INO_STALE, &NFS_FLAGS(inode)))
 
 #define NFS_FILEID(inode)		(NFS_I(inode)->fileid)
 
@@ -252,7 +249,7 @@ static inline int nfs_server_capable(struct inode *inode, int cap)
 
 static inline int NFS_USE_READDIRPLUS(struct inode *inode)
 {
-	return NFS_FLAGS(inode) & NFS_INO_ADVISE_RDPLUS;
+	return test_bit(NFS_INO_ADVISE_RDPLUS, &NFS_FLAGS(inode));
 }
 
 /**
-- 
cgit v1.2.3-59-g8ed1b


From dc59250c6ebed099a9bc0a11298e2281dd896657 Mon Sep 17 00:00:00 2001
From: Chuck Lever <cel@citi.umich.edu>
Date: Thu, 18 Aug 2005 11:24:12 -0700
Subject: [PATCH] NFS: Introduce the use of inode->i_lock to protect fields in
 nfsi

Down the road we want to eliminate the use of the global kernel lock entirely
from the NFS client.  To do this, we need to protect the fields in the
nfs_inode structure adequately.  Start by serializing updates to the
"cache_validity" field.

Note this change addresses an SMP hang found by njw@osdl.org, where processes
deadlock because nfs_end_data_update and nfs_revalidate_mapping update the
"cache_validity" field without proper serialization.

Test plan:
 Millions of fsx ops on SMP clients.  Run Nick Wilson's breaknfs program on
 large SMP clients.

Signed-off-by: Chuck Lever <cel@netapp.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/nfs/dir.c           |  7 +++++++
 fs/nfs/inode.c         | 34 +++++++++++++++++++++++++++++++---
 fs/nfs/nfs3acl.c       |  2 ++
 fs/nfs/read.c          |  4 ++++
 include/linux/nfs_fs.h |  5 ++++-
 5 files changed, 48 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 27cf5577f239..147cbf9261ce 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -189,7 +189,9 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 		goto error;
 	}
 	SetPageUptodate(page);
+	spin_lock(&inode->i_lock);
 	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
+	spin_unlock(&inode->i_lock);
 	/* Ensure consistent page alignment of the data.
 	 * Note: assumes we have exclusive access to this mapping either
 	 *	 through inode->i_sem or some other mechanism.
@@ -462,7 +464,9 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent,
 						page,
 						NFS_SERVER(inode)->dtsize,
 						desc->plus);
+	spin_lock(&inode->i_lock);
 	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
+	spin_unlock(&inode->i_lock);
 	desc->page = page;
 	desc->ptr = kmap(page);		/* matching kunmap in nfs_do_filldir */
 	if (desc->error >= 0) {
@@ -1596,7 +1600,10 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
 			put_rpccred(cache->cred);
 		cache->cred = get_rpccred(set->cred);
 	}
+	/* FIXME: replace current access_cache BKL reliance with inode->i_lock */
+	spin_lock(&inode->i_lock);
 	nfsi->cache_validity &= ~NFS_INO_INVALID_ACCESS;
+	spin_unlock(&inode->i_lock);
 	cache->jiffies = set->jiffies;
 	cache->mask = set->mask;
 }
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ee27578277f3..541b418327c8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -615,6 +615,8 @@ nfs_zap_caches(struct inode *inode)
 	struct nfs_inode *nfsi = NFS_I(inode);
 	int mode = inode->i_mode;
 
+	spin_lock(&inode->i_lock);
+
 	NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
 	NFS_ATTRTIMEO_UPDATE(inode) = jiffies;
 
@@ -623,6 +625,8 @@ nfs_zap_caches(struct inode *inode)
 		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
 	else
 		nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL|NFS_INO_REVAL_PAGECACHE;
+
+	spin_unlock(&inode->i_lock);
 }
 
 static void nfs_zap_acl_cache(struct inode *inode)
@@ -632,7 +636,9 @@ static void nfs_zap_acl_cache(struct inode *inode)
 	clear_acl_cache = NFS_PROTO(inode)->clear_acl_cache;
 	if (clear_acl_cache != NULL)
 		clear_acl_cache(inode);
+	spin_lock(&inode->i_lock);
 	NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL;
+	spin_unlock(&inode->i_lock);
 }
 
 /*
@@ -841,7 +847,9 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr)
 			inode->i_uid = attr->ia_uid;
 		if ((attr->ia_valid & ATTR_GID) != 0)
 			inode->i_gid = attr->ia_gid;
+		spin_lock(&inode->i_lock);
 		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+		spin_unlock(&inode->i_lock);
 	}
 	if ((attr->ia_valid & ATTR_SIZE) != 0) {
 		inode->i_size = attr->ia_size;
@@ -1082,6 +1090,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 			 (long long)NFS_FILEID(inode), status);
 		goto out;
 	}
+	spin_lock(&inode->i_lock);
 	cache_validity = nfsi->cache_validity;
 	nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE;
 
@@ -1091,6 +1100,7 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
 	 */
 	if (verifier == nfsi->cache_change_attribute)
 		nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME);
+	spin_unlock(&inode->i_lock);
 
 	nfs_revalidate_mapping(inode, inode->i_mapping);
 
@@ -1149,12 +1159,16 @@ void nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 			nfs_wb_all(inode);
 		}
 		invalidate_inode_pages2(mapping);
+
+		spin_lock(&inode->i_lock);
 		nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
 		if (S_ISDIR(inode->i_mode)) {
 			memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
 			/* This ensures we revalidate child dentries */
 			nfsi->cache_change_attribute++;
 		}
+		spin_unlock(&inode->i_lock);
+
 		dfprintk(PAGECACHE, "NFS: (%s/%Ld) data cache invalidated\n",
 				inode->i_sb->s_id,
 				(long long)NFS_FILEID(inode));
@@ -1184,10 +1198,12 @@ void nfs_end_data_update(struct inode *inode)
 
 	if (!nfs_have_delegation(inode, FMODE_READ)) {
 		/* Mark the attribute cache for revalidation */
+		spin_lock(&inode->i_lock);
 		nfsi->cache_validity |= NFS_INO_INVALID_ATTR;
 		/* Directories and symlinks: invalidate page cache too */
 		if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
 			nfsi->cache_validity |= NFS_INO_INVALID_DATA;
+		spin_unlock(&inode->i_lock);
 	}
 	nfsi->cache_change_attribute ++;
 	atomic_dec(&nfsi->data_updates);
@@ -1212,6 +1228,8 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 	if (nfs_have_delegation(inode, FMODE_READ))
 		return 0;
 
+	spin_lock(&inode->i_lock);
+
 	/* Are we in the process of updating data on the server? */
 	data_unstable = nfs_caches_unstable(inode);
 
@@ -1226,13 +1244,17 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 		}
 	}
 
-	if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+	if ((fattr->valid & NFS_ATTR_FATTR) == 0) {
+		spin_unlock(&inode->i_lock);
 		return 0;
+	}
 
 	/* Has the inode gone and changed behind our back? */
 	if (nfsi->fileid != fattr->fileid
-			|| (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
+			|| (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
+		spin_unlock(&inode->i_lock);
 		return -EIO;
+	}
 
 	cur_size = i_size_read(inode);
  	new_isize = nfs_size_to_loff_t(fattr->size);
@@ -1271,6 +1293,7 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 		nfsi->cache_validity |= NFS_INO_INVALID_ATIME;
 
 	nfsi->read_cache_jiffies = fattr->timestamp;
+	spin_unlock(&inode->i_lock);
 	return 0;
 }
 
@@ -1309,11 +1332,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
 		goto out_err;
 	}
 
+	spin_lock(&inode->i_lock);
+
 	/*
 	 * Make sure the inode's type hasn't changed.
 	 */
-	if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
+	if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
+		spin_unlock(&inode->i_lock);
 		goto out_changed;
+	}
 
 	/*
 	 * Update the read time so we don't revalidate too often.
@@ -1406,6 +1433,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr, unsign
 	if (!nfs_have_delegation(inode, FMODE_READ))
 		nfsi->cache_validity |= invalid;
 
+	spin_unlock(&inode->i_lock);
 	return 0;
  out_changed:
 	/*
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index a020e650ffc2..6a5bbc0ae941 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -308,7 +308,9 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
 	nfs_begin_data_update(inode);
 	status = rpc_call(server->client_acl, ACLPROC3_SETACL,
 			  &args, &fattr, 0);
+	spin_lock(&inode->i_lock);
 	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS;
+	spin_unlock(&inode->i_lock);
 	nfs_end_data_update(inode);
 	dprintk("NFS reply setacl: %d\n", status);
 
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 90df0500ca1b..6ceb1d471f20 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -140,7 +140,9 @@ static int nfs_readpage_sync(struct nfs_open_context *ctx, struct inode *inode,
 		if (rdata->res.eof != 0 || result == 0)
 			break;
 	} while (count);
+	spin_lock(&inode->i_lock);
 	NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATIME;
+	spin_unlock(&inode->i_lock);
 
 	if (count)
 		memclear_highpage_flush(page, rdata->args.pgbase, count);
@@ -473,7 +475,9 @@ void nfs_readpage_result(struct rpc_task *task)
 		}
 		task->tk_status = -EIO;
 	}
+	spin_lock(&data->inode->i_lock);
 	NFS_I(data->inode)->cache_validity |= NFS_INO_INVALID_ATIME;
+	spin_unlock(&data->inode->i_lock);
 	data->complete(data, status);
 }
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index deef9567788a..9a6047ff1b25 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -238,8 +238,11 @@ static inline int nfs_caches_unstable(struct inode *inode)
 
 static inline void NFS_CACHEINV(struct inode *inode)
 {
-	if (!nfs_caches_unstable(inode))
+	if (!nfs_caches_unstable(inode)) {
+		spin_lock(&inode->i_lock);
 		NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS;
+		spin_unlock(&inode->i_lock);
+	}
 }
 
 static inline int nfs_server_capable(struct inode *inode, int cap)
-- 
cgit v1.2.3-59-g8ed1b


From b07e5eccaf512ae3209beae5cd2e3a27c92c300b Mon Sep 17 00:00:00 2001
From: Grant Coady <gcoady@gmail.com>
Date: Thu, 18 Aug 2005 22:19:55 +0200
Subject: [PATCH] ide: fix PCI_DEVIEC_ID_APPLE_UNI_N_ATA spelling

Signed-off-by: Grant Coady <gcoady@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@elka.pw.edu.pl>
---
 drivers/ide/ppc/pmac.c  | 2 +-
 include/linux/pci_ids.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/ide/ppc/pmac.c b/drivers/ide/ppc/pmac.c
index be0fcc8f4b15..ea65b070a367 100644
--- a/drivers/ide/ppc/pmac.c
+++ b/drivers/ide/ppc/pmac.c
@@ -1664,7 +1664,7 @@ static struct macio_driver pmac_ide_macio_driver =
 };
 
 static struct pci_device_id pmac_ide_pci_match[] = {
-	{ PCI_VENDOR_ID_APPLE, PCI_DEVIEC_ID_APPLE_UNI_N_ATA, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
+	{ PCI_VENDOR_ID_APPLE, PCI_DEVICE_ID_APPLE_UNI_N_ATA, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
 	{ PCI_VENDOR_ID_APPLE, PCI_DEVICE_ID_APPLE_IPID_ATA100, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
 	{ PCI_VENDOR_ID_APPLE, PCI_DEVICE_ID_APPLE_K2_ATA100, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0},
 	{ PCI_VENDOR_ID_APPLE, PCI_DEVICE_ID_APPLE_SH_ATA,
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 51e61e96051c..953b4dc819ae 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -881,7 +881,7 @@
 #define PCI_DEVICE_ID_APPLE_UNI_N_PCI15	0x002e
 #define PCI_DEVICE_ID_APPLE_UNI_N_FW2	0x0030
 #define PCI_DEVICE_ID_APPLE_UNI_N_GMAC2	0x0032
-#define PCI_DEVIEC_ID_APPLE_UNI_N_ATA	0x0033
+#define PCI_DEVICE_ID_APPLE_UNI_N_ATA	0x0033
 #define PCI_DEVICE_ID_APPLE_UNI_N_AGP2	0x0034
 #define PCI_DEVICE_ID_APPLE_IPID_ATA100	0x003b
 #define PCI_DEVICE_ID_APPLE_KEYLARGO_I	0x003e
-- 
cgit v1.2.3-59-g8ed1b


From 2f09a7f4af131bf23c013ead89373deba1c7593c Mon Sep 17 00:00:00 2001
From: Matt Gillette <matt.gillette@netcell.com>
Date: Thu, 18 Aug 2005 22:27:07 +0200
Subject: [PATCH] ide: add support for Netcell Revolution to pci-ide generic
 driver

Adds support for Netcell Revolution to pci-ide generic driver by including
it in the list of devices matched.  Includes the Revolution in the list of
simplex devices forced into DMA mode.

Signed-off-by: Matt Gillette <matt.gillette@netcell.com>
Cc: Bartlomiej Zolnierkiewicz <B.Zolnierkiewicz@elka.pw.edu.pl>
Cc: Jeff Garzik <jgarzik@pobox.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@elka.pw.edu.pl>
---
 drivers/ide/pci/generic.c | 7 +++++++
 drivers/ide/setup-pci.c   | 1 +
 include/linux/pci_ids.h   | 3 +++
 3 files changed, 11 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/ide/pci/generic.c b/drivers/ide/pci/generic.c
index da46577380f3..6e3ab0c38c4d 100644
--- a/drivers/ide/pci/generic.c
+++ b/drivers/ide/pci/generic.c
@@ -173,6 +173,12 @@ static ide_pci_device_t generic_chipsets[] __devinitdata = {
 		.channels	= 2,
 		.autodma	= NOAUTODMA,
 		.bootable	= ON_BOARD,
+	},{	/* 14 */
+		.name		= "Revolution",
+		.init_hwif	= init_hwif_generic,
+		.channels	= 2,
+		.autodma	= AUTODMA,
+		.bootable	= OFF_BOARD,
 	}
 };
 
@@ -231,6 +237,7 @@ static struct pci_device_id generic_pci_tbl[] = {
 	{ PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO,     PCI_ANY_ID, PCI_ANY_ID, 0, 0, 11},
 	{ PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_1,   PCI_ANY_ID, PCI_ANY_ID, 0, 0, 12},
 	{ PCI_VENDOR_ID_TOSHIBA,PCI_DEVICE_ID_TOSHIBA_PICCOLO_2,   PCI_ANY_ID, PCI_ANY_ID, 0, 0, 13},
+	{ PCI_VENDOR_ID_NETCELL,PCI_DEVICE_ID_REVOLUTION,          PCI_ANY_ID, PCI_ANY_ID, 0, 0, 14},
 	/* Must come last. If you add entries adjust this table appropriately and the init_one code */
 	{ PCI_ANY_ID,		PCI_ANY_ID,			   PCI_ANY_ID, PCI_ANY_ID, PCI_CLASS_STORAGE_IDE << 8, 0xFFFFFF00UL, 0},
 	{ 0, },
diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c
index 77da827b2898..18ed7765417c 100644
--- a/drivers/ide/setup-pci.c
+++ b/drivers/ide/setup-pci.c
@@ -229,6 +229,7 @@ second_chance_to_dma:
 			case PCI_DEVICE_ID_AMD_VIPER_7409:
 			case PCI_DEVICE_ID_CMD_643:
 			case PCI_DEVICE_ID_SERVERWORKS_CSB5IDE:
+			case PCI_DEVICE_ID_REVOLUTION:
 				simplex_stat = hwif->INB(dma_base + 2);
 				hwif->OUTB((simplex_stat&0x60),(dma_base + 2));
 				simplex_stat = hwif->INB(dma_base + 2);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 953b4dc819ae..163036867ac7 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2184,6 +2184,9 @@
 #define PCI_VENDOR_ID_SIBYTE		0x166d
 #define PCI_DEVICE_ID_BCM1250_HT	0x0002
 
+#define PCI_VENDOR_ID_NETCELL		0x169c
+#define PCI_DEVICE_ID_REVOLUTION	0x0044
+
 #define PCI_VENDOR_ID_LINKSYS		0x1737
 #define PCI_DEVICE_ID_LINKSYS_EG1032	0x1032
 #define PCI_DEVICE_ID_LINKSYS_EG1064	0x1064
-- 
cgit v1.2.3-59-g8ed1b


From 84f57fbc724e3b56dc87c37dddac89f82cf75ef6 Mon Sep 17 00:00:00 2001
From: Narendra Sankar <nsankar@broadcom.com>
Date: Thu, 18 Aug 2005 22:30:35 +0200
Subject: [PATCH] serverworks: add support for new southbridge IDE

BCM5785 (HT1000) is an Opteron Southbridge from Serverworks/Broadcom that
incorporates a single channel ATA100 IDE controller that is functionally
identical to the Serverworks CSB6 IDE controller.  This patch adds support
for the new PCI device ID and also the support for this controller.

Signed-off-by: Narendra Sankar <nsankar@broadcom.com>
Acked-by: Jeff Garzik <jgarzik@pobox.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@elka.pw.edu.pl>
---
 drivers/ide/pci/serverworks.c | 23 +++++++++++++++++++++++
 include/linux/pci_ids.h       |  1 +
 2 files changed, 24 insertions(+)

(limited to 'include/linux')

diff --git a/drivers/ide/pci/serverworks.c b/drivers/ide/pci/serverworks.c
index c6f5fa4b4ca6..ff2e217a8c84 100644
--- a/drivers/ide/pci/serverworks.c
+++ b/drivers/ide/pci/serverworks.c
@@ -21,6 +21,9 @@
  *
  *   CSB6: `Champion South Bridge' IDE Interface (optional: third channel)
  *
+ *   HT1000: AKA BCM5785 - Hypertransport Southbridge for Opteron systems. IDE
+ *   controller same as the CSB6. Single channel ATA100 only.
+ *
  * Documentation:
  *	Available under NDA only. Errata info very hard to get.
  *
@@ -71,6 +74,8 @@ static u8 svwks_ratemask (ide_drive_t *drive)
 	if (!svwks_revision)
 		pci_read_config_byte(dev, PCI_REVISION_ID, &svwks_revision);
 
+	if (dev->device == PCI_DEVICE_ID_SERVERWORKS_HT1000IDE)
+		return 2;
 	if (dev->device == PCI_DEVICE_ID_SERVERWORKS_OSB4IDE) {
 		u32 reg = 0;
 		if (isa_dev)
@@ -109,6 +114,7 @@ static u8 svwks_csb_check (struct pci_dev *dev)
 		case PCI_DEVICE_ID_SERVERWORKS_CSB5IDE:
 		case PCI_DEVICE_ID_SERVERWORKS_CSB6IDE:
 		case PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2:
+		case PCI_DEVICE_ID_SERVERWORKS_HT1000IDE:
 			return 1;
 		default:
 			break;
@@ -438,6 +444,13 @@ static unsigned int __devinit init_chipset_svwks (struct pci_dev *dev, const cha
 			btr |= (svwks_revision >= SVWKS_CSB5_REVISION_NEW) ? 0x3 : 0x2;
 		pci_write_config_byte(dev, 0x5A, btr);
 	}
+	/* Setup HT1000 SouthBridge Controller - Single Channel Only */
+	else if (dev->device == PCI_DEVICE_ID_SERVERWORKS_HT1000IDE) {
+		pci_read_config_byte(dev, 0x5A, &btr);
+		btr &= ~0x40;
+		btr |= 0x3;
+		pci_write_config_byte(dev, 0x5A, btr);
+	}
 
 	return (dev->irq) ? dev->irq : 0;
 }
@@ -629,6 +642,15 @@ static ide_pci_device_t serverworks_chipsets[] __devinitdata = {
 		.channels	= 1,	/* 2 */
 		.autodma	= AUTODMA,
 		.bootable	= ON_BOARD,
+	},{	/* 4 */
+		.name		= "SvrWks HT1000",
+		.init_setup	= init_setup_svwks,
+		.init_chipset	= init_chipset_svwks,
+		.init_hwif	= init_hwif_svwks,
+		.init_dma	= init_dma_svwks,
+		.channels	= 1,	/* 2 */
+		.autodma	= AUTODMA,
+		.bootable	= ON_BOARD,
 	}
 };
 
@@ -653,6 +675,7 @@ static struct pci_device_id svwks_pci_tbl[] = {
 	{ PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_CSB5IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 1},
 	{ PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_CSB6IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 2},
 	{ PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 3},
+	{ PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_HT1000IDE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 4},
 	{ 0, },
 };
 MODULE_DEVICE_TABLE(pci, svwks_pci_tbl);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 163036867ac7..927ed487630d 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1580,6 +1580,7 @@
 #define PCI_DEVICE_ID_SERVERWORKS_OSB4IDE 0x0211
 #define PCI_DEVICE_ID_SERVERWORKS_CSB5IDE 0x0212
 #define PCI_DEVICE_ID_SERVERWORKS_CSB6IDE 0x0213
+#define PCI_DEVICE_ID_SERVERWORKS_HT1000IDE 0x0214
 #define PCI_DEVICE_ID_SERVERWORKS_CSB6IDE2 0x0217
 #define PCI_DEVICE_ID_SERVERWORKS_OSB4USB 0x0220
 #define PCI_DEVICE_ID_SERVERWORKS_CSB5USB PCI_DEVICE_ID_SERVERWORKS_OSB4USB
-- 
cgit v1.2.3-59-g8ed1b


From cc314eef0128a807e50fa03baf2d0abc0647952c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Fri, 19 Aug 2005 18:02:56 -0700
Subject: Fix nasty ncpfs symlink handling bug.

This bug could cause oopses and page state corruption, because ncpfs
used the generic page-cache symlink handling functions.  But those
functions only work if the page cache is guaranteed to be "stable", ie a
page that was installed when the symlink walk was started has to still
be installed in the page cache at the end of the walk.

We could have fixed ncpfs to not use the generic helper routines, but it
is in many ways much cleaner to instead improve on the symlink walking
helper routines so that they don't require that absolute stability.

We do this by allowing "follow_link()" to return an error-pointer as a
cookie, which is fed back to the cleanup "put_link()" routine.  This
also simplifies NFS symlink handling.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/autofs/symlink.c |  5 +++--
 fs/cifs/cifsfs.h    |  4 ++--
 fs/cifs/link.c      |  6 +++---
 fs/ext2/symlink.c   |  4 ++--
 fs/ext3/symlink.c   |  4 ++--
 fs/namei.c          | 40 +++++++++++++++++++++-------------------
 fs/nfs/symlink.c    | 37 ++++++++-----------------------------
 fs/sysfs/symlink.c  |  6 +++---
 include/linux/fs.h  |  8 ++++----
 mm/shmem.c          | 17 ++++++-----------
 10 files changed, 54 insertions(+), 77 deletions(-)

(limited to 'include/linux')

diff --git a/fs/autofs/symlink.c b/fs/autofs/symlink.c
index f028396f1383..52e8772b066e 100644
--- a/fs/autofs/symlink.c
+++ b/fs/autofs/symlink.c
@@ -12,11 +12,12 @@
 
 #include "autofs_i.h"
 
-static int autofs_follow_link(struct dentry *dentry, struct nameidata *nd)
+/* Nothing to release.. */
+static void *autofs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	char *s=((struct autofs_symlink *)dentry->d_inode->u.generic_ip)->data;
 	nd_set_link(nd, s);
-	return 0;
+	return NULL;
 }
 
 struct inode_operations autofs_symlink_inode_operations = {
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 78af5850c558..1fd21f66f243 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -83,8 +83,8 @@ extern int cifs_dir_notify(struct file *, unsigned long arg);
 extern struct dentry_operations cifs_dentry_ops;
 
 /* Functions related to symlinks */
-extern int cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
-extern void cifs_put_link(struct dentry *direntry, struct nameidata *nd);
+extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd);
+extern void cifs_put_link(struct dentry *direntry, struct nameidata *nd, void *);
 extern int cifs_readlink(struct dentry *direntry, char __user *buffer, 
 			 int buflen);
 extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index bde0fabfece0..ab925ef4f863 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -92,7 +92,7 @@ cifs_hl_exit:
 	return rc;
 }
 
-int
+void *
 cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 {
 	struct inode *inode = direntry->d_inode;
@@ -148,7 +148,7 @@ out:
 out_no_free:
 	FreeXid(xid);
 	nd_set_link(nd, target_path);
-	return 0;
+	return NULL;	/* No cookie */
 }
 
 int
@@ -330,7 +330,7 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen)
 	return rc;
 }
 
-void cifs_put_link(struct dentry *direntry, struct nameidata *nd)
+void cifs_put_link(struct dentry *direntry, struct nameidata *nd, void *cookie)
 {
 	char *p = nd_get_link(nd);
 	if (!IS_ERR(p))
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index 9f7bac01d557..1e67d87cfa91 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -21,11 +21,11 @@
 #include "xattr.h"
 #include <linux/namei.h>
 
-static int ext2_follow_link(struct dentry *dentry, struct nameidata *nd)
+static void *ext2_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct ext2_inode_info *ei = EXT2_I(dentry->d_inode);
 	nd_set_link(nd, (char *)ei->i_data);
-	return 0;
+	return NULL;
 }
 
 struct inode_operations ext2_symlink_inode_operations = {
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c
index 8c3e72818fb0..4f79122cde67 100644
--- a/fs/ext3/symlink.c
+++ b/fs/ext3/symlink.c
@@ -23,11 +23,11 @@
 #include <linux/namei.h>
 #include "xattr.h"
 
-static int ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
+static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct ext3_inode_info *ei = EXT3_I(dentry->d_inode);
 	nd_set_link(nd, (char*)ei->i_data);
-	return 0;
+	return NULL;
 }
 
 struct inode_operations ext3_symlink_inode_operations = {
diff --git a/fs/namei.c b/fs/namei.c
index b85f158aef0c..6ec1f0fefc5b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -501,6 +501,7 @@ struct path {
 static inline int __do_follow_link(struct path *path, struct nameidata *nd)
 {
 	int error;
+	void *cookie;
 	struct dentry *dentry = path->dentry;
 
 	touch_atime(path->mnt, dentry);
@@ -508,13 +509,15 @@ static inline int __do_follow_link(struct path *path, struct nameidata *nd)
 
 	if (path->mnt == nd->mnt)
 		mntget(path->mnt);
-	error = dentry->d_inode->i_op->follow_link(dentry, nd);
-	if (!error) {
+	cookie = dentry->d_inode->i_op->follow_link(dentry, nd);
+	error = PTR_ERR(cookie);
+	if (!IS_ERR(cookie)) {
 		char *s = nd_get_link(nd);
+		error = 0;
 		if (s)
 			error = __vfs_follow_link(nd, s);
 		if (dentry->d_inode->i_op->put_link)
-			dentry->d_inode->i_op->put_link(dentry, nd);
+			dentry->d_inode->i_op->put_link(dentry, nd, cookie);
 	}
 	dput(dentry);
 	mntput(path->mnt);
@@ -2344,15 +2347,17 @@ out:
 int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 {
 	struct nameidata nd;
-	int res;
+	void *cookie;
+
 	nd.depth = 0;
-	res = dentry->d_inode->i_op->follow_link(dentry, &nd);
-	if (!res) {
-		res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
+	cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
+	if (!IS_ERR(cookie)) {
+		int res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
 		if (dentry->d_inode->i_op->put_link)
-			dentry->d_inode->i_op->put_link(dentry, &nd);
+			dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
+		cookie = ERR_PTR(res);
 	}
-	return res;
+	return PTR_ERR(cookie);
 }
 
 int vfs_follow_link(struct nameidata *nd, const char *link)
@@ -2395,23 +2400,20 @@ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 	return res;
 }
 
-int page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
+void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd)
 {
-	struct page *page;
+	struct page *page = NULL;
 	nd_set_link(nd, page_getlink(dentry, &page));
-	return 0;
+	return page;
 }
 
-void page_put_link(struct dentry *dentry, struct nameidata *nd)
+void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
 {
-	if (!IS_ERR(nd_get_link(nd))) {
-		struct page *page;
-		page = find_get_page(dentry->d_inode->i_mapping, 0);
-		if (!page)
-			BUG();
+	struct page *page = cookie;
+
+	if (page) {
 		kunmap(page);
 		page_cache_release(page);
-		page_cache_release(page);
 	}
 }
 
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index 35f106599144..18dc95b0b646 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -27,26 +27,14 @@
 
 /* Symlink caching in the page cache is even more simplistic
  * and straight-forward than readdir caching.
- *
- * At the beginning of the page we store pointer to struct page in question,
- * simplifying nfs_put_link() (if inode got invalidated we can't find the page
- * to be freed via pagecache lookup).
- * The NUL-terminated string follows immediately thereafter.
  */
 
-struct nfs_symlink {
-	struct page *page;
-	char body[0];
-};
-
 static int nfs_symlink_filler(struct inode *inode, struct page *page)
 {
-	const unsigned int pgbase = offsetof(struct nfs_symlink, body);
-	const unsigned int pglen = PAGE_SIZE - pgbase;
 	int error;
 
 	lock_kernel();
-	error = NFS_PROTO(inode)->readlink(inode, page, pgbase, pglen);
+	error = NFS_PROTO(inode)->readlink(inode, page, 0, PAGE_SIZE);
 	unlock_kernel();
 	if (error < 0)
 		goto error;
@@ -60,11 +48,10 @@ error:
 	return -EIO;
 }
 
-static int nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
 	struct page *page;
-	struct nfs_symlink *p;
 	void *err = ERR_PTR(nfs_revalidate_inode(NFS_SERVER(inode), inode));
 	if (err)
 		goto read_failed;
@@ -78,28 +65,20 @@ static int nfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 		err = ERR_PTR(-EIO);
 		goto getlink_read_error;
 	}
-	p = kmap(page);
-	p->page = page;
-	nd_set_link(nd, p->body);
-	return 0;
+	nd_set_link(nd, kmap(page));
+	return page;
 
 getlink_read_error:
 	page_cache_release(page);
 read_failed:
 	nd_set_link(nd, err);
-	return 0;
+	return NULL;
 }
 
-static void nfs_put_link(struct dentry *dentry, struct nameidata *nd)
+static void nfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
 {
-	char *s = nd_get_link(nd);
-	if (!IS_ERR(s)) {
-		struct nfs_symlink *p;
-		struct page *page;
-
-		p = container_of(s, struct nfs_symlink, body[0]);
-		page = p->page;
-
+	if (cookie) {
+		struct page *page = cookie;
 		kunmap(page);
 		page_cache_release(page);
 	}
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index fae57c83a722..de402fa915f2 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -151,17 +151,17 @@ static int sysfs_getlink(struct dentry *dentry, char * path)
 
 }
 
-static int sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
+static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	int error = -ENOMEM;
 	unsigned long page = get_zeroed_page(GFP_KERNEL);
 	if (page)
 		error = sysfs_getlink(dentry, (char *) page); 
 	nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
-	return 0;
+	return NULL;
 }
 
-static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd)
+static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
 {
 	char *page = nd_get_link(nd);
 	if (!IS_ERR(page))
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f9adf75fd9b4..67e6732d4fdc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -993,8 +993,8 @@ struct inode_operations {
 	int (*rename) (struct inode *, struct dentry *,
 			struct inode *, struct dentry *);
 	int (*readlink) (struct dentry *, char __user *,int);
-	int (*follow_link) (struct dentry *, struct nameidata *);
-	void (*put_link) (struct dentry *, struct nameidata *);
+	void * (*follow_link) (struct dentry *, struct nameidata *);
+	void (*put_link) (struct dentry *, struct nameidata *, void *);
 	void (*truncate) (struct inode *);
 	int (*permission) (struct inode *, int, struct nameidata *);
 	int (*setattr) (struct dentry *, struct iattr *);
@@ -1602,8 +1602,8 @@ extern struct file_operations generic_ro_fops;
 extern int vfs_readlink(struct dentry *, char __user *, int, const char *);
 extern int vfs_follow_link(struct nameidata *, const char *);
 extern int page_readlink(struct dentry *, char __user *, int);
-extern int page_follow_link_light(struct dentry *, struct nameidata *);
-extern void page_put_link(struct dentry *, struct nameidata *);
+extern void *page_follow_link_light(struct dentry *, struct nameidata *);
+extern void page_put_link(struct dentry *, struct nameidata *, void *);
 extern int page_symlink(struct inode *inode, const char *symname, int len);
 extern struct inode_operations page_symlink_inode_operations;
 extern int generic_readlink(struct dentry *, char __user *, int);
diff --git a/mm/shmem.c b/mm/shmem.c
index e64fa726a790..5a81b1ee4f7a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1773,32 +1773,27 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 	return 0;
 }
 
-static int shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
+static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
 {
 	nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode));
-	return 0;
+	return NULL;
 }
 
-static int shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
+static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct page *page = NULL;
 	int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
 	nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
-	return 0;
+	return page;
 }
 
-static void shmem_put_link(struct dentry *dentry, struct nameidata *nd)
+static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
 {
 	if (!IS_ERR(nd_get_link(nd))) {
-		struct page *page;
-
-		page = find_get_page(dentry->d_inode->i_mapping, 0);
-		if (!page)
-			BUG();
+		struct page *page = cookie;
 		kunmap(page);
 		mark_page_accessed(page);
 		page_cache_release(page);
-		page_cache_release(page);
 	}
 }
 
-- 
cgit v1.2.3-59-g8ed1b